# Import

In [None]:
import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
import os

# Tokenization

* Tokenization of raw text is a standard pre-processing step for many NLP tasks.

* For English, tokenization usually involves punctuation splitting and separation of some affixes like possessives.

* Other languages require more extensive token pre-processing, which is usually called segmentation.

* The Stanford Word Segmenter currently supports Arabic and Chinese. (The Stanford Tokenizer can be used for English, French, and Spanish.) The provided segmentation schemes have been found to work well for a variety of applications.

## Requirements

* The system requires Java 1.8+ to be installed.
* stanford-segmenter can be found here https://nlp.stanford.edu/software/segmenter.html
* NLTK

# Arabic Stanford Word Segmenter paper
* http://nlp.stanford.edu/pubs/monroe-green-manning-acls2014.pdf

# Set paths to files

In [None]:
# Locations of the Java executable and the Stanford Segmenter files
# (Windows-style absolute paths; adjust to the local installation).
java_path = "C:\\Program Files\\Java\\jdk1.8.0_131\\bin\\java.exe"
slf4j_path = 'C:\\stanford-segmenter\\slf4j-api.jar'
stanford_models_paths = 'C:\\stanford-segmenter\\data'
# ';'-separated list holding the SLF4J API jar and the segmenter jar.
classpath = 'C:\\stanford-segmenter\\slf4j-api.jar;C:\\stanford-segmenter\\stanford-segmenter.jar'

# Hand the Java executable path to NLTK's Java configuration helper.
nltk.internals.config_java(java_path)

# Export the same locations through environment variables, in one step.
os.environ.update({
    'JAVAHOME': java_path,
    'SLF4J': slf4j_path,
    'STANFORD_MODELS': stanford_models_paths,
    'CLASSPATH': classpath,
})

# Using the segmenter

In [None]:
# Create the Stanford segmenter and apply its default configuration for the
# 'ar' language code (presumably Arabic, per the notebook text above).
tokenizer = StanfordSegmenter()
tokenizer.default_config('ar')

## Segment a sentence (list of words)

In [None]:
# One sentence, already split into words, used as sample segmenter input.
sentence = [
    'من!',
    'أنت',
    'وقالها',
]

In [None]:
# Run the segmenter on the single sentence (a list of words) and display
# whatever it returns.
sentence_tekonized = tokenizer.segment(sentence)
print(sentence_tekonized)

## Segment a list of sentences (list of lists of words)

In [None]:
# Two sample sentences, each already split into words (a list of word lists).
sentences = [
    ['من!', 'أنت', 'وقالها'],
    ['من!', 'أنت', 'وقالها'],
]

In [None]:
# Run the segmenter on the whole list of sentences in one call and display
# whatever it returns.
sentences_tekonized = tokenizer.segment_sents(sentences)
print(sentences_tekonized)

## Segment a text file

In [None]:
def split(string):
    """Return the whitespace-separated pieces of ``string`` as a list."""
    pieces = string.split()
    return pieces
def tokenize_file(input_file,output_file):
    """Segment every line of a UTF-8 text file and write the result to a file.

    Each line of ``input_file`` is split on whitespace into a list of words,
    and all lines are passed in a single ``segment_sents`` call to the
    module-level ``tokenizer``. The returned text is split into lines and
    written to ``output_file`` joined by ``'\\n'`` (no trailing newline).

    The output file is opened only after the input has been read and
    segmented. A missing input file or a failing segmenter therefore never
    creates or truncates ``output_file``, and ``input_file`` and
    ``output_file`` may be the same path.

    Args:
        input_file: Path of the UTF-8 text file to read, one sentence per line.
        output_file: Path of the UTF-8 text file to write.

    Raises:
        OSError: If ``input_file`` cannot be read or ``output_file`` cannot
            be written (e.g. ``FileNotFoundError`` for a missing input).
    """
    # Read first: str.split() with no argument already ignores leading and
    # trailing whitespace, so no separate strip() is needed.
    with open(input_file, 'r', encoding='utf8') as source:
        sentences = [line.split() for line in source]
    print('Number of lines : ',len(sentences))

    # Segment before touching the output so a failure leaves it untouched.
    segmented_lines = tokenizer.segment_sents(sentences).splitlines()

    with open(output_file, 'w', encoding='utf8') as target:
        target.write('\n'.join(segmented_lines))
    print('Done ,see :',output_file)

In [None]:
# Segment ./texts.txt and write the result to ./texts_tekonized.txt.
tokenize_file(input_file='./texts.txt',output_file='./texts_tekonized.txt')

In [None]:
#http://nlp.stanford.edu:8080/parser/index.jsp