### Remove Stop Words

In [None]:
def remove_stop_words(input_file, stop_words, output_file):
    """Copy ``input_file`` to ``output_file`` with stop words removed.

    Every input line is split on whitespace. Tokens found in ``stop_words``
    are dropped; the remaining tokens are lower-cased, stripped and written
    back joined by single spaces. One output line is produced per input
    line, so a line consisting only of stop words becomes an empty line.

    Args:
        input_file: Path of the UTF-8 text file to read.
        stop_words: Container supporting ``in`` that holds the tokens to drop.
        output_file: Path of the UTF-8 text file to write (overwritten).

    Raises:
        OSError: If either file cannot be opened (e.g. ``FileNotFoundError``
            when ``input_file`` does not exist).
    """
    # The input is opened first so that a missing/unreadable input does not
    # create or truncate the output file; the ``with`` statement closes both
    # handles even when an exception is raised part-way through.
    with open(input_file, 'r', encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as out:
        for line in src:
            # Membership is tested on the raw token, before lower-casing.
            kept = [w.lower().strip() for w in line.split() if w not in stop_words]
            out.write(" ".join(kept) + "\n")

# Location of the compiled corpora and the stop-word list used to filter them.
directory = "data/compiled/"
sw = read_stop_words("arabic_stopwords_compiled.txt")
print("Stop words: " + str(sw))

# One corpus file per century label, 100 through 1500 in steps of 100.
for century in range(100, 1600, 100):
    corpus_name = "".join([directory, "corpus_", str(century), "AH"])
    # The filtered copy is written next to its source with a "_nonstop" suffix.
    output_name = corpus_name + "_nonstop"
    print("Starting on corpus ", corpus_name)
    remove_stop_words(corpus_name, sw, output_name)

### Apply Lemmatization

In [120]:
import stanfordnlp

# Smoke test: load the Arabic tokenize+lemma pipeline and lemmatize one word.
nlp = stanfordnlp.Pipeline(lang='ar', processors="tokenize,lemma")
doc = nlp("إسماعيل")
# Gather the lemmas of every sentence in the document. Building the list in a
# single comprehension keeps all sentences (not just the last one) and leaves
# the name defined even when the document contains no sentences.
lemmatized_sentence = [word.lemma for sentence in doc.sentences for word in sentence.words]
lemmatized_sentence

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\Kenan\\stanfordnlp_resources\\ar_padt_models\\ar_padt_tokenizer.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': 'C:\\Users\\Kenan\\stanfordnlp_resources\\ar_padt_models\\ar_padt_lemmatizer.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
Done loading processors!
---


['إسماعيل']

In [113]:
import stanfordnlp
    
def apply_lemmatization(corpus_name, output_file, nlp=None, lines_per_block=10):
    """Lemmatize the corpus in ``corpus_name`` and write it to ``output_file``.

    Input lines are accumulated into blocks of ``lines_per_block`` lines, each
    block is joined with spaces and passed to ``nlp``, and every sentence of
    the returned document is written as one output line of space-separated
    lemmas. Progress is printed every 1000 input lines.

    Args:
        corpus_name: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write (overwritten).
        nlp: Optional callable taking a string and returning an object with a
            ``sentences`` attribute, where each sentence has ``words`` and
            each word has ``lemma``. When ``None`` an Arabic
            ``stanfordnlp.Pipeline`` with the tokenize and lemma processors is
            created; pass an existing pipeline to avoid reloading the models
            for every corpus.
        lines_per_block: Number of input lines sent to ``nlp`` at a time;
            must be a positive integer.

    Raises:
        OSError: If either file cannot be opened.
    """
    if nlp is None:
        nlp = stanfordnlp.Pipeline(lang='ar', processors="tokenize,lemma")

    def _write_lemmas(tokens, out):
        # Lemmatize one block and emit every sentence on its own line.
        doc = nlp(" ".join(tokens))
        for sentence in doc.sentences:
            out.write(" ".join(word.lemma for word in sentence.words) + "\n")

    count = 0
    block = []
    # ``with`` guarantees both handles are closed even if the pipeline raises.
    with open(corpus_name, "r", encoding="utf-8") as f, \
            open(output_file, 'w', encoding='utf-8') as out:
        for line in f:
            block.extend(line.strip().split())
            count += 1
            # A block holding at most one token is carried over and merged
            # into the next block instead of being processed on its own.
            if count % lines_per_block == 0 and len(block) > 1:
                _write_lemmas(block, out)
                block = []

            if count % 1000 == 0:
                print("Completed " + str(count) + " lines")

        # Flush the tokens left over when the number of lines is not a
        # multiple of lines_per_block; otherwise the tail would be dropped.
        if block:
            _write_lemmas(block, out)
        

# Lemmatize the stop-word-filtered corpus of a single century.
directory = 'data/compiled/'
century = 100
corpus_name = "".join([directory, "corpus_", str(century), "AH_nonstop"])
output_name = "".join([directory, "corpus_", str(century), "AH_lemmatized"])
apply_lemmatization(corpus_name, output_name)

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': 'C:\\Users\\Kenan\\stanfordnlp_resources\\ar_padt_models\\ar_padt_tokenizer.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': 'C:\\Users\\Kenan\\stanfordnlp_resources\\ar_padt_models\\ar_padt_lemmatizer.pt', 'lang': 'ar', 'shorthand': 'ar_padt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
Done loading processors!
---
Completed 1000 lines
Completed 2000 lines
Completed 3000 lines
Completed 4000 lines
Completed 5000 lines
Completed 6000 lines
Completed 7000 lines
Completed 8000 lines
Completed 9000 lines
Completed 10000 lines
Completed 11000 lines
Completed 12000 lines
Completed 13000 lines
Completed 14000 lines
Completed 15000 lines
Completed 16000 lines
Completed 17000 lines
Completed 18000 lines
Completed 19000 l

### Test Training

In [None]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess


def read_corpus(filename):
    """Lazily yield the ``simple_preprocess`` result for each line of a file.

    The file is read as UTF-8 and stays open until the generator is exhausted
    or closed.

    Args:
        filename: Path of the text file to read, one document per line.

    Yields:
        Whatever gensim's ``simple_preprocess`` returns for the line
        (a list of tokens).
    """
    with open(filename, "r", encoding="utf-8") as corpus:
        yield from map(simple_preprocess, corpus)

century = 700
filename = "data/compiled/corpus_" + str(century) + "AH_lemmatized_clean"
# NOTE(review): out_file is computed but the model is never saved in this
# cell - confirm whether a model_700.save(out_file) call is intended.
out_file = "data/gensim/corpus_" + str(century) + "AH_lemmatized.model"
print("Starting on corpus: " + filename)
# Materialise the corpus as a list: Word2Vec iterates over its input more
# than once (vocabulary scan, then training), which a one-shot generator
# cannot support. Defining it here also keeps the cell runnable on a fresh
# kernel instead of relying on a leftover `sentences` variable.
sentences = list(read_corpus(filename))
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm against the installed version.
model_700 = Word2Vec(sentences, size=300, window=8, min_count=10, workers=4)
print("Top 10 closest to شتى ", model_700.wv.most_similar("شتى", topn=10))

### Lemmatize