In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer 
import numpy
import nltk, re, string, collections
from nltk.util import ngrams

In [3]:
def read_cleaned_data_set(path="output/tokens_no_stopwords_train.txt"):
    """Load the pre-tokenised corpus as a list of space-separated token strings.

    Each non-empty line of the file is reduced to plain text by dropping its
    first character and its last two characters, then deleting every comma
    and single quote. For a line such as ``['a', 'b']`` this yields ``a b``.

    Parameters
    ----------
    path : str, optional
        Location of the UTF-8 text file to read. Defaults to the path that
        was previously hard-coded in this function.

    Returns
    -------
    list of str
        One cleaned string per non-empty input line, in file order.
    """
    corpus = []
    # The context manager guarantees the handle is closed even if reading fails.
    with open(path, encoding='utf-8') as file:
        for line in file:
            result = line.rstrip('\n')
            if len(result) > 0:
                # Drop the leading character and the trailing two characters;
                # any quote left over by the slice is removed just below.
                result = result[1:len(result) - 2]
                result = result.replace(",", "")
                result = result.replace("'", "")
                corpus.append(result)
    return corpus

In [4]:
# Load the cleaned corpus: a list with one space-separated token string per
# non-empty line of the input file.
corpus =read_cleaned_data_set()

In [5]:
def get_TF_IDF(docs, top_n=15):
    """Fit TF-IDF on ``docs`` and print the top-weighted terms of the first document.

    Parameters
    ----------
    docs : iterable of str
        Documents handed to ``TfidfVectorizer.fit_transform``.
    top_n : int, optional
        How many of the highest-scoring terms to print (default 15).

    Returns
    -------
    None
        The ranking is printed rather than returned.
    """
    # use_idf=True makes the vectorizer compute TF and IDF in a single step.
    vectorizer = TfidfVectorizer(use_idf=True)
    weights = vectorizer.fit_transform(docs)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old name on older installations.
    feature_names = (
        getattr(vectorizer, "get_feature_names_out", None)
        or vectorizer.get_feature_names
    )

    # Only the first document's row is inspected; transpose it so that each
    # vocabulary term becomes one row of the frame.
    first_doc_scores = pd.DataFrame(
        weights[0].T.toarray(),
        index=feature_names(),
        columns=["TF-IDF"],
    )
    ranked = first_doc_scores.sort_values('TF-IDF', ascending=False)
    print(ranked.head(top_n))
    


In [6]:
# Print the highest TF-IDF terms of the first document in the corpus.
get_TF_IDF(corpus)

            TF-IDF
صيفي      0.243324
السيرنجة  0.243324
مزايا     0.243324
بولو      0.243324
اصغر      0.232552
مؤهل      0.232552
الشتاء    0.224910
لابس      0.224910
تصوير     0.218981
الابرة    0.210042
يعنى      0.206495
غيتس      0.203366
عز        0.198035
بيل       0.195723
احدى      0.191628


In [7]:
def get_n_grams(corpus, n_gram):
    """Count every n-gram of length ``n_gram`` across all documents.

    Each document is split on whitespace, its n-grams are produced with
    ``nltk.util.ngrams``, and the occurrences are tallied over the whole
    corpus.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose tokens are separated by whitespace.
    n_gram : int
        Number of tokens per n-gram.

    Returns
    -------
    collections.Counter
        Maps each n-gram (a tuple of tokens) to its total count.
    """
    frequencies = collections.Counter()
    for document in corpus:
        tokens = document.split()
        # Feeding the counter per document keeps first-seen order identical
        # to counting one concatenated list.
        frequencies.update(ngrams(tokens, n_gram))
    return frequencies


In [8]:
# Count bigrams (n=2) over the whole corpus and show the 10 most frequent.
result=get_n_grams(corpus,2)
result.most_common(10)

[(('لقاح', 'كورونا'), 1369),
 (('الجرعة', 'الاولى'), 665),
 (('الاولى', 'لقاح'), 574),
 (('يتلقى', 'الجرعة'), 424),
 (('جرعة', 'لقاح'), 400),
 (('لقاح', 'فايزر'), 333),
 (('فيروس', 'كورونا'), 313),
 (('الجرعة', 'الثانية'), 308),
 (('لقاح', 'كوفيد'), 239),
 (('الثانية', 'لقاح'), 234)]

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec

In [21]:
def get_word_embeddings(corpus, min_count=1, vector_size=100, window=5, sg = 1):
    """Train two Word2Vec models (CBOW and skip-gram) on the corpus.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose tokens are separated by whitespace.
    min_count : int, optional
        Passed to Word2Vec as ``min_count`` for both models.
    vector_size : int, optional
        Dimensionality of the word vectors for both models.
    window : int, optional
        Passed to Word2Vec as ``window`` for both models.
    sg : int, optional
        Training-algorithm flag for the second model only. The default of 1
        selects skip-gram; the first model is always trained as CBOW.

    Returns
    -------
    tuple
        ``(cbow_model, skip_gram_model)``.
    """
    import inspect  # local import: only needed for the version probe below

    # Word2Vec expects an iterable of token lists.
    sentences = [document.split() for document in corpus]

    # gensim 4.0 renamed the `size` argument to `vector_size`. Pick the name
    # the installed version accepts instead of failing on one of them.
    constructor_params = inspect.signature(gensim.models.Word2Vec.__init__).parameters
    size_keyword = "vector_size" if "vector_size" in constructor_params else "size"
    shared_options = {
        "min_count": min_count,
        "window": window,
        size_keyword: vector_size,
    }

    # CBOW model (sg=0 is Word2Vec's default, spelled out here for clarity).
    cbow_model = gensim.models.Word2Vec(sentences, sg=0, **shared_options)
    # Skip-gram model (when sg is left at its default of 1).
    skip_gram_model = gensim.models.Word2Vec(sentences, sg=sg, **shared_options)
    return cbow_model, skip_gram_model

In [24]:
# Train both embedding models on the corpus with the function's default
# hyperparameters; the first result is the CBOW model, the second skip-gram.
cbowModel, sgModel = get_word_embeddings(corpus)

In [25]:
# Compare the same two word pairs under each model.
# CBOW model: similarity of 'لقاح' with 'كورونا' and with 'صيفي'.
print("Cosine similarity between 'لقاح' " + "and 'كورونا' - CBOW : ", cbowModel.wv.similarity('لقاح', 'كورونا'))
print("Cosine similarity between 'لقاح' " + "and 'صيفي' - CBOW : ", cbowModel.wv.similarity('لقاح', 'صيفي'))
# Skip-gram model: the same two pairs, for side-by-side comparison.
print("Cosine similarity between 'لقاح' " + "and 'كورونا' - Skip Gram : ",	sgModel.wv.similarity('لقاح', 'كورونا'))
print("Cosine similarity between 'لقاح' " + "and 'صيفي' - Skip Gram : ", sgModel.wv.similarity('لقاح', 'صيفي'))

Cosine similarity between 'لقاح' and 'كورونا' - CBOW :  0.9993876
Cosine similarity between 'لقاح' and 'صيفي' - CBOW :  0.8644289
Cosine similarity between 'لقاح' and 'كورونا' - Skip Gram :  0.8749478
Cosine similarity between 'لقاح' and 'صيفي' - Skip Gram :  0.7042955
