In [61]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer 
import numpy
import nltk, re, string, collections
from nltk.util import ngrams

In [62]:
def read_cleaned_data_set(path="output/tokens_no_stopwords_train.txt"):
    """Load the cleaned corpus, one document per non-empty line.

    Every non-empty line has its first character and its last two characters
    removed, then all commas and single quotes are stripped, leaving a
    space-separated token string.
    NOTE(review): this presumably undoes a stringified Python list such as
    "['a', 'b'] " -- confirm against the code that writes the file.

    Args:
        path: Location of the UTF-8 encoded token file. Defaults to the
            training split used throughout this notebook.

    Returns:
        list[str]: One cleaned string per non-empty line, in file order.
    """
    corpus = []
    # The context manager guarantees the handle is closed, even on error.
    with open(path, encoding='utf-8') as file:
        for line in file:
            result = line.rstrip('\n')
            if len(result) > 0:
                result = result[1:len(result)-2]
                result = result.replace(",", "").replace("'", "")
                corpus.append(result)
    return corpus

In [63]:
# Cleaned training corpus: one space-separated token string per document.
corpus = read_cleaned_data_set()


In [86]:
def get_TF_IDF(docs):
    """Fit a TF-IDF model on ``docs`` and return the weights of the first document.

    Args:
        docs: Iterable of raw document strings.

    Returns:
        dict: Maps every term of the fitted vocabulary to its TF-IDF weight in
        ``docs[0]``, ordered by descending weight. Terms that do not occur in
        the first document map to 0.0.

    NOTE(review): only row 0 of the TF-IDF matrix is used, so the weights are
    specific to the first document -- confirm this is what callers expect.
    """
    # Build the vocabulary and the document-term TF-IDF matrix in one step.
    tfIdfVectorizer = TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(docs)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(tfIdfVectorizer, "get_feature_names_out"):
        feature_names = tfIdfVectorizer.get_feature_names_out()
    else:
        feature_names = tfIdfVectorizer.get_feature_names()

    # Column vector of the first document's weights, indexed by term.
    df = pd.DataFrame(tfIdf[0].T.todense(), index=feature_names, columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    return df.to_dict()['TF-IDF']

    


In [88]:
# Term -> TF-IDF weight lookup (weights come from the first document of the corpus).
dict_res = get_TF_IDF(corpus)



In [89]:
def assign_tf_idf(corpus, dict_res):
    """Replace every token of every document with its TF-IDF weight.

    Args:
        corpus: Iterable of documents, each a whitespace-separated token string.
        dict_res: Mapping from token to TF-IDF weight.

    Returns:
        list[list[float]]: One list per document, one weight per token, in the
        original token order. Tokens missing from ``dict_res`` get 0.0.
    """
    # A token can be absent from the lookup, e.g. when the lookup was built by a
    # vectorizer whose tokenisation differs from plain whitespace splitting.
    # Default to 0.0 instead of aborting the whole corpus with a KeyError.
    return [
        [dict_res.get(value, 0.0) for value in text.split()]
        for text in corpus
    ]

In [91]:
# Per-token TF-IDF weights for every document; preview the first document.
td_idf_sentences = assign_tf_idf(corpus, dict_res)
print(td_idf_sentences[0])

[0.1957229567998964, 0.20336576437522846, 0.09062342317799071, 0.03455527835278504, 0.21898140679919656, 0.2100423980628473, 0.2433243687609579, 0.18215192710826753, 0.22490962891549338, 0.2433243687609579, 0.2433243687609579, 0.19803457533604235, 0.22490962891549338, 0.153411767248694, 0.08285380460363088, 0.19162765821738278, 0.2433243687609579, 0.1880801492245643, 0.12206399741803829, 0.13886813179573265, 0.12363139771470806, 0.23255243649082546, 0.14599506192196865, 0.16198275746099583, 0.20649488907002883, 0.1705088724072879, 0.16198275746099583, 0.1705088724072879, 0.23255243649082546]


In [None]:
def get_tokens_freq(corpus):
    """Count how often each whitespace-separated token occurs in the corpus.

    Args:
        corpus: Iterable of document strings.

    Returns:
        dict: Token -> number of occurrences, keyed in first-seen order.
    """
    frequencies = {}
    for document in corpus:
        for token in document.split():
            frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies

In [None]:
def get_n_grams(corpus, n_gram):
    """Count the n-grams of every document in the corpus.

    Each document is split on whitespace and padded on both sides with
    ``n_gram - 1`` copies of the padding token before n-grams are extracted.

    Args:
        corpus: Iterable of document strings.
        n_gram: Size of the n-grams to extract.

    Returns:
        collections.Counter: n-gram tuple -> number of occurrences.
    """
    pad_tokens = ["باد"] * (n_gram - 1)
    ngramFreq = collections.Counter()
    for document in corpus:
        padded = pad_tokens + document.split() + pad_tokens
        # Counter.update consumes the n-gram generator and adds to the tallies.
        ngramFreq.update(ngrams(padded, n_gram))
    return ngramFreq


In [None]:
def convert_ngrams_probablities(n, corpus):
    """Compute maximum-likelihood n-gram probabilities for every document.

    For each n-gram window ``(w1, ..., wn)`` of a padded document the value is
    ``count(w1..wn) / count(w1..w(n-1))``, where both counts are taken over the
    whole corpus. For ``n == 1`` the context is empty, so the denominator is
    the total number of tokens in the corpus.

    Args:
        n: Order of the n-gram model (1 = unigram, 2 = bigram, ...).
        corpus: Iterable of documents, each a whitespace-separated token string.

    Returns:
        list[list[float]]: One list of probabilities per document.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    ngram_counts = get_n_grams(corpus, n)

    # Derive the context (prefix) counts from the n-gram counts themselves.
    # Counting (n-1)-grams separately pads each document with n-2 tokens, so
    # all-padding contexts are missing and the denominator becomes zero; summing
    # over the n-grams keeps numerator and denominator on the same padding.
    context_counts = collections.Counter()
    for gram, count in ngram_counts.items():
        context_counts[gram[:-1]] += count

    # Same padding token and width that get_n_grams applies to each document.
    padding = ["باد"] * (n - 1)
    probablities = []
    for text in corpus:
        text_value = padding + text.split() + padding
        prob_list = []
        # NOTE(review): this range is kept from the original implementation. It
        # stops n positions before the end of the padded sequence, so a document
        # with k tokens yields max(k - 2, 0) values rather than one per token --
        # confirm whether that is intended.
        for index in range(n, len(text_value) - n):
            n_gram_text = tuple(text_value[index - n:index])
            nomiantor = ngram_counts.get(n_gram_text, 0)
            dominator = context_counts.get(n_gram_text[:-1], 0)
            # Guard against an unseen context instead of dividing by zero.
            prob_list.append(nomiantor / dominator if dominator else 0.0)
        probablities.append(prob_list)  # probabilities of one document

    return probablities
    

In [None]:
# Trigram probabilities for every document in the corpus.
prob = convert_ngrams_probablities(3, corpus)

# Ten most frequent bigrams (displayed as the cell output).
result = get_n_grams(corpus, 2)
result.most_common(10)


In [None]:
# Trigram probabilities of the first document.
print(prob[0])

In [None]:
# Spot-check another document; index 100 requires the corpus to hold at least 101 documents.
print(prob[100])

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec

In [None]:
def get_word_embeddings(corpus, min_count=1, vector_size=100, window=5, sg = 1):
    """Train two Word2Vec models on the whitespace-tokenised corpus.

    Args:
        corpus: Sequence of documents, each a whitespace-separated token string.
        min_count: Forwarded to Word2Vec as ``min_count``.
        vector_size: Forwarded to Word2Vec as ``vector_size``.
        window: Forwarded to Word2Vec as ``window``.
        sg: Forwarded as ``sg`` to the second model only.

    Returns:
        tuple: ``(model1, model2)`` -- the first model is trained with the
        library's default ``sg`` setting (CBOW), the second with the given
        ``sg`` (skip-gram for the default value of 1).
    """
    sentences = [corpus[i].split() for i in range(len(corpus))]

    # Version note: gensim 4.x names the dimensionality argument `vector_size`;
    # gensim 3.x called it `size`. Rename the keyword if an older release
    # rejects it.
    shared_kwargs = dict(min_count=min_count, vector_size=vector_size, window=window)

    # CBOW model
    model1 = gensim.models.Word2Vec(sentences, **shared_kwargs)
    # Skip-gram model
    model2 = gensim.models.Word2Vec(sentences, sg=sg, **shared_kwargs)
    return model1, model2

In [None]:
# Train both embedding models on the full corpus using the function's default hyper-parameters.
cbowModel, sgModel = get_word_embeddings(corpus)

In [None]:
# Compare the reference word against two other words, first with the CBOW
# model and then with the skip-gram model.
for model_label, model in (("CBOW", cbowModel), ("Skip Gram", sgModel)):
    for other_word in ("كورونا", "صيفي"):
        print(
            f"Cosine similarity between 'لقاح' and '{other_word}' - {model_label} : ",
            model.wv.similarity("لقاح", other_word),
        )