In [49]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer 
import numpy
import nltk, re, string, collections
from nltk.util import ngrams

In [50]:
def read_cleaned_data_set():
    file = open("output/tokens_no_stopwords_train.txt", encoding='utf-8')
    corpus =[]
    lines = file.readlines()
    for line in lines:
        result=line.rstrip('\n')
        if len(result)>0:
            result = result[1:len(result)-2]
            result=result.replace(",","")
            result= result.replace("'","")
        
            corpus.append(result)
    return corpus

In [51]:
corpus =read_cleaned_data_set()


In [52]:
def get_TF_IDF(docs): 
    #create tf-idf vector & enable idf to calculate them in one steps
    tfIdfVectorizer=TfidfVectorizer(use_idf=True)
    #get values for docs
    tfIdf = tfIdfVectorizer.fit_transform(docs)
    # get first tf-idfs for first doc only to be visualized
    # get features names mean get words
    df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
    #sorting them
    df = df.sort_values('TF-IDF', ascending=False)
    #show first 15 of them 
    print (df.head(15))
    


In [53]:
get_TF_IDF(corpus)

            TF-IDF
صيفي      0.243324
السيرنجة  0.243324
مزايا     0.243324
بولو      0.243324
اصغر      0.232552
مؤهل      0.232552
الشتاء    0.224910
لابس      0.224910
تصوير     0.218981
الابرة    0.210042
يعنى      0.206495
غيتس      0.203366
عز        0.198035
بيل       0.195723
احدى      0.191628




In [54]:
def get_tokens_freq(corpus):
    # corpus_new = corpus.split()
    tokens ={}
    for text in corpus:
        text = text.split()
        for value in text:
            if value in tokens :
                tokens[value]+=1
            else:
                tokens[value]=1
    return tokens

In [55]:
def get_n_grams(corpus, n_gram):
    all_n_grams=[]
    tokens ={}
    for text in corpus:
        padding = ["باد"]*n_gram
        text = text.split()
        text = padding+ text+padding
        n_grams = ngrams(text, n_gram)
        all_n_grams+=n_grams
    ngramFreq = collections.Counter(all_n_grams)
    return ngramFreq,tokens


In [56]:
def convert_ngrams_probablities( n,corpus):
  probablities =[]
  ngrams= get_n_grams(corpus,n)
  if n>1:
    n_minus_one_grams = get_n_grams(corpus, n-1)
  tokens =get_tokens_freq(corpus)
  for text in corpus:
    padding = ["باد"]*n
    text_value=text.split()
    text_value= padding+text_value+padding
    prob_list= []
    for index in range(n, len(text_value)-n) :
      n_gram_text = tuple(text_value[index-n:index ])
      n_minus_one_grams_text = tuple(text_value[index-n:index-1])
      nomiantor=1
      dominator=len(tokens)
      if n_gram_text in ngrams:
        nomiantor +=ngrams[n_gram_text]
      if n_minus_one_grams_text in n_minus_one_grams:
        dominator+= n_minus_one_grams[n_minus_one_grams_text]
      elif n==1:
        dominator += tokens[n_minus_one_grams_text]
      prob=nomiantor /dominator
      prob_list.append(prob) # for one doc
    probablities.append(prob_list) # for all docs
  return probablities
    

In [57]:
prob=convert_ngrams_probablities(3,corpus)
result, tokens=get_n_grams(corpus[0],2)
# result.
result.most_common(10)


[(('باد', 'باد'), 338),
 (('باد', 'ا'), 23),
 (('ا', 'باد'), 23),
 (('باد', 'ل'), 19),
 (('ل', 'باد'), 19),
 (('باد', 'ي'), 11),
 (('ي', 'باد'), 11),
 (('باد', 'ح'), 6),
 (('ح', 'باد'), 6),
 (('باد', 'و'), 6)]

In [58]:
print(len(prob))

6988


In [59]:
print(len(corpus))

6988
