In [None]:
import pandas as pd
import random as rd
import json
import numpy as np
import re

**Downloading the fastText English model**

In [None]:
!pip install --upgrade --no-cache-dir gdown -q 


In [None]:
# Download the file with this Google Drive id; per the captured output it is
# saved as /content/cc.en.300.bin (7.24 GB).
!gdown 1MfsihS5EK0nsfaWD6eeXW-dTQ1Ms0Z8N 

Downloading...
From: https://drive.google.com/uc?id=1MfsihS5EK0nsfaWD6eeXW-dTQ1Ms0Z8N
To: /content/cc.en.300.bin
100% 7.24G/7.24G [00:49<00:00, 146MB/s]


In [None]:
!pip install fasttext -q

In [None]:
import fasttext.util

In [None]:
# Load the fastText binary model from the path the download cell wrote to.
english_fasttext_model = fasttext.load_model('/content/cc.en.300.bin')



In [None]:
# Sanity check: size of the word vectors produced by the loaded model.
english_fasttext_model.get_dimension()

300

In [None]:
# Load the input sentences together with their precomputed vectors.
# NOTE(review): the 'vector' column appears to be stored as text ("[array([...])]");
# create_pair_knn parses it with string splitting.
input_table = pd.read_csv("/content/input_table (1).csv")

In [None]:
# Preview the first five input rows.
input_table.head(5)

Unnamed: 0.1,Unnamed: 0,input_text,vector,input_id
0,0,باید حقوق دیگران را رعایت کنیم,"[array([ 4.51785745e-03, 1.42124370e-01, -9.4...",0
1,1,ادب یکی از صفات اخلاقی نیکو است که انسان باید ...,"[array([-5.06621338e-02, 1.33330569e-01, -1.4...",1
2,2,ما باید از غیبت کردن پرهیز کنیم,"[array([ 0.06224397, 0.06898309, -0.06263755,...",2
3,3,باید خاک وطن را دوست داشت,"[array([ 0.12196983, 0.07435557, -0.24708495,...",3
4,4,نباید براساس ظاهر افراد را قضاوت کرد,"[array([ 3.34001556e-02, 1.89239368e-01, -1.7...",4


In [None]:
def create_vocabulary(corpus):
    """Map every distinct whitespace-separated token in ``corpus`` to an integer id.

    Ids start at 0 and are handed out in order of first appearance. Tokens are
    taken verbatim from ``str.split()``, so case and attached punctuation are
    preserved (``"Be"`` and ``"be"`` are different entries).

    Args:
        corpus: iterable of strings (one sentence per item).

    Returns:
        dict mapping token -> id.
    """
    token_to_id = {}
    for sentence in corpus:
        for token in sentence.split():
            # setdefault inserts only unseen tokens; the current size is the next free id.
            token_to_id.setdefault(token, len(token_to_id))
    return token_to_id

In [None]:
# Load the candidate English quotes (columns include output_id and output_text)
# and display the frame.
english_output = pd.read_csv("/content/quotes_normalized.csv")
english_output

Unnamed: 0.1,Unnamed: 0,output_id,output_text
0,0,3223,Be yourself; everyone else is already taken
1,1,3224,"I'm selfish, impatient and a little insecure. ..."
2,2,3225,Two things are infinite: the universe and huma...
3,3,3226,"So many books, so little time"
4,4,3227,A room without books is like a body without a ...
...,...,...,...
2036,2036,5723,"Stepping onto a brand-new path is difficult, b..."
2037,2037,5724,Morality is simply the attitude we adopt towar...
2038,2038,5726,"In life, finding a voice is speaking and livin..."
2039,2039,5727,"Winter is the time for comfort, for good food ..."


In [None]:
# Token -> id mapping over all quotes; tokens come from plain whitespace splitting.
english_vocabulary = create_vocabulary(english_output['output_text'])

In [None]:
# Placeholder column: every row initially holds the np.ndarray class object itself
# (not an array); the embedding cell further down overwrites the entries.
english_output['vector'] = np.ndarray

In [None]:
# Inspect the token -> id mapping (this displays the entire dictionary).
english_vocabulary

{'Be': 0,
 'yourself;': 1,
 'everyone': 2,
 'else': 3,
 'is': 4,
 'already': 5,
 'taken': 6,
 "I'm": 7,
 'selfish,': 8,
 'impatient': 9,
 'and': 10,
 'a': 11,
 'little': 12,
 'insecure.': 13,
 'I': 14,
 'make': 15,
 'mistakes,': 16,
 'am': 17,
 'out': 18,
 'of': 19,
 'control': 20,
 'at': 21,
 'times': 22,
 'hard': 23,
 'to': 24,
 'handle.': 25,
 'But': 26,
 'if': 27,
 'you': 28,
 "can't": 29,
 'handle': 30,
 'me': 31,
 'my': 32,
 'worst,': 33,
 'then': 34,
 'sure': 35,
 'as': 36,
 'hell': 37,
 "don't": 38,
 'deserve': 39,
 'best': 40,
 'Two': 41,
 'things': 42,
 'are': 43,
 'infinite:': 44,
 'the': 45,
 'universe': 46,
 'human': 47,
 'stupidity;': 48,
 'not': 49,
 'about': 50,
 'So': 51,
 'many': 52,
 'books,': 53,
 'so': 54,
 'time': 55,
 'A': 56,
 'room': 57,
 'without': 58,
 'books': 59,
 'like': 60,
 'body': 61,
 'soul': 62,
 'who': 63,
 'say': 64,
 'what': 65,
 'feel,': 66,
 'because': 67,
 'those': 68,
 'mind': 69,
 'matter,': 70,
 'matter': 71,
 "You've": 72,
 'gotta': 73,
 'da

**Calculate the TF-IDF weights**

In [None]:
# Calculate the TF-IDF weights
from sklearn.feature_extraction.text import TfidfVectorizer
def calculate_tf_idf_weights(text , costum_vocabulary):
  """Return the TF-IDF matrix of ``text`` over a fixed, whitespace-token vocabulary.

  The vocabulary is expected to contain tokens exactly as produced by
  ``str.split()`` (case and punctuation preserved). The vectorizer therefore
  tokenizes with ``str.split`` too. With scikit-learn's default analyzer the
  text would be lower-cased, stripped of punctuation and of one-character
  tokens, so keys such as ``'Be'``, ``'yourself;'`` or ``'I'`` would never be
  matched and would always receive a weight of 0.

  Args:
    text: iterable of strings, one document per item.
    costum_vocabulary: mapping token -> column index (or iterable of tokens).

  Returns:
    Sparse matrix of shape (number of documents, vocabulary size); entry
    [d, j] is the TF-IDF weight of vocabulary token j in document d.
  """
  vectorizer = TfidfVectorizer(vocabulary=costum_vocabulary, analyzer=str.split)
  return vectorizer.fit_transform(text)

In [None]:
# TF-IDF matrix for the quotes: one row per quote, one column per vocabulary id.
english_output_tfidf = calculate_tf_idf_weights(english_output['output_text'] , english_vocabulary)



**Calculating the embedding of the English output**

In [None]:
# Sentence embedding = sum over the sentence's tokens of (fastText word vector * TF-IDF weight).
# The vectors are collected in a list and written with one column assignment:
# element-wise chained assignment (df['vector'][i] = ...) triggers
# SettingWithCopyWarning and is not guaranteed to modify the frame.
embedding_dim = english_fasttext_model.get_dimension()
sentence_vectors = []
for sentence_idx, sentence in enumerate(english_output['output_text']):
  # Start from a zero array so a sentence without tokens still yields a vector
  # (it is then caught by the zero-vector check that follows).
  sentence_vector = np.zeros(embedding_dim, dtype=np.float32)
  for word in sentence.split():
    word_index = english_vocabulary[word]
    # Row order of the TF-IDF matrix matches the row order of the frame.
    tfidf = english_output_tfidf[sentence_idx, word_index]
    sentence_vector += english_fasttext_model.get_word_vector(word) * tfidf
  sentence_vectors.append(sentence_vector)
english_output['vector'] = sentence_vectors

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_output['vector'][sentence_idx] = sum


In [None]:
# Display the quotes with their computed 'vector' column.
english_output

Unnamed: 0.1,Unnamed: 0,output_id,output_text,vector
0,0,3223,Be yourself; everyone else is already taken,"[0.021507587, 0.10527147, 0.031182537, 0.17292..."
1,1,3224,"I'm selfish, impatient and a little insecure. ...","[0.43437356, -0.012523319, 0.11872289, -0.0508..."
2,2,3225,Two things are infinite: the universe and huma...,"[-0.087947115, -0.034901757, 0.08202586, 0.232..."
3,3,3226,"So many books, so little time","[0.026140692, -0.2085052, -0.0907764, -0.04987..."
4,4,3227,A room without books is like a body without a ...,"[-0.056629535, -0.029509554, -0.027170897, 0.2..."
...,...,...,...,...
2036,2036,5723,"Stepping onto a brand-new path is difficult, b...","[-0.12285137, -0.10083759, -0.022194745, 0.283..."
2037,2037,5724,Morality is simply the attitude we adopt towar...,"[-0.17059729, -0.051743813, 0.07274948, 0.0740..."
2038,2038,5726,"In life, finding a voice is speaking and livin...","[0.0061662, -0.18240187, 0.1266683, 0.23183423..."
2039,2039,5727,"Winter is the time for comfort, for good food ...","[0.04502034, -0.14565398, -0.11081443, 0.30678..."


In [None]:
# Confirm that a stored embedding is a NumPy array.
type(english_output["vector"][0])

numpy.ndarray

In [None]:
# Find rows whose embedding has zero norm (v @ v == 0); cosine similarity is
# undefined for them. Index *labels* are recorded rather than positions, because
# DataFrame.drop (used in the next cell) selects rows by label.
indexes = []
for label, vector, text in zip(english_output.index,
                               english_output["vector"],
                               english_output["output_text"]):
  if vector @ vector.T == 0:
    print(vector)
    print(str(label))
    print(text)
    indexes.append(label)


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
857
Ø£Ø¬Ù…Ù„ Ø­Ø¨ Ù‡Ùˆ Ø§Ù„Ø°ÙŠ Ù†Ø¹Ø«Ø± Ø¹Ù„ÙŠÙ‡ Ø£Ø«Ù†Ø§Ø¡ Ø¨Ø­Ø«Ù†Ø§ Ø¹Ù† Ø´ÙŠØ¡ Ø¢

In [None]:
# Rows flagged by the zero-vector scan above.
indexes

[857, 979, 1209, 1373]

In [None]:
# Remove all flagged rows in a single call; drop() matches on index labels.
# Doing it in one step means the frame is either fully cleaned or left untouched.
english_output = english_output.drop(index=indexes)

In [None]:
# Re-run the zero-norm scan after the drop; nothing is printed when every
# remaining embedding is non-zero.
for position, (vector, text) in enumerate(zip(english_output["vector"],
                                              english_output["output_text"])):
  squared_norm = vector @ vector.T
  if squared_norm == 0:
    print(vector)
    print(str(position))
    print(text)

**Calculating the pairs with the KNN algorithm**

In [None]:
def _parse_vector(raw_vector):
  """Return ``raw_vector`` as a 1-D float32 array.

  Strings are expected to look like ``"[array([v0, v1, ...])]"``: the numbers
  are taken from between the second ``[`` and the first ``]`` after it and are
  separated by commas. Any non-string value is treated as array-like already.
  """
  if not isinstance(raw_vector, str):
    return np.asarray(raw_vector, dtype=np.float32).ravel()
  inner = raw_vector.split("[")[2].split("]")[0].strip()
  return np.array([float(token) for token in inner.split(",")], dtype=np.float32)


def create_pair_knn(k,model_name,input_table,output_table,isChaptered,chapter_number):
  """Pair every input row with its ``k`` most cosine-similar output rows.

  Args:
    k: number of neighbours per input row. ``k <= 0`` yields an empty table;
      ``k`` larger than the number of output rows returns all of them.
    model_name: value written to the ``model`` column of every pair.
    input_table: DataFrame with ``input_id``, ``input_text`` and ``vector``
      columns; ``vector`` may be the text form ``"[array([...])]"`` or array-like.
    output_table: DataFrame with ``output_id``, ``output_text`` and ``vector``
      (one NumPy array per row) columns.
    isChaptered: when true, a ``chapter_number`` column is added to the result.
    chapter_number: name of the column in ``output_table`` to copy into
      ``chapter_number``; only read when ``isChaptered`` is true.

  Returns:
    DataFrame with one row per (input, neighbour) pair. ``input_vector`` holds
    the raw, unparsed value from ``input_table``. Within one input the rows are
    in ascending similarity, and ``k`` numbers them 1..k in that order.

  Output rows with a zero-norm vector have no defined cosine similarity; they
  are ranked below every other row instead of producing NaN.
  """
  columns = ["model","input_id","input_text","output_id","output_text","k"]
  if isChaptered:
    columns.append("chapter_number")
  columns += ["input_vector","output_vector"]

  # Rows are gathered in a list and turned into a frame once: DataFrame.append
  # was removed in pandas 2.0 and copied the whole frame on every call.
  records = []
  if k > 0 and len(output_table) > 0:
    # Stack the output vectors once so each input needs a single matrix product.
    output_matrix = np.stack([np.asarray(v) for v in output_table['vector']])
    output_norms = np.linalg.norm(output_matrix, axis=1)

    for i in range(len(input_table)):
      input_row = input_table.iloc[i]
      query = _parse_vector(input_row['vector'])

      denominators = output_norms * np.sqrt(query @ query)
      # Pre-fill with -inf so rows with a zero denominator sort last.
      similarities = np.full(len(output_table), -np.inf)
      np.divide(output_matrix @ query, denominators,
                out=similarities, where=denominators > 0)

      # NOTE(review): argsort is ascending, so rank 1 goes to the *least* similar
      # of the top k and rank k to the best match. Kept as in the original code;
      # confirm whether consumers of the "k" column expect the reverse.
      nearest = similarities.argsort()[-k:]
      for rank, index in enumerate(nearest, start=1):
        output_row = output_table.iloc[index]
        record = {
          'model': model_name,
          "input_id": input_row['input_id'],
          "input_text": input_row['input_text'],
          "output_id": output_row['output_id'],
          "output_text": output_row['output_text'],
          "k": rank,
          "input_vector": input_row['vector'],
          "output_vector": output_row['vector'],
        }
        if isChaptered:
          record["chapter_number"] = output_row[chapter_number]
        records.append(record)

  return pd.DataFrame(records, columns=columns)

In [None]:
# Two nearest English quotes per input sentence; no chapter column is requested,
# so the last argument is not read.
fasttext_english_pair = create_pair_knn(2,"FastText",input_table,english_output, False,"none")

In [None]:
# Display the resulting input/quote pairs.
fasttext_english_pair

Unnamed: 0,model,input_id,input_text,output_id,output_text,k,input_vector,output_vector
0,FastText,0,باید حقوق دیگران را رعایت کنیم,3457,Be careful about reading health books. Some fi...,1,"[array([ 4.51785745e-03, 1.42124370e-01, -9.4...","[-0.054171547, 0.011109498, 0.00291707, 0.1554..."
1,FastText,0,باید حقوق دیگران را رعایت کنیم,3808,"For you, a thousand times ove",2,"[array([ 4.51785745e-03, 1.42124370e-01, -9.4...","[0.0025119493, -0.041613355, -0.054948397, 0.0..."
2,FastText,1,ادب یکی از صفات اخلاقی نیکو است که انسان باید ...,3509,I am not pretty. I am not beautiful. I am as r...,1,"[array([-5.06621338e-02, 1.33330569e-01, -1.4...","[0.008606222, -0.2919548, 0.08382909, -0.16086..."
3,FastText,1,ادب یکی از صفات اخلاقی نیکو است که انسان باید ...,4870,Quotation is a serviceable substitute for wit,2,"[array([-5.06621338e-02, 1.33330569e-01, -1.4...","[-0.0062647015, -0.1391012, -0.040977415, 0.04..."
4,FastText,2,ما باید از غیبت کردن پرهیز کنیم,4214,Even the darkest night will end and the sun wi...,1,"[array([ 0.06224397, 0.06898309, -0.06263755,...","[-0.21394153, 0.18217742, -0.04699292, 0.38874..."
...,...,...,...,...,...,...,...,...
95,FastText,47,هرکاری را می‌توان انجام داد به جز تغییر عادت‌ها,4954,"Give her hell from us, Peeves",2,"[array([ 2.06727982e-01, 1.37236342e-01, 5.7...","[-0.04417131, 0.012084901, -0.007480271, 0.130..."
96,FastText,48,باید از منافق دوری کنیم,5522,Sometimes you lose a battle. But mischief alwa...,1,"[array([ 0.04293227, 0.15004778, -0.16598155,...","[-0.12148016, -0.09852682, 0.07275429, 0.17304..."
97,FastText,48,باید از منافق دوری کنیم,4001,"Wow,"" Thalia muttered. ""Apollo is hot."" ""He's ...",2,"[array([ 0.04293227, 0.15004778, -0.16598155,...","[-0.034163248, 0.08340598, -0.03514251, 0.1613..."
98,FastText,49,لزوما رسیدن به چیزی که آرزو داریم به نفع ما نیست,4214,Even the darkest night will end and the sun wi...,1,"[array([-2.19862955e-03, 3.34896706e-02, 7.2...","[-0.21394153, 0.18217742, -0.04699292, 0.38874..."


**Saving the data into a csv file**

In [None]:
# Write the pair table (including its index column) to CSV.
fasttext_english_pair.to_csv("/content/quotes_pair.csv")