In [None]:
#create doc embeddings with fasttext by summing word embeddings
#(a weighted sum: the result is not divided by the document length)
#with a higher weight given to the words in A that make up the title of B

#intuition:
#if I find in A the words that make up the title in B, then there's
#a good chance that B is a prerequisite of A

#examples of intuition:
#the wikipedia page of "Light" is a prerequisite of the wikipedia page of "Total internal reflection"
#the wikipedia page talking about "Total internal reflection" mentions the word "light" many times
#"Magnet" is a prerequisite of "Magnetic field" 
#the word "magnet" appears various times in the "Magnetic field" page

#the weight given to a title word depends on the length of document A:
#the bigger the document, the bigger the weight (len(A) * word_percent)
#this prevents small documents from being too dependent on the word
#and prevents the word from becoming irrelevant in big documents

# Fraction of len(A) used as the weight of every word of A that also
# appears in the (English) title of B.
word_percent = 0.1

# Size of the accumulator each document embedding is summed into. The vectors
# returned by ft.get_word_vector must be broadcast-compatible with it.
_FT_DIM = 300


def _title_words(title):
    """Return the set of content words of a page title.

    The title is tokenized with NLTK, punctuation tokens are dropped, the
    remaining tokens are lowercased, and English stop words are removed.

    Args:
        title: raw page title string.

    Returns:
        A set of lowercase tokens (a set so that membership tests made once
        per document word are O(1)).
    """
    # Apostrophes are replaced by spaces before tokenizing.
    tokens = nltk.tokenize.word_tokenize(title.replace("'", " "), "italian")
    # The punctuation filter is applied before lowercasing; the stop-word
    # filter is applied after.
    lowered = [tok.lower() for tok in tokens if tok not in punct]
    return {tok for tok in lowered if tok not in en_stop_words}


def _sum_embeddings(words, boosted_words=frozenset(), boost=1):
    """Return the weighted sum of the fastText vectors of ``words``.

    Args:
        words: iterable of tokens making up one document.
        boosted_words: tokens whose vector is multiplied by ``boost``.
        boost: weight applied to tokens found in ``boosted_words``; every
            other token has weight 1.

    Returns:
        A numpy array of length ``_FT_DIM``; all zeros when ``words`` is
        empty. The sum is not divided by the number of words.
    """
    total = np.zeros(_FT_DIM)
    for word in words:
        weight = boost if word in boosted_words else 1
        total = total + weight * np.array(ft.get_word_vector(word))
    return total


def _build_pair_features(pairs):
    """Build the column-wise feature dict for a frame of (A, B, label) rows.

    For every row, document A is embedded with the words of B's English
    title boosted by ``len(A) * word_percent``, document B is embedded with
    uniform weights, and the two embeddings are concatenated.

    Args:
        pairs: DataFrame whose first three columns are, by position, the
            key of page A, the key of page B, and the prerequisite label.

    Returns:
        A dict mapping each feature index ``0 .. 2 * _FT_DIM - 1`` to the
        list of its values (one per row), plus a ``'prerequisite'`` key
        holding the list of labels. Insertion order puts the label last.
    """
    columns = {k: [] for k in range(2 * _FT_DIM)}
    columns['prerequisite'] = []
    for _, row in pairs.iterrows():
        # Explicit positional access: integer keys on a Series with
        # non-integer labels are deprecated in recent pandas versions.
        page_a, page_b, label = row.iloc[0], row.iloc[1], row.iloc[2]

        doc_a = doc_dict[page_a]
        title_words = _title_words(wiki_wiki.page(page_b).langlinks['en'].title)
        doc_b = doc_dict[page_b]

        # The boost is the same for every matching word of A, so it is
        # computed once per document.
        embedding_a = _sum_embeddings(doc_a, title_words, len(doc_a) * word_percent)
        embedding_b = _sum_embeddings(doc_b)

        for i, val in enumerate(np.concatenate([embedding_a, embedding_b])):
            columns[i].append(val)
        columns['prerequisite'].append(label)
    return columns


# create train dataset
ft_train = _build_pair_features(train)

# create validation dataset
ft_validation = _build_pair_features(validation)

# Turn the column-wise feature dicts into DataFrames (one row per page pair).
ft_df_train, ft_df_validation = (
    pd.DataFrame(data=feature_columns)
    for feature_columns in (ft_train, ft_validation)
)

# Columns 0..599 are the features; the column at position 600 is the target.
X_train, y_train = ft_df_train.iloc[:, :600], ft_df_train.iloc[:, 600]

# The validation split is exposed under the X_test / y_test names.
X_test, y_test = ft_df_validation.iloc[:, :600], ft_df_validation.iloc[:, 600]