In [None]:
#imports
!pip install wikipedia
!pip install fasttext
!pip install gensim
!pip install nltk

import string
import pandas as pd
import wikipedia
import fasttext
import fasttext.util
from pprint import pprint as print
from gensim.models.fasttext import FastText as fasttext
from gensim.test.utils import datapath
from google.colab import drive
import xml.etree.ElementTree as ET
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
import fasttext.util
import sklearn as sk
from sklearn import svm
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [None]:
#initial setup
# One-off environment preparation: NLTK resources, the Italian fastText model,
# the Wikipedia language and the Google Drive mount used by the data-loading cell.

# NLTK resources: 'stopwords' backs the Italian stop-word list used when
# building document embeddings; 'punkt' is presumably what word_tokenize needs.
nltk.download('punkt')
nltk.download('stopwords')

# Fetch the Italian fastText vectors (if_exists='ignore' is passed so an
# existing download is not treated as an error) and load them into `ft`.
fasttext.util.download_model('it', if_exists='ignore') 
ft = fasttext.load_model('cc.it.300.bin')

# Query the Italian edition of Wikipedia.
wikipedia.set_lang("it")

# Mount Google Drive; the dataset files are read from '/content/drive/My Drive'.
drive.mount('/content/drive')


In [None]:
#dataset init

#Dataset of concept pairs, each tagged with a subject
#(physics, geometry, data-mining or precalculus).
#Each row is a pair of target and prerequisite concepts (A, B) labelled as follows:
#1 if B is a prerequisite of A;
#0 in all other cases.

df = pd.read_csv('/content/drive/My Drive/m_dataset.csv')

#The Wikipedia page of each concept found in the previous file.
#Each Wikipedia page is introduced by a `<doc>` element (with *id* and *url*)
#containing the title and the text of the corresponding page.
tree = ET.parse('/content/drive/My Drive/dataset.xml')
#root element of the parsed XML; its `doc` descendants are iterated when the
#document embeddings are built
pages = tree.getroot()

In [6]:
#create dictionary of doc embeddings
#a doc embedding is the average of the fastText word embeddings of the
#document's tokens (after punctuation and stop-word removal)

# `punct` is a plain string, so the `not in punct` test below is a substring
# check; the extra characters cover « » and the `` / '' quote tokens.
punct = string.punctuation + '«``»' + "''"
it_stop_words = nltk.corpus.stopwords.words('italian')
stemmer = nltk.stem.snowball.ItalianStemmer(True)  # not used in this cell

# set copy of the stop words: O(1) membership test per token instead of a list scan
stop_word_set = set(it_stop_words)


def embed_document(document):
    """Return the averaged fastText embedding of `document`.

    The text is tokenized with the Italian NLTK tokenizer, punctuation tokens
    and Italian stop words are dropped, the remaining tokens are lowercased and
    their word vectors (looked up in the global `ft` model) are averaged.

    Args:
        document: raw text (str).

    Returns:
        numpy float64 array of shape (300,). A zero vector is returned when no
        token survives filtering, instead of the NaN vector a 0/0 division
        would produce.
    """
    # apostrophes become spaces because the tokenizer does not split elided
    # forms such as "l'addizione"
    tokenized = nltk.tokenize.word_tokenize(document.replace("'", " "), "italian")
    # punctuation is tested on the raw token, then the token is lowercased
    no_punct = [token.lower() for token in tokenized if token not in punct]
    words = [token for token in no_punct if token not in stop_word_set]

    doc_embedding = np.zeros(300)
    if not words:
        return doc_embedding
    for word in words:
        doc_embedding += ft.get_word_vector(word)
    return doc_embedding / len(words)


doc_dict = {}

for content in pages.iter('doc'):
    title = content.find('title').text
    # ElementTree reports an empty <text/> element as None
    body = content.find('text').text or ''
    # the separating space keeps the last title word from fusing with the
    # first word of the page text
    doc_dict[title] = embed_document(title + ' ' + body)

In [None]:
#build the train / validation feature frames from the document embeddings

# NOTE(review): `train` and `validation` are not defined in any cell visible
# here; they must be DataFrames whose first three columns are
# (concept A, concept B, prerequisite label) -- confirm where they come from.


def build_pair_columns(pairs):
    """Turn concept pairs into a column store of concatenated embeddings.

    `doc_dict` already maps each concept title to its averaged 300-d document
    embedding, so the two vectors are looked up and concatenated directly
    (the previous code iterated over the embedding values as if they were
    words and passed them to the fastText model again).

    Args:
        pairs: DataFrame whose first three columns are concept A, concept B
            and the prerequisite label.

    Returns:
        dict with keys 0..599 (feature columns: A's embedding followed by B's)
        and 'prerequisite' (label column), each mapping to a list with one
        entry per row of `pairs`.
    """
    columns = {k: [] for k in range(600)}
    columns['prerequisite'] = []
    for _, row in pairs.iterrows():
        # .iloc gives positional access regardless of the column labels;
        # plain row[0] on a labelled Series is deprecated in pandas
        features = np.concatenate([doc_dict[row.iloc[0]], doc_dict[row.iloc[1]]])
        for i, val in enumerate(features):
            columns[i].append(val)
        columns['prerequisite'].append(row.iloc[2])
    return columns


#create train dataset
ft_train = build_pair_columns(train)

#create validation dataset
ft_validation = build_pair_columns(validation)

ft_df_train = pd.DataFrame(data = ft_train)
ft_df_validation = pd.DataFrame(data = ft_validation)

# columns 0..599 are the features, column 600 is 'prerequisite'
y_train = ft_df_train.iloc[:,600]
X_train = ft_df_train.iloc[:,:600]

X_test = ft_df_validation.iloc[:,:600]
y_test = ft_df_validation.iloc[:,600]

In [None]:
#datasets for each subject

# NOTE: this cell reads the raw pair table from `df` and then re-binds `df` to
# the embedded table, so re-running it requires re-running the dataset-init
# cell first.


def _empty_feature_columns():
    """Return an empty column store: feature lists 0..599 plus 'prerequisite'."""
    columns = {k: [] for k in range(600)}
    columns['prerequisite'] = []
    return columns


def split_train_validation(frame, frac=0.8, random_state=200):
    """Randomly split `frame` into (train, validation).

    `frac` of the rows are sampled (seeded by `random_state`) for training;
    the remaining rows form the validation set.
    """
    train_part = frame.sample(frac=frac, random_state=random_state)
    return train_part, frame.drop(train_part.index)


subject_df_dict = {
    subject_name: _empty_feature_columns()
    for subject_name in ("physics", "geometry", "data-mining", "precalculus")
}

df_embeddings = _empty_feature_columns()
df_embeddings['subject'] = []

for index, row in df.iterrows():
    # .iloc gives positional access; plain row[0] on a Series with string
    # labels is deprecated in pandas
    data = np.concatenate([doc_dict[row.iloc[0]], doc_dict[row.iloc[1]]])
    is_prerequisite = row.iloc[2]
    subject = row.iloc[3]

    # every pair goes into the global table and into its subject's table
    for i, val in enumerate(data):
        df_embeddings[i].append(val)
        subject_df_dict[subject][i].append(val)

    df_embeddings['prerequisite'].append(is_prerequisite)
    subject_df_dict[subject]['prerequisite'].append(is_prerequisite)
    df_embeddings['subject'].append(subject)

#split the various datasets into train and validation

df = pd.DataFrame(data = df_embeddings)
df_train, df_validation = split_train_validation(df)

physics_df = pd.DataFrame(data = subject_df_dict["physics"])
physics_train, physics_validation = split_train_validation(physics_df)

geometry_df = pd.DataFrame(data = subject_df_dict["geometry"])
geometry_train, geometry_validation = split_train_validation(geometry_df)

data_mining_df = pd.DataFrame(data = subject_df_dict["data-mining"])
data_mining_train, data_mining_validation = split_train_validation(data_mining_df)

precalculus_df = pd.DataFrame(data = subject_df_dict["precalculus"])
precalculus_train, precalculus_validation = split_train_validation(precalculus_df)


In [None]:
#subject classifier: predicts the subject of a concept pair from its embeddings

# column layout of df_train / df_validation:
# 0..599 features, 600 'prerequisite', 601 'subject'
y_subject_train = df_train.iloc[:,601]
X_subject_train = df_train.iloc[:,:600]

y_subject_test = df_validation.iloc[:,601]
X_subject_test = df_validation.iloc[:,:600]

#rbf SVM

#train
SVM = svm.SVC(max_iter=100000,C=500, gamma=10)
SVM.fit(X_subject_train, y_subject_train)

#validate
y_subject_pred = SVM.predict(X_subject_test)

# multi-class problem: precision / recall / F1 are macro-averaged over the four subjects
subject_labels = ['physics','geometry','data-mining','precalculus']

# '{:.3f}' = three decimals (the previous '{:3f}' only set a minimum width of 3)
print('Accuracy score: {:.3f}'.format(accuracy_score(y_subject_test, y_subject_pred)))
print('Precision score: {:.3f}'.format(precision_score(y_subject_test, y_subject_pred,average='macro',labels=subject_labels)))
print('Recall score: {:.3f}'.format(recall_score(y_subject_test, y_subject_pred,average='macro',labels=subject_labels)))
print('F1 score: {:.3f}'.format(f1_score(y_subject_test, y_subject_pred,average='macro',labels=subject_labels)))



In [None]:
#geometry classifier: prerequisite prediction restricted to geometry pairs

# column layout of geometry_train / geometry_validation:
# 0..599 features, 600 'prerequisite'
y_geometry_train = geometry_train.iloc[:,600]
X_geometry_train = geometry_train.iloc[:,:600]

y_geometry_test = geometry_validation.iloc[:,600]
X_geometry_test = geometry_validation.iloc[:,:600]

#rbf SVM

#train
SVM = svm.SVC(max_iter=100000,C=500, gamma=10)
SVM.fit(X_geometry_train, y_geometry_train)

#validate
y_geometry_pred = SVM.predict(X_geometry_test)

# '{:.3f}' = three decimals (the previous '{:3f}' only set a minimum width of 3)
print('Accuracy score: {:.3f}'.format(accuracy_score(y_geometry_test, y_geometry_pred)))
print('Precision score: {:.3f}'.format(precision_score(y_geometry_test, y_geometry_pred)))
print('Recall score: {:.3f}'.format(recall_score(y_geometry_test, y_geometry_pred)))
print('F1 score: {:.3f}'.format(f1_score(y_geometry_test, y_geometry_pred)))

In [None]:
#physics classifier: prerequisite prediction restricted to physics pairs

# column layout of physics_train / physics_validation:
# 0..599 features, 600 'prerequisite'
y_physics_train = physics_train.iloc[:,600]
X_physics_train = physics_train.iloc[:,:600]

y_physics_test = physics_validation.iloc[:,600]
X_physics_test = physics_validation.iloc[:,:600]

#rbf SVM

#train
SVM = svm.SVC(max_iter=100000,C=500, gamma=10)
SVM.fit(X_physics_train, y_physics_train)

#validate
y_physics_pred = SVM.predict(X_physics_test)

# '{:.3f}' = three decimals (the previous '{:3f}' only set a minimum width of 3)
print('Accuracy score: {:.3f}'.format(accuracy_score(y_physics_test, y_physics_pred)))
print('Precision score: {:.3f}'.format(precision_score(y_physics_test, y_physics_pred)))
print('Recall score: {:.3f}'.format(recall_score(y_physics_test, y_physics_pred)))
print('F1 score: {:.3f}'.format(f1_score(y_physics_test, y_physics_pred)))

In [None]:
#data-mining classifier: prerequisite prediction restricted to data-mining pairs

# column layout of data_mining_train / data_mining_validation:
# 0..599 features, 600 'prerequisite'
y_data_mining_train = data_mining_train.iloc[:,600]
X_data_mining_train = data_mining_train.iloc[:,:600]

y_data_mining_test = data_mining_validation.iloc[:,600]
X_data_mining_test = data_mining_validation.iloc[:,:600]

#random forest (this subject does not use the rbf SVM of the other cells)

#train
# max_features=None: every split considers all 600 features;
# random_state is fixed (same seed as the train/validation split) so the
# reported scores are reproducible across runs
RF = RandomForestClassifier(max_features = None, random_state = 200)
RF.fit(X_data_mining_train, y_data_mining_train)

#validate
y_data_mining_pred = RF.predict(X_data_mining_test)

# '{:.3f}' = three decimals (the previous '{:3f}' only set a minimum width of 3)
print('Accuracy score: {:.3f}'.format(accuracy_score(y_data_mining_test, y_data_mining_pred)))
print('Precision score: {:.3f}'.format(precision_score(y_data_mining_test, y_data_mining_pred)))
print('Recall score: {:.3f}'.format(recall_score(y_data_mining_test, y_data_mining_pred)))
print('F1 score: {:.3f}'.format(f1_score(y_data_mining_test, y_data_mining_pred)))

In [None]:
#precalculus classifier: prerequisite prediction restricted to precalculus pairs

# column layout of precalculus_train / precalculus_validation:
# 0..599 features, 600 'prerequisite'
y_precalculus_train = precalculus_train.iloc[:,600]
X_precalculus_train = precalculus_train.iloc[:,:600]

y_precalculus_test = precalculus_validation.iloc[:,600]
X_precalculus_test = precalculus_validation.iloc[:,:600]

#rbf SVM

#train
SVM = svm.SVC(max_iter=100000,C=500, gamma=10)
SVM.fit(X_precalculus_train, y_precalculus_train)

#validate
y_precalculus_pred = SVM.predict(X_precalculus_test)

# '{:.3f}' = three decimals (the previous '{:3f}' only set a minimum width of 3)
print('Accuracy score: {:.3f}'.format(accuracy_score(y_precalculus_test, y_precalculus_pred)))
print('Precision score: {:.3f}'.format(precision_score(y_precalculus_test, y_precalculus_pred)))
print('Recall score: {:.3f}'.format(recall_score(y_precalculus_test, y_precalculus_pred)))
print('F1 score: {:.3f}'.format(f1_score(y_precalculus_test, y_precalculus_pred)))