In [1]:
import pandas as pd
import spacy 

from tqdm import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
import numpy as np

import fasttext

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.model_selection import learning_curve

from statistics import mean

from nltk.tokenize import RegexpTokenizer

import thesis_helper
# Project-local helper object (module `thesis_helper`, not shown in this notebook).
# Later cells call its `model_performance`, `pos_tagger`, `dep_tagger` and
# `sequence_counter` methods.
functions = thesis_helper.Thesis_Helper()

# Load the fastText model from a local .bin file.
# NOTE: this rebinds the name `fasttext` from the imported module to the loaded
# model object. Later cells rely on that (`fasttext[token]`), but
# `fasttext.load_model` is no longer reachable until the module is imported again.
# NOTE(review): the path is absolute and machine-specific; the commented-out line
# below appears to be the same model on a different machine.
fasttext = fasttext.load_model("/Users/ivowings/Downloads/cc.en.300.bin")
#fasttext = fasttext.load_model("C:/Users/Ivo/Downloads/cc.en.300.bin/cc.en.300.bin")





In [2]:
# Location of the annotated candidate-skill CSV; it is read with `sep=';'` in the
# next cell.
# NOTE(review): absolute, machine-specific path; the commented-out line below
# appears to be the same file on a different machine.
annotations = '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv'
#annotations = 'D:/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv'

In [3]:
# Read the semicolon-delimited annotation file.
df = pd.read_csv(annotations, sep=';')

# A missing context on either side is replaced by the literal token 'empty'.
for context_column in ('left_context', 'right_context'):
    df[context_column] = df[context_column].fillna('empty')

# Single text field laid out as: left context | candidate skill | right context.
separator = ' | '
df['concatenated'] = (
    df['left_context'] + separator + df['candidate_skill'] + separator + df['right_context']
)
print('Number of annotated rows ', len(df))

Number of annotated rows  20740


In [4]:
#Functions to pool the fastText word vectors of a text into a single vector
def fasttext_retriever_sum(text):
    """Return the element-wise sum of the fastText vectors of the words in ``text``.

    Parameters
    ----------
    text : str
        Text to embed. It is split into ``\\w+`` tokens, so punctuation and
        whitespace are discarded.

    Returns
    -------
    numpy.ndarray
        Sum of the vectors of all tokens. When ``text`` contains no ``\\w+``
        token (empty or punctuation-only string) a zero vector of the model's
        dimension is returned.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    if not tokens:
        # sum([]) would yield the int 0 rather than a vector, which produces
        # ragged rows once the result is expanded into feature columns.
        return np.zeros(fasttext.get_dimension(), dtype=np.float32)
    # `fasttext` is the loaded model object here; indexing it by a word returns
    # that word's vector.
    return sum(fasttext[token] for token in tokens)
    

def fasttext_retriever_average(text):
    """Return the element-wise mean of the fastText vectors of the words in ``text``.

    Parameters
    ----------
    text : str
        Text to embed. It is split into ``\\w+`` tokens, so punctuation and
        whitespace are discarded.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all tokens. When ``text`` contains no ``\\w+``
        token (empty or punctuation-only string) a zero vector of the model's
        dimension is returned instead of raising ``ZeroDivisionError``.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    if not tokens:
        # Without this guard the mean would be computed as 0 / 0.
        return np.zeros(fasttext.get_dimension(), dtype=np.float32)
    # `fasttext` is the loaded model object here; indexing it by a word returns
    # that word's vector.
    wordvectors = [fasttext[token] for token in tokens]
    return sum(wordvectors) / len(wordvectors)

# Pooling function applied to each text column when the embedding features are
# built; the alternative defined in this cell is fasttext_retriever_sum.
mode = fasttext_retriever_average

In [5]:

#Retrieving the fastText vectors for the left context, right context and candidate skill
def _embed_text_column(column_name):
    """Turn one text column of `df` into one feature column per vector dimension.

    `mode` pools every cell into a single vector; expanding that vector through
    `pd.Series` spreads it over integer-named columns (0, 1, 2, ...) while
    keeping the row index of `df`.
    """
    pooled_vectors = df[column_name].progress_apply(mode)
    return pooled_vectors.progress_apply(pd.Series)


x_left = _embed_text_column('left_context')
x_right = _embed_text_column('right_context')
x_middle = _embed_text_column('candidate_skill')

# Assemble the feature matrix as: left | sep | middle | sep2 | right.
# Each group is suffixed explicitly so that every column name is a string.
# Relying on join's suffix-on-overlap left the right-context columns named with
# bare integers, and recent scikit-learn versions reject frames whose column
# names mix strings and integers.
# Building from suffixed copies also leaves x_left untouched (previously `x`
# aliased it, so the 'sep' column was silently added to x_left as well).
# NOTE(review): 'sep' and 'sep2' are constant (222) columns kept for
# compatibility with earlier results; a constant column carries no information
# for the classifiers - confirm whether they are still wanted.
x = (
    x_left.add_suffix('_left')
    .assign(sep=222)
    .join(x_middle.add_suffix('_middle'))
    .assign(sep2=222)
    .join(x_right.add_suffix('_right'))
)

100%|██████████| 20740/20740 [00:01<00:00, 17312.04it/s]
100%|██████████| 20740/20740 [00:03<00:00, 5641.82it/s] 
100%|██████████| 20740/20740 [00:01<00:00, 18593.83it/s]
100%|██████████| 20740/20740 [00:03<00:00, 5783.48it/s] 
100%|██████████| 20740/20740 [00:00<00:00, 31904.80it/s]
100%|██████████| 20740/20740 [00:03<00:00, 5983.30it/s] 


Unnamed: 0,0_left,1_left,2_left,3_left,4_left,5_left,6_left,7_left,8_left,9_left,...,290,291,292,293,294,295,296,297,298,299
0,0.003516,0.035061,0.041578,0.065415,-0.083281,-0.031485,-0.048344,0.031984,-0.008720,-0.022635,...,0.064867,-0.067286,-0.248340,0.053523,0.087156,0.077296,-0.009412,-0.076232,-0.084672,0.051010
1,-0.008846,0.125282,0.063383,-0.059112,0.045255,0.036968,-0.144747,-0.002864,-0.008839,0.090855,...,0.000442,0.067527,-0.183973,0.083701,0.061003,0.054565,0.139225,0.110751,-0.075219,0.063857
2,-0.034497,-0.047639,0.046297,0.020177,-0.022965,-0.013309,-0.016021,-0.013432,0.016687,-0.014189,...,0.060722,-0.035507,-0.023820,-0.027755,-0.026023,-0.038051,-0.021103,-0.011591,-0.041129,0.036252
3,-0.027974,-0.004037,0.046416,0.043502,-0.054530,0.035032,0.023011,0.001187,-0.028867,0.018841,...,0.145383,-0.019659,-0.230063,0.016658,-0.042835,0.019429,0.017573,0.032204,0.033721,0.036456
4,-0.008733,0.010493,-0.030028,0.006583,-0.073132,-0.047919,-0.065180,0.020422,-0.057277,-0.034395,...,0.038851,0.009166,0.004940,0.018248,0.007943,0.013399,-0.033390,0.017307,-0.040692,0.028663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20735,0.107442,-0.257640,0.002259,0.013310,-0.030812,0.001785,0.146990,0.040267,-0.119377,-0.000322,...,0.042647,0.020171,-0.046246,0.017164,-0.019393,-0.004605,-0.006705,0.077649,-0.024118,0.001900
20736,-0.014222,0.009650,0.016613,0.029955,-0.039008,0.005737,0.026447,0.001555,-0.029694,-0.015564,...,0.029468,-0.000081,-0.024253,0.018406,-0.040625,-0.003001,0.002514,0.064020,-0.009169,0.040793
20737,0.000959,-0.051468,0.003372,0.027485,-0.055340,-0.010014,0.060629,0.000468,-0.029965,0.005528,...,0.060091,-0.002339,-0.013451,-0.004164,0.007344,-0.025554,0.025025,0.025374,-0.014409,0.002991
20738,0.007394,-0.013732,-0.060527,-0.076012,-0.035817,0.010145,0.044607,0.009710,-0.037294,-0.005947,...,0.088448,-0.008324,-0.139382,-0.000698,0.007778,-0.005039,0.025633,0.263251,-0.065509,-0.002673


In [6]:
%%time


# Evaluate the classifiers configured inside thesis_helper (not shown here) on
# the embedding-only feature matrix `x`, with df['label'] as the target.
# NOTE(review): the recorded run took about 1.5 hours and reports NaN scores for
# SGD and a recall of exactly 1/3 for SVM - worth investigating before relying
# on the comparison.
functions.model_performance(x, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 11.8min finished
 17%|█▋        | 1/6 [11:46<58:50, 706.03s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 29.9min finished
 33%|███▎      | 2/6 [41:42<1:29:50, 1347.64s/it]

We are at classifier  SGDClassifier(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.2s finished
 50%|█████     | 3/6 [41:46<36:41, 733.78s/it]   [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.3min finished
 67%|██████▋   | 4/6 [45:04<17:24, 522.39s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.4min finished
 83%|████████▎ | 5/6 [54:29<08:57, 537.67s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 34.7min finished
100%|██████████| 6/6 [1:29:11<00:00, 891.91s/it] 

CPU times: user 592 ms, sys: 580 ms, total: 1.17 s
Wall time: 1h 29min 11s





Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.767579,0.65215,0.686812
1,GBC,0.633194,0.516112,0.529205
2,SGD,,,
3,RF,0.881174,0.709054,0.75135
4,SVM,0.2572,0.333333,0.290359
5,MLP,0.734169,0.679819,0.690674


In [7]:
# Build count features from the part-of-speech and dependency tags of each
# candidate skill. Both tag types go through the same steps: tag the text, count
# the tags with `sequence_counter`, expand the counts into one integer column
# per tag, and use 0 where a tag does not occur in a row.
tag_count_frames = {}
for feature_name, tagger in (('pos', functions.pos_tagger), ('dep', functions.dep_tagger)):
    df[feature_name] = df['candidate_skill'].progress_apply(tagger)
    df[feature_name] = df[feature_name].progress_apply(functions.sequence_counter)
    expanded_counts = df[feature_name].apply(pd.Series)
    tag_count_frames[feature_name] = expanded_counts.fillna(0).astype(int)

pos_dicts = tag_count_frames['pos']
dep_dicts = tag_count_frames['dep']

# Column names shared by both tag sets are disambiguated by suffix.
x_pos = pos_dicts.join(dep_dicts, lsuffix='_gram', rsuffix='_pos')

# Append the tag counts to the embedding features.
# NOTE(review): `x` is overwritten with its own extension, so running this cell
# twice without rebuilding `x` joins the tag columns a second time.
x = x.join(x_pos, lsuffix='_embedding', rsuffix='_pos')

100%|██████████| 20740/20740 [01:44<00:00, 198.29it/s]
100%|██████████| 20740/20740 [01:42<00:00, 201.68it/s]
100%|██████████| 20740/20740 [01:50<00:00, 187.64it/s]
100%|██████████| 20740/20740 [01:50<00:00, 187.00it/s]


In [8]:
# Re-run the same classifier evaluation now that `x` also holds the POS and
# dependency tag counts joined in the previous cell.
functions.model_performance(x, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 17.7min finished
 17%|█▋        | 1/6 [17:41<1:28:27, 1061.57s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 33.0min finished
 33%|███▎      | 2/6 [50:40<1:46:44, 1601.21s/it]

We are at classifier  SGDClassifier(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.8s finished
 50%|█████     | 3/6 [50:45<43:36, 872.33s/it]   [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.2min finished
 67%|██████▋   | 4/6 [53:57<20:07, 603.88s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 10.2min finished
 83%|████████▎ | 5/6 [1:04:07<10:05, 605.86s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 41.7min finished
100%|██████████| 6/6 [1:45:52<00:00, 1058.69s/it]


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.788122,0.675633,0.706604
1,GBC,0.646789,0.521779,0.531584
2,SGD,,,
3,RF,0.890113,0.713031,0.755025
4,SVM,0.2572,0.333333,0.290359
5,MLP,0.768691,0.709561,0.722572
