In [1]:
# --- Data handling and progress reporting ---------------------------------
import pandas as pd
from tqdm import tqdm
# Registers the `progress_apply` method on pandas objects; the later cells
# call it on DataFrame columns, so this line has to run before them.
tqdm.pandas()

# --- Plotting, numerics and scikit-learn ----------------------------------
# NOTE(review): the scikit-learn classifiers and metrics imported below are not
# referenced directly in the notebook cells visible here; model training seems
# to be delegated to `thesis_helper` — confirm before pruning.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

# --- Project-local helpers -------------------------------------------------
# `functions` is the shared helper instance; the notebook uses its
# `model_performance`, `pos_tagger`, `dep_tagger` and `sequence_counter` members.
import thesis_helper
functions = thesis_helper.Thesis_Helper()

# --- Pretrained word embeddings --------------------------------------------
# Despite the variable name `word2vec`, the vectors loaded here are the
# "glove-wiki-gigaword-300" set fetched through the gensim downloader.
# NOTE(review): the first call presumably downloads the model and caches it
# locally, so this cell may need network access — confirm for the target machine.
from gensim.models import Word2Vec
import gensim.downloader as api
word2vec = api.load("glove-wiki-gigaword-300") 


In [2]:
import os

# Location of the annotated candidate-skill CSV (semicolon-separated, read in
# the next cell). Setting the THESIS_ANNOTATIONS_CSV environment variable
# overrides the default, so the notebook can run on another machine without
# editing this cell; when the variable is unset the original path is used.
annotations = os.environ.get(
    'THESIS_ANNOTATIONS_CSV',
    '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv',
)

In [3]:
# Read the annotated candidates; the file uses ';' as its field separator.
df = pd.read_csv(annotations, sep=';')

# Missing context values are replaced by the literal token 'empty' so that
# every row has text on both sides of the candidate.
df = df.fillna({'left_context': 'empty', 'right_context': 'empty'})

# Single-string view of each row: left context | candidate | right context.
df['concatenated'] = (
    df['left_context']
    + ' | '
    + df['candidate_skill']
    + ' | '
    + df['right_context']
)
print('Number of annotated rows ', len(df))

Number of annotated rows  20740


In [4]:
from nltk import RegexpTokenizer
def word2vec_vocab_check(text):
    """Tell whether every word token of ``text`` has a pretrained vector.

    The text is split into ``\\w+`` tokens and each token is looked up in the
    module-level ``word2vec`` vectors.

    Parameters
    ----------
    text : str
        Text to check.

    Returns
    -------
    bool
        True when the text yields at least one token and all of its tokens are
        in the embedding vocabulary; False otherwise (including text without
        any word token, for which no vector can be computed).
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Explicit membership test instead of "index and catch everything":
    # a bare `except:` also hid unrelated failures, e.g. the AttributeError
    # raised by the `.wv` alias on gensim versions where it no longer exists,
    # which would silently mark every row as out-of-vocabulary.
    return bool(tokens) and all(token in word2vec for token in tokens)

#Function to retrieve the summed pretrained (gensim) word vectors of a text
def word2vec_retriever_sum(text):
    """Return the element-wise sum of the word vectors of all tokens in ``text``.

    Parameters
    ----------
    text : str
        Text to embed; it is split into ``\\w+`` tokens.

    Returns
    -------
    numpy.ndarray
        One vector with the dimensionality of the loaded embeddings.

    Raises
    ------
    KeyError
        If a token is not in the embedding vocabulary. Callers are expected to
        filter such texts first, e.g. with ``word2vec_vocab_check``.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Index the loaded vectors directly: on a KeyedVectors object `.wv` is only
    # a deprecated alias for the object itself and is gone in gensim 4.
    return sum(word2vec[tokens])
    
def word2vec_retriever_average(text):
    """Return the element-wise mean of the word vectors of all tokens in ``text``.

    Parameters
    ----------
    text : str
        Text to embed; it is split into ``\\w+`` tokens.

    Returns
    -------
    numpy.ndarray
        One vector with the dimensionality of the loaded embeddings.

    Raises
    ------
    KeyError
        If a token is not in the embedding vocabulary. Callers are expected to
        filter such texts first, e.g. with ``word2vec_vocab_check``.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Index the loaded vectors directly: on a KeyedVectors object `.wv` is only
    # a deprecated alias for the object itself and is gone in gensim 4.
    wordvectors = word2vec[tokens]
    # One row per token, so dividing the row sum by the row count is the mean.
    return sum(wordvectors) / len(wordvectors)

In [5]:
# Drop every row in which the left context, the candidate skill or the right
# context contains a word without a pretrained vector. The three checks are
# combined into one boolean mask instead of temporary helper columns.
fully_in_vocab = (
    df['left_context'].progress_apply(word2vec_vocab_check)
    & df['candidate_skill'].progress_apply(word2vec_vocab_check)
    & df['right_context'].progress_apply(word2vec_vocab_check)
)
# .copy() keeps `df` an independent frame, so later column assignments do not
# operate on a slice of the unfiltered data.
df = df[fully_in_vocab].copy()
del fully_in_vocab

  word2vec.wv[tokens]
100%|██████████| 20740/20740 [00:00<00:00, 42088.18it/s]
100%|██████████| 20740/20740 [00:00<00:00, 53557.57it/s]
100%|██████████| 20740/20740 [00:00<00:00, 45956.55it/s]


In [11]:
mode = word2vec_retriever_average


def embed_column(texts, retriever, suffix):
    """Embed every text of a column into one feature column per vector dimension.

    Parameters
    ----------
    texts : pandas.Series
        Text column to embed.
    retriever : callable
        Maps one text to a 1-D vector (e.g. ``word2vec_retriever_average``).
    suffix : str
        Appended to every dimension number to form the column name, e.g.
        ``'_left'`` gives ``'0_left'``, ``'1_left'``, ...

    Returns
    -------
    pandas.DataFrame
        Same index as ``texts``; one string-named column per dimension.
    """
    vectors = texts.progress_apply(retriever)
    # Stack all vectors in one go; expanding each row with `apply(pd.Series)`
    # builds one Series per row and is far slower for the same values.
    features = pd.DataFrame(np.vstack(vectors.tolist()), index=texts.index)
    return features.add_suffix(suffix)


#Retrieving the word2vec vectors for both contexts and the candidate itself
x_left = embed_column(df['left_context'], mode, '_left')
x_middle = embed_column(df['candidate_skill'], mode, '_middle')
x_right = embed_column(df['right_context'], mode, '_right')

# Feature layout: left vector | sep | candidate vector | sep2 | right vector.
# Every block is named explicitly, so all column labels are strings. Before,
# the right-context block kept integer labels because the second join found no
# overlapping names to suffix, leaving `x` with mixed int/str column labels
# (rejected by recent scikit-learn when a DataFrame is passed to an estimator).
# Working on a copy keeps `x_left` free of the separator column.
x = x_left.copy()
# NOTE(review): 'sep'/'sep2' are constant columns (222 in every row) kept so the
# feature matrix holds the same values as before; a constant carries no signal
# and its scale differs from the embeddings — consider dropping them.
x['sep'] = 222
x = x.join(x_middle)
x['sep2'] = 222
x = x.join(x_right)

  wordvectors = word2vec.wv[tokens]
100%|██████████| 19253/19253 [00:02<00:00, 7502.91it/s] 
100%|██████████| 19253/19253 [00:13<00:00, 1446.60it/s]
100%|██████████| 19253/19253 [00:01<00:00, 13483.50it/s]
100%|██████████| 19253/19253 [00:06<00:00, 2757.49it/s]
100%|██████████| 19253/19253 [00:01<00:00, 15438.17it/s]
100%|██████████| 19253/19253 [00:07<00:00, 2554.33it/s]


In [12]:
%%time

y=df['label']
functions.model_performance(x, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 29.0min finished
 17%|█▋        | 1/6 [29:00<2:25:04, 1740.85s/it]

We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 47.9min finished
 33%|███▎      | 2/6 [1:16:55<2:40:30, 2407.57s/it]

We are at classifier  SGDClassifier(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   14.7s finished
 50%|█████     | 3/6 [1:17:10<1:05:45, 1315.05s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.3min finished
 67%|██████▋   | 4/6 [1:22:27<30:42, 921.28s/it]   [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.0min finished
 83%|████████▎ | 5/6 [1:35:27<14:30, 870.30s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 62.4min finished
100%|██████████| 6/6 [2:37:54<00:00, 1579.06s/it]


CPU times: user 956 ms, sys: 2.63 s, total: 3.59 s
Wall time: 2h 37min 54s


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.724219,0.681418,0.686828
1,GBC,0.740304,0.586837,0.618528
2,SGD,,,
3,RF,0.888826,0.70558,0.743193
4,SVM,0.260427,0.333333,0.292404
5,MLP,0.69626,0.694736,0.681072


In [13]:
# Syntactic count features of the candidate skill. For both the POS tags and
# the dependency labels the candidate is tagged, the tag sequence is passed to
# `functions.sequence_counter`, and the per-row result is stored on `df`.
# (presumably sequence_counter returns a mapping of tag -> count; confirm in
# thesis_helper)
df['pos'] = (
    df['candidate_skill']
    .progress_apply(functions.pos_tagger)
    .progress_apply(functions.sequence_counter)
)
# Expand each per-row result into one column per distinct key; keys missing in
# a row become 0.
pos_dicts = df['pos'].apply(pd.Series).fillna(0).astype(int)

df['dep'] = (
    df['candidate_skill']
    .progress_apply(functions.dep_tagger)
    .progress_apply(functions.sequence_counter)
)
dep_dicts = df['dep'].apply(pd.Series).fillna(0).astype(int)

# Suffixes only apply to names present in both frames. They now describe the
# frame they come from: POS columns get '_pos', dependency columns get '_dep'
# (previously '_gram' and '_pos', which labelled dependency counts as POS).
x_pos = pos_dicts.join(dep_dicts, lsuffix='_pos', rsuffix='_dep')

# NOTE(review): this extends `x` in place of the embedding-only matrix, so
# running the cell a second time joins the syntactic columns again under
# suffixed names. Re-create `x` from the embedding cell before re-running.
x = x.join(x_pos, lsuffix='_embedding', rsuffix='_pos')

100%|██████████| 19253/19253 [03:18<00:00, 96.78it/s] 
100%|██████████| 19253/19253 [03:35<00:00, 89.17it/s] 
100%|██████████| 19253/19253 [01:52<00:00, 170.68it/s]
100%|██████████| 19253/19253 [01:53<00:00, 169.91it/s]


In [None]:
# Index labels of the rows that contain at least one missing feature value.
x.index[x.isna().any(axis=1)]

In [14]:
# Second evaluation run: `x` now holds the embedding columns joined with the
# POS/dependency count columns, scored against the same 'label' column.
# NOTE(review): the saved table again shows NaN for SGD and the same SVM
# scores as the first run — confirm those two classifiers actually train.
functions.model_performance(x, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 34.1min finished
 17%|█▋        | 1/6 [34:08<2:50:41, 2048.28s/it]

We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 35.6min finished
 33%|███▎      | 2/6 [1:09:43<2:19:57, 2099.49s/it]

We are at classifier  SGDClassifier(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.0s finished
 50%|█████     | 3/6 [1:09:48<57:09, 1143.16s/it]  [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.6min finished
 67%|██████▋   | 4/6 [1:13:26<25:55, 777.77s/it] [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.0min finished
 83%|████████▎ | 5/6 [1:22:24<11:31, 691.22s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 38.3min finished
100%|██████████| 6/6 [2:00:45<00:00, 1207.52s/it]


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.728915,0.698318,0.696522
1,GBC,0.745919,0.585045,0.616304
2,SGD,,,
3,RF,0.882573,0.704823,0.744965
4,SVM,0.260427,0.333333,0.292404
5,MLP,0.699167,0.711972,0.69093
