In [1]:
import thesis_helper

# Location on disk of the trained word2vec model
word2vecpath = "/Users/ivowings/Sync/Thesis/Code/Word2vec models/word2vec_s100_CBOW.model"

# Project helper, constructed from the word2vec model path and the spaCy model name
th = thesis_helper.Thesis_Helper(word2vecpath, 'en_core_web_lg')

import pandas as pd
import spacy 
# Load the spaCy "en_core_web_lg" pipeline
nlp = spacy.load("en_core_web_lg")

from tqdm import tqdm
# Register tqdm with pandas so that `progress_apply` is available on Series/DataFrames
tqdm.pandas()
import matplotlib.pyplot as plt
import numpy as np

from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.model_selection import learning_curve

from statistics import mean

from nltk.tokenize import RegexpTokenizer

import time

# Random seed shared by the train/test split and the classifier
seed = 456
# A predicted class probability must exceed this value before the prediction
# is accepted as a label for an unlabeled row
threshold = 0.99


# Load the gensim Word2Vec model from the path configured once in `word2vecpath`
# (previously the same absolute path was spelled out a second time here)
word2vec = Word2Vec.load(word2vecpath)


In [2]:
# Functions to turn a piece of text into a single vector using the gensim
# word2vec model loaded in `word2vec`.

# Tokenizer shared by both retrievers: splits text into runs of word characters.
# Built once rather than on every call, because the retrievers are applied per row.
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def _known_token_vectors(text):
    """Return the word2vec vectors of the tokens in `text` as a 2-D array.

    Tokens that are not in the model vocabulary are skipped, because indexing
    `word2vec.wv` with an unknown token raises a KeyError. The result has one
    row per known token and `word2vec.wv.vector_size` columns; it has zero
    rows when the text contains no known token.
    """
    tokens = [token for token in _WORD_TOKENIZER.tokenize(text) if token in word2vec.wv]
    if not tokens:
        # float32 chosen to match gensim's vector dtype — TODO confirm for this model
        return np.zeros((0, word2vec.wv.vector_size), dtype=np.float32)
    return word2vec.wv[tokens]


def word2vec_retriever_sum(text):
    """Return the element-wise sum of the word vectors of the tokens in `text`.

    Returns a zero vector when `text` contains no token known to the model.
    """
    return _known_token_vectors(text).sum(axis=0)


def word2vec_retriever_average(text):
    """Return the element-wise mean of the word vectors of the tokens in `text`.

    Returns a zero vector when `text` contains no token known to the model
    (instead of dividing by zero / failing on an empty lookup).
    """
    wordvectors = _known_token_vectors(text)
    if len(wordvectors) == 0:
        return wordvectors.sum(axis=0)
    return wordvectors.mean(axis=0)

In [3]:
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory
path_context = "/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/n-grams context/Annotated/sentence_trigrams_lemmatized_context.csv"
path_trigrams = "/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/n-grams/Annotated/emscad_trigrams_lemmatized_taxonomy.csv"


# Context data: build one lower-cased text per row from left context, gram and right context
df_context = pd.read_csv(path_context,sep=';')
df_context['allgrams'] = (
    df_context['left_context'].str.lower()
    + ' ' + df_context['recovered_gram'].str.lower()
    + ' ' + df_context['right_context'].str.lower()
)
df_context = df_context[['allgrams', 'label']]

# Trigram data; presumably already carries 'allgrams', 'label' and 'length' columns — verify against the file
df_trigrams = pd.read_csv(path_trigrams, sep=',')

# Stack both sources into one frame with a fresh 0..n-1 index
df = pd.concat([df_trigrams,df_context]).reset_index(drop=True)
df = df.drop(columns=['length'])
#Replacing missing labels with -1
df['label'] = df['label'].fillna(-1).astype(int)

# One averaged word2vec vector per row of text
allgram_vectors = df['allgrams'].progress_apply(word2vec_retriever_average)
# Stack the per-row vectors into a single 2-D array in one step. Expanding each
# row with apply(pd.Series) builds one Series object per row and is far slower.
# Columns are the integer positions of the vector components.
x_vectors = pd.DataFrame(np.vstack(allgram_vectors.to_list()), index=df.index)

# Final frame: vector components as features plus the 'label' column
df = x_vectors.join(df['label'])

100%|██████████| 1056578/1056578 [00:38<00:00, 27186.85it/s]
100%|██████████| 1056578/1056578 [02:57<00:00, 5938.83it/s] 


In [4]:
%%time

import time

# Self-training loop: repeatedly fit a classifier on the labeled rows of `df`,
# predict the unlabeled rows (label == -1) and adopt every prediction whose
# probability exceeds `threshold` as that row's label. The loop stops when no
# unlabeled rows remain or when no prediction is confident enough.
starttime = time.time()

print('Starting the semi supervised training method')
while True:

    iterationstarttime = time.time()

    #Splitting df into unlabeled, labeled and labeled into x and y
    X_unlabeled = df[df.label==-1].drop(columns=['label'])
    labeled_data = df[df.label>-1]
    x = labeled_data.drop(columns=['label'])
    y = np.ravel(labeled_data[['label']])

    print('Number of unlabeled points ',X_unlabeled.shape[0])

    # Nothing left to label: stop before predict_proba is called on an empty frame
    if X_unlabeled.empty:
        print('All points have been labeled')
        break

    # NOTE(review): after the first iteration `labeled_data` also contains rows
    # labeled by the model itself, so this split evaluates partly on
    # self-assigned labels and the scores below may be optimistic.
    x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.2,random_state=seed)

    #Training classifier on labeled data
    GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
         max_depth=1, random_state=seed).fit(x_train, y_train)

    #Retrieving the scores from the model (predict once, score three times)
    y_test_predicted = GBC.predict(x_test)
    print('Precision Gradient Boosting ',precision_score(y_test, y_test_predicted, average='macro'))
    print("Recall Gradient Boosting ",recall_score(y_test, y_test_predicted, average='macro'))
    print("F1 score Gradient Boosting ",f1_score(y_test, y_test_predicted, average='macro'))

    #Retrieving the probabilities of the classes on the unlabeled dataset,
    #keeping the index of the unlabeled rows so labels can be written back to df
    prediction_probabilities = pd.DataFrame(
        GBC.predict_proba(X_unlabeled),
        columns=['no_skill', 'soft_skill', 'hard_skill'],
        index=X_unlabeled.index,
    )

    #Only selecting predictions which are above the threshold: a row qualifies
    #when its most probable class exceeds the threshold, and it receives that
    #class (columns of predict_proba follow the order of GBC.classes_).
    #Each row appears at most once, whatever the threshold value.
    most_probable_class = GBC.classes_[prediction_probabilities.to_numpy().argmax(axis=1)]
    is_confident = prediction_probabilities.max(axis=1) > threshold
    above_threshold_predictions = pd.DataFrame(
        {'label': most_probable_class}, index=prediction_probabilities.index
    )[is_confident]

    if above_threshold_predictions.empty:
        print('No predictions have been found, consider lowering the threshold')
        # Was `iterate == False` (a comparison with no effect), so the loop never ended
        break

    #Writing the predicted labels onto the matching rows of the existing df
    df.loc[above_threshold_predictions.index, 'label'] = above_threshold_predictions['label']
    df['label'] = df['label'].astype(int)

    remaining_unlabeled = int((df.label == -1).sum())

    iterationendtime = time.time()

    print('After completing the run', remaining_unlabeled, 'unlabeled points remain')
    print('Iteration took ',(iterationendtime- iterationstarttime)/60, ' minutes \n\n')

endtime = time.time()
print('Semi supervised method time ', (endtime - starttime)/60, ' minutes.')

Starting the semi supervised training method
Number of unlabeled points  1055578
Precision Gradient Boosting  0.5688820312107984
Recall Gradient Boosting  0.551522180948052
F1 score Gradient Boosting  0.5575757575757576
After completing the run 823113 
Iteration took  10.225259065628052  seconds 


Number of unlabeled points  823113


KeyboardInterrupt: 

In [None]:
# Write the current `df` to a CSV file; with to_csv's default settings the index is written too.
# NOTE(review): absolute, machine-specific output path.
df.to_csv("/Users/ivowings/Desktop/semi_supervised.csv")