In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

import thesis_helper
functions = thesis_helper.Thesis_Helper()

from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence

# init embedding
embedding = TransformerWordEmbeddings('bert-base-uncased')

#bert-large-uncased


In [2]:
annotations = '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv'
all_ngrams = '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal'

In [3]:
df = pd.read_csv(annotations,sep=',')

df['left_context'] = df['left_context'].str.strip()
df['candidate_skill'] = df['candidate_skill'].str.strip()
df['right_context'] = df['right_context'].str.strip()

#Filling any empty context columns with 'empty'
df['left_context'] = df['left_context'].fillna('empty')
df['right_context'] = df['right_context'].fillna('empty')

df['concatenated'] = df['left_context'] + ' | ' + df['candidate_skill'] + ' | ' + df['right_context']
print('Number of annotated rows ',df.shape[0])

df.head()

Number of annotated rows  20674


Unnamed: 0,left_context,candidate_skill,right_context,label,concatenated
0,able to work,flexible,schedule including am,0,able to work | flexible | schedule including am
1,roles played by,memcached,solr cassandra redis,2,roles played by | memcached | solr cassandra r...
2,influencing,communication,and,1,influencing | communication | and
3,customer driven and,motivated,to make people s,1,customer driven and | motivated | to make peop...
4,ability to,collaborate,with product,1,ability to | collaborate | with product


In [4]:
ngrams = functions.folder_reader(all_ngrams)
ngrams = ngrams['concatenated'].str.split('|',expand=True)
ngrams = ngrams.rename(columns={0:'left_context', 1:'candidate_skill', 2:'right_context'})

ngrams['left_context'] = ngrams['left_context'].str.strip()
ngrams['candidate_skill'] = ngrams['candidate_skill'].str.strip()
ngrams['right_context'] = ngrams['right_context'].str.strip()

ngrams['left_context'] = ngrams['left_context'].fillna('empty')
ngrams['right_context'] = ngrams['right_context'].fillna('empty')

ngrams['label'] = None

ngrams['concatenated'] = ngrams['left_context'] + ' | ' + ngrams['candidate_skill'] + ' | ' + ngrams['right_context']

ngrams = ngrams.drop_duplicates()
ngrams.head()
print(ngrams.shape[0])

5390784


In [5]:
df = df[['concatenated', 'label']]
ngrams = ngrams[['concatenated', 'label']]

In [6]:
df = pd.concat([df,
               ngrams])
df.head()
print(df.shape[0]) 

5411458


In [7]:
df = df.drop_duplicates(subset=['concatenated'],keep='first')
df.shape[0]



5397992

In [8]:
df=df[1:200001]
df

Unnamed: 0,concatenated,label
1,roles played by | memcached | solr cassandra r...,2
2,influencing | communication | and,1
3,customer driven and | motivated | to make peop...,1
4,ability to | collaborate | with product,1
5,skills high degree of | confidentiality | atte...,0
...,...,...
179462,of | java linux shell | scripting,
179463,raleigh nc | skills and attributes creative | ...,
179464,our brand new approach to | solving the proble...,
179465,through project from concept development | art...,


In [9]:
def bert_embedder(text):

    string = Sentence(text)
    embedding.embed(string)

    # Creating a list which stores the indexes of the | symbols
    bar_indexes = []
    #Creating a list which stores the embedding_tensors
    embedding_tensors = []

    #Checking the sentence object for the | symbols and storing their indexes
    for x in range(1,len(string)+1):
        if '|' in str(string.get_token(x)):
            bar_indexes.append(x)

    #Collecting the embeddings for every index between the indexes in bar_indexes
    word_embedding_indexes = range(bar_indexes[0]+1,bar_indexes[1])
    for x in word_embedding_indexes:
        embedding_tensors.append(pd.Series(string[x].embedding))
        embedding_tensors.append(222)

    #Removing last 666 from list
    embedding_tensors.pop()

    #Turning the elements from embedding_tensors into dataframe rows
    row = pd.DataFrame()
    for x in range(0,len(embedding_tensors)):
        row = row.append(pd.DataFrame(pd.Series(embedding_tensors[x])))

    row = row.transpose().reset_index(drop=True)
    
    #Changing the column names in order to make pd.concat work later
    row.columns = [x for x in range(0,len(row.columns))]
    return row

In [10]:
df['embeddings'] = df['concatenated'].progress_apply(bert_embedder)

100%|██████████| 200000/200000 [9:53:01<00:00,  5.62it/s]      


In [11]:
x_bert = pd.concat(df['embeddings'].tolist()).reset_index(drop=True)
x_bert = x_bert.fillna(0)

In [14]:
import numpy as np
test = x_bert.astype(np.float64)

In [22]:
df3 = df.reset_index(drop=True)
df3 = df3[['concatenated','label']]
df3.head()

Unnamed: 0,concatenated,label
0,roles played by | memcached | solr cassandra r...,2
1,influencing | communication | and,1
2,customer driven and | motivated | to make peop...,1
3,ability to | collaborate | with product,1
4,skills high degree of | confidentiality | atte...,0


In [29]:
finalframe = df3.join(x_bert)
finalframe['label'] = finalframe['label'].fillna(-1)
finalframe.head()

Unnamed: 0,concatenated,label,0,1,2,3,4,5,6,7,...,3065,3066,3067,3068,3069,3070,3071,3072,3073,3074
0,roles played by | memcached | solr cassandra r...,2,0.742518,-1.019007,-0.139033,-0.602936,0.61055,0.501765,0.369847,-0.171284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,influencing | communication | and,1,1.05136,-0.897738,0.213438,-1.248636,1.270297,0.269889,1.188446,0.272905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,customer driven and | motivated | to make peop...,1,1.107537,-1.146388,0.437793,-0.75668,1.024668,0.038017,0.694363,0.224228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ability to | collaborate | with product,1,1.061473,-0.770206,0.249639,-0.916371,1.147829,-0.024134,1.258022,0.663062,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,skills high degree of | confidentiality | atte...,0,0.747921,-0.245444,0.009621,-0.593414,0.768309,-0.077666,1.042266,0.319561,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
is_NaN = finalframe.isnull()
row_has_NaN = is_NaN.any(axis=1)
finalframe[row_has_NaN]

In [2]:
finalframe = pd.read_csv('/Users/ivowings/Desktop/bert_embeddings_200k.csv')
finalframe.head()

Unnamed: 0.1,Unnamed: 0,concatenated,label,0,1,2,3,4,5,6,...,3065,3066,3067,3068,3069,3070,3071,3072,3073,3074
0,0,roles played by | memcached | solr cassandra r...,2,0.742518,-1.019007,-0.139033,-0.602936,0.61055,0.501765,0.369847,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,influencing | communication | and,1,1.05136,-0.897738,0.213438,-1.248636,1.270297,0.269889,1.188446,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,customer driven and | motivated | to make peop...,1,1.107537,-1.146388,0.437793,-0.75668,1.024668,0.038017,0.694363,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,ability to | collaborate | with product,1,1.061473,-0.770206,0.249639,-0.916371,1.147829,-0.024134,1.258022,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,skills high degree of | confidentiality | atte...,0,0.747921,-0.245444,0.009621,-0.593414,0.768309,-0.077666,1.042266,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
x = finalframe.drop(columns=['concatenated', 'label','Unnamed: 0'])
y = finalframe['label']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3065,3066,3067,3068,3069,3070,3071,3072,3073,3074
0,0.742518,-1.019007,-0.139033,-0.602936,0.61055,0.501765,0.369847,-0.171284,-0.139418,-0.484624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.05136,-0.897738,0.213438,-1.248636,1.270297,0.269889,1.188446,0.272905,-0.330899,-0.3902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.107537,-1.146388,0.437793,-0.75668,1.024668,0.038017,0.694363,0.224228,-0.571021,-0.944707,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.061473,-0.770206,0.249639,-0.916371,1.147829,-0.024134,1.258022,0.663062,-0.407397,-0.978905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.747921,-0.245444,0.009621,-0.593414,0.768309,-0.077666,1.042266,0.319561,-0.399776,-0.861208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.semi_supervised import SelfTrainingClassifier

#use 0 as max_iter to only run the classifier once
self_training_model = SelfTrainingClassifier(functions.LR, threshold=0.99, max_iter=0, verbose=True)
self_training_model.fit(x,y)

End of iteration 1, added 146902 new labels.


KeyboardInterrupt: 

In [9]:
len(self_training_model.transduction_)

200000

In [13]:
banaan = pd.DataFrame(self_training_model.transduction_,columns=['label'])

In [14]:
banaan[banaan.label==-1]

Unnamed: 0,label
20673,-1
20686,-1
20692,-1
20693,-1
20696,-1
...,...
199953,-1
199957,-1
199978,-1
199983,-1
