In [1]:
import pandas as pd 
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannahz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import os
os.getcwd() 
os.chdir('/Users/hannahz/Desktop/G5055_Practicum_Project2/Data/Text_Model_Data')

In [3]:
df = pd.read_excel('definition_filled.xlsx')

## clean the definition column

In [4]:
#remove \n
df['definition'] = df['definition'].replace(r'\n','', regex=True) 
#remove multiple whitespace
df['definition'] = df['definition'].replace('\s+', ' ', regex=True)
#remove white space in the beginning and end
df['definition'] = df['definition'].str.strip()
stop_words_l=stopwords.words('english')
# removing special characters and stop words from the text and lower case
df['definition']=df['definition'].apply(lambda x: " ".join(re.sub(r'[^a-zA-Z0-9$]','',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z0-9$]',' ',w).lower() not in stop_words_l) )

#remove white space 
df['definition'] = df['definition'].replace('\s+', ' ', regex=True)

In [13]:
# select the goal index out of index eg 1, 2
goal_num = df['Index']
df['goal_num'] = [re.findall(r'^(\d+).', goal)[0] for goal in goal_num]

## use tf-idf to do word embedding

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidfvectoriser=TfidfVectorizer()
tfidfvectoriser.fit(df['definition'])
tfidf_vectors=tfidfvectoriser.transform(df['definition'])

#calculate similarity based on the vectors
pairwise_similarities=cosine_similarity(tfidf_vectors.toarray())

## convert similarity to a dataframe¶

In [7]:
def similardocs_one(doc_id,similarity_matrix):
    #find the index for the doc_id
    index = df.iloc[doc_id]['Index']
    #initiated related index
    related_index = []
    similar_score = similarity_matrix[doc_id]
    score_sort = np.sort(similar_score)[::-1]
    #find related doc_id, sort from most similar to least similar 
    similar_ix=np.argsort(similarity_matrix[doc_id])[::-1] # sort doc_id from most similar to least 
    #find corresponded index for doc_id
    for ix in similar_ix:
        if ix==doc_id:
            continue
        related_index.append(df.iloc[ix]["Index"])
    #create a dataframe of
    relation_df = pd.DataFrame({'indicator': np.repeat(index, len(related_index)),
                                'related_indicator': related_index,
                                'similarity_score': score_sort[1:]})
    return relation_df

list_of_dataframes = [similardocs_one(x,pairwise_similarities) for x in range(len(df['Index']))]
outcome_tf_idf = pd.concat(list_of_dataframes)

In [8]:
outcome_tf_idf = outcome_tf_idf.rename(columns = {'similarity_score':'similarity_score_tf_idf','related_indicator':'related_indicator_tf_idf'})
outcome_tf_idf_new = outcome_tf_idf.groupby('indicator').head().reset_index(drop=True)

In [10]:
outcome_tf_idf_new.head()

Unnamed: 0,indicator,related_indicator_tf_idf,similarity_score_tf_idf
0,1.1.1,1.2.1,0.414994
1,1.1.1,10.2.1,0.193975
2,1.1.1,10.7.4,0.158748
3,1.1.1,16.8.1,0.155508
4,1.1.1,16.b.1,0.152244


## predict the belonged goal using word embedding result to evaluate the accuracy of word embedding?

In [9]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [10]:
# predict the belonged goal by the embedded definition, to check the accuracy of embedding
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors.toarray(), df['goal_num'], test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='weighted')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.579047619047619, 0.44, 0.4321818181818181, None)

## use bert to do word embeddings and calculate the similarity

In [11]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

document_embeddings = sbert_model.encode(df['definition'])

pairwise_similarities=cosine_similarity(document_embeddings)

In [12]:
list_of_dataframes = [similardocs_one(x,pairwise_similarities) for x in range(len(df['Index']))]
outcome_bert = pd.concat(list_of_dataframes)

In [13]:
outcome_bert = outcome_bert.rename(columns = {'similarity_score':'similarity_score_bert','related_indicator':'related_indicator_bert'})
                                              

In [16]:
outcome_bert.shape

(60270, 3)

In [17]:
outcome_bert_new = outcome_bert.groupby('indicator').head().reset_index(drop=True)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(document_embeddings, df['goal_num'], test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.5550732600732601, 0.44, 0.44217179311916155, None)

## using doc2vec

In [11]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/hannahz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
# get the data into the format neede for doc2vec (tagged data )
tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(df['definition'])]

In [32]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [34]:
model = Doc2Vec(vector_size = 300, alpha = 0.025,min_count=5,dm =1,epochs = 50) #alpha learning rate, 
model.build_vocab(tagged_data)

#train the model
model.train(tagged_data,total_examples=model.corpus_count,# number of documents 246
                        epochs=model.epochs)

#gain the word embeddings from the model
document_embeddings=np.zeros((246,300))
for i in range(len(document_embeddings)):
    document_embeddings[i]= model.docvecs[i]


  document_embeddings[i]= model.docvecs[i]


In [37]:
document_embeddings[1]

array([ 0.14669026,  0.27332506,  0.04717977, -0.02051441,  0.15534291,
       -0.42399359,  0.19401339,  0.23166928, -0.07226435,  0.36478797,
       -0.08121412, -0.6090855 , -0.28965861,  0.29935858,  0.09811122,
        0.01581472,  0.44516975,  0.12168184,  0.05832525,  0.09200405,
       -0.26412818,  0.17803343,  0.27489337,  0.2064822 ,  0.18687491,
       -0.0213223 , -0.15532492,  0.16765897, -0.31469011, -0.26587889,
        0.07205107,  0.1707761 ,  0.03952598,  0.15304857, -0.06317797,
        0.32680348,  0.39701566, -0.49648869,  0.02341006,  0.05154991,
       -0.16357231, -0.18082926, -0.23127496, -0.03820774,  0.35762927,
        0.01140099, -0.04247866,  0.1342148 , -0.19258121,  0.32343706,
       -0.14317697,  0.4729315 , -0.07715285,  0.16592571,  0.08860101,
        0.12184708,  0.1068109 , -0.57704014, -0.1756933 ,  0.0500539 ,
       -0.2759411 ,  0.10797344,  0.11577936, -0.20658927,  0.32633793,
        0.11390201,  0.25454697,  0.06832714, -0.17576191, -0.14

In [14]:
pairwise_similarities=cosine_similarity(document_embeddings)

list_of_dataframes = [similardocs_one(x,pairwise_similarities) for x in range(len(df['Index']))]
outcome_doc2vec = pd.concat(list_of_dataframes)
outcome_doc2vec = outcome_doc2vec.rename(columns = {'similarity_score':'similarity_score_doc2vec','related_indicator':'related_indicator_doc2vec'})

In [15]:
outcome_doc2vec_new = outcome_doc2vec.groupby('indicator').head().reset_index(drop=True)
outcome_doc2vec_new.head()

Unnamed: 0,indicator,related_indicator_doc2vec,similarity_score_doc2vec
0,1.1.1,1.2.1,0.836483
1,1.1.1,11.1.1,0.789737
2,1.1.1,3.3.1,0.736435
3,1.1.1,11.6.2,0.727911
4,1.1.1,8.5.2,0.709573


In [25]:
X_train, X_test, y_train, y_test = train_test_split(document_embeddings, df['goal_num'], test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='weighted')


  _warn_prf(average, modifier, msg_start, len(result))


(0.6265555555555555, 0.48, 0.48273726273726275, None)

## Word2vec

In [26]:
from gensim.models import Word2Vec

In [27]:
# generate a new cleaned column, which contain definition and index information
df['cleaned'] = df[['definition','Index']].apply(lambda x: ','.join(x.astype(str)), axis=1)
# get the data into the format neede for word2vec (list of list data )
sent = [row.split(',') for row in df['cleaned']]

In [28]:
model = Word2Vec(sent, min_count=1,workers=3, vector_size = 300, window =3, sg = 1)

In [29]:
#gain the word embeddings from the model
document_embeddings_word2vec=np.zeros((246,300))
for i in range(len(document_embeddings_word2vec)):
    document_embeddings_word2vec[i]= model.wv[df['Index'].iloc[i]].reshape((1,300))

In [28]:
pairwise_similarities=cosine_similarity(document_embeddings)

list_of_dataframes_word2vec = [similardocs_one(x,pairwise_similarities) for x in range(len(df['Index']))]
outcome_word2vec = pd.concat(list_of_dataframes_word2vec)

In [29]:
outcome_word2vec = outcome_word2vec.rename(columns = {'similarity_score':'similarity_score_word2vec','related_indicator':'related_indicator_word2vec'})
outcome_word2vec_new = outcome_word2vec.groupby('indicator').head().reset_index(drop=True)
outcome_word2vec_new.head()

Unnamed: 0,indicator,related_indicator_word2vec,similarity_score_word2vec
0,1.1.1,1.2.1,0.824847
1,1.1.1,11.1.1,0.746653
2,1.1.1,10.2.1,0.692284
3,1.1.1,8.5.2,0.690044
4,1.1.1,11.6.2,0.689793


## Create the Matrix

In [31]:
 # merge tf-idf and bert
outcome_full = pd.concat([outcome_tf_idf_new,outcome_bert_new],axis=1)
# add doc2vec
outcome_full= pd.concat([outcome_full,outcome_doc2vec_new],axis=1)
# add word2vec
outcome_full= pd.concat([outcome_full,outcome_word2vec_new],axis=1)

NameError: name 'outcome_bert_new' is not defined

In [32]:
outcome_full = outcome_full[['indicator', 'related_indicator_tf_idf', 'similarity_score_tf_idf',
        'related_indicator_doc2vec', 'similarity_score_doc2vec',
        'related_indicator_word2vec', 'similarity_score_word2vec']]

NameError: name 'outcome_full' is not defined

In [97]:
outcome_full =  outcome_full.iloc[: , 3:]
outcome_full.head(10)

Unnamed: 0,indicator,related_indicator_tf_idf,similarity_score_tf_idf,related_indicator_doc2vec,similarity_score_doc2vec,related_indicator_word2vec,similarity_score_word2vec
0,1.1.1,1.2.1,0.414994,1.2.1,0.875192,1.2.1,0.858479
1,1.1.1,10.2.1,0.193975,11.1.1,0.871263,11.1.1,0.828153
2,1.1.1,10.7.4,0.158748,11.6.2,0.761997,11.6.2,0.70017
3,1.1.1,16.8.1,0.155508,17.3.2,0.753025,3.3.2,0.699983
4,1.1.1,16.b.1,0.152244,3.3.1,0.748638,3.3.1,0.692252
5,1.2.1,1.1.1,0.414994,1.1.1,0.875192,1.1.1,0.858479
6,1.2.1,1.b.1,0.153754,11.1.1,0.844428,9.1.1,0.806331
7,1.2.1,1.a.1,0.148588,9.1.1,0.812365,11.1.1,0.785132
8,1.2.1,9.1.1,0.14298,3.9.3,0.720787,11.6.2,0.740767
9,1.2.1,1.2.2,0.137517,3.6.1,0.71743,3.6.1,0.731003


In [98]:
outcome_full.to_excel('method_outcome.xlsx')