In [2]:
import pandas as pd 
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannahz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.set_option('display.max_colwidth', None)

In [2]:
#download filled definition from google doc as fill.xlsx(csv shows weird character), read in to pandas
new_def = pd.read_excel('fill-3.xlsx')
# combined with the original gaol-target-indicator-definition
df = pd.read_csv('goal_target_indicator_definition.csv')
# use the new filled definition column
df['definition'] = new_def['definition']
#drop the pdf_name column
df = df.drop(['pdf_name'], axis = 1)


In [5]:
# save it as xlsx (csv shows weird character)
#df.to_excel('definition_filled.xlsx', index = False)

In [3]:
df = pd.read_excel('definition_filled.xlsx')

### clean the definition column

In [12]:
#remove \n
df['definition'] = df['definition'].replace(r'\n','', regex=True) 
#remove multiple whitespace
df['definition'] = df['definition'].replace('\s+', ' ', regex=True)
#remove white space in the beginning and end
df['definition'] = df['definition'].str.strip()
stop_words_l=stopwords.words('english')
# removing special characters and stop words from the text and lower case
df['definition']=df['definition'].apply(lambda x: " ".join(re.sub(r'[^a-zA-Z0-9$]','',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z0-9$]',' ',w).lower() not in stop_words_l) )

#remove white space 
df['definition'] = df['definition'].replace('\s+', ' ', regex=True)

In [8]:
# save the cleaned definitions as xlsx 
#df.to_excel('definition_cleaned.xlsx', index = False)

In [5]:
# select the goal index out of index eg 1, 2
goal_num = df['Index']
df['goal_num'] = [re.findall(r'^(\d+).', goal)[0] for goal in goal_num]

In [13]:
df['definition']

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

# use tf-idf to do word embedding

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidfvectoriser=TfidfVectorizer()
tfidfvectoriser.fit(df['definition'])
tfidf_vectors=tfidfvectoriser.transform(df['definition'])

#calculate similarity based on the vectors
pairwise_similarities=cosine_similarity(tfidf_vectors.toarray())

In [15]:
my_vec = pd.DataFrame(tfidf_vectors.toarray())
my_vec.columns = tfidfvectoriser.vocabulary_

In [16]:
my_vec.columns

Index(['indicator', 'proportion', 'population', 'international', 'poverty',
       'line', 'defined', 'percentage', 'living', 'less',
       ...
       'awaiting', 'dollar', 'developingcountries', 'press', 'andadministered',
       'indeveloping', 'housingcensus', 'andhousing', 'registers', 'records'],
      dtype='object', length=4128)

### convert similarity to a dataframe

In [17]:
def similardocs_one(doc_id,similarity_matrix):
    #find the index for the doc_id
    index = df.iloc[doc_id]['Index']
    #initiated related index
    related_index = []
    similar_score = similarity_matrix[doc_id]
    score_sort = np.sort(similar_score)[::-1]
    #find related doc_id, sort from most similar to least similar 
    similar_ix=np.argsort(similarity_matrix[doc_id])[::-1] # sort doc_id from most similar to least 
    #find corresponded index for doc_id
    for ix in similar_ix:
        if ix==doc_id:
            continue
        related_index.append(df.iloc[ix]["Index"])
    #create a dataframe of
    relation_df = pd.DataFrame({'indicator': np.repeat(index, len(related_index)),
                                'related_indicator': related_index,
                                'similarity_score': score_sort[1:]})
    return relation_df

list_of_dataframes = [similardocs_one(x,pairwise_similarities) for x in range(len(df['Index']))]
outcome = pd.concat(list_of_dataframes)

In [22]:
outcome.shape

(60270, 3)

In [21]:
outcome[outcome['indicator'] == '1.1.1']

Unnamed: 0,indicator,related_indicator,similarity_score
0,1.1.1,1.2.1,0.414994
1,1.1.1,10.2.1,0.193975
2,1.1.1,10.7.4,0.158748
3,1.1.1,16.8.1,0.155508
4,1.1.1,16.b.1,0.152244
5,1.1.1,10.3.1,0.152244
6,1.1.1,16.1.1,0.147457
7,1.1.1,16.1.2,0.138353
8,1.1.1,16.4.2,0.137039
9,1.1.1,10.6.1,0.122047


### predict the belonged goal using word embedding result to evaluate the accuracy of word embedding?

In [49]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [50]:
# predict the belonged goal by the embedded definition, to check the accuracy of embedding
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors.toarray(), df['goal_num'], test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.579047619047619, 0.44, 0.4321818181818181, None)

# use bert to do word embeddings and calculate the similarity

In [80]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

document_embeddings = sbert_model.encode(df['definition'])

pairwise_similarities=cosine_similarity(document_embeddings)


In [81]:
list_of_dataframes = [similardocs_one(x,pairwise_similarities) for x in range(len(df['Index']))]
outcome = pd.concat(list_of_dataframes)

In [82]:
outcome[outcome['indicator'] == '1.1.1']

Unnamed: 0,indicator,related_indicator,similarity_score
0,1.1.1,17.2.1,0.791766
1,1.1.1,10.a.1,0.767115
2,1.1.1,17.12.1,0.74359
3,1.1.1,10.c.1,0.738721
4,1.1.1,11.6.2,0.703645
5,1.1.1,10.2.1,0.69854
6,1.1.1,1.b.1,0.680495
7,1.1.1,17.19.2,0.660144
8,1.1.1,1.2.1,0.651749
9,1.1.1,10.4.2,0.650623


In [29]:
X_train, X_test, y_train, y_test = train_test_split(document_embeddings, df['goal_num'], test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.5550732600732601, 0.44, 0.44217179311916155, None)

## using doc2vec

In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/hannahz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# get the data into the format neede for doc2vec (tagged data )
tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(df['definition'])]

In [12]:
model = Doc2Vec(vector_size = 300, alpha = 0.025,min_count=5,dm =1,epochs = 50) #alpha learning rate, 
model.build_vocab(tagged_data)

#train the model
model.train(tagged_data,total_examples=model.corpus_count,# number of documents 246
                        epochs=model.epochs)

#gain the word embeddings from the model
document_embeddings=np.zeros((246,300))
for i in range(len(document_embeddings)):
    document_embeddings[i]= model.docvecs[i]

  document_embeddings[i]= model.docvecs[i]


In [15]:
document_embeddings

array([[ 0.0457717 ,  0.03620621,  0.03652849, ...,  0.04891679,
         0.03314833,  0.01949126],
       [ 0.18528946,  0.1902861 ,  0.03061791, ...,  0.12184305,
         0.27267104,  0.08396541],
       [ 0.21567142,  0.32836252, -0.0282975 , ...,  0.12741885,
         0.01630357, -0.1264587 ],
       ...,
       [ 0.02065087,  0.32834831,  0.13657658, ..., -0.15822282,
        -0.02846087, -0.12677099],
       [-0.1197174 ,  0.12338128, -0.05176521, ..., -0.11801478,
         0.0294875 , -0.0245464 ],
       [ 0.01328554,  0.10891927,  0.01199636, ...,  0.02036428,
        -0.00905238,  0.04833864]])

In [73]:
pairwise_similarities=cosine_similarity(document_embeddings)

list_of_dataframes = [similardocs_one(x,pairwise_similarities) for x in range(len(df['Index']))]
outcome = pd.concat(list_of_dataframes)

In [76]:
outcome[outcome['indicator'] == '1.1.1']

Unnamed: 0,indicator,related_indicator,similarity_score
0,1.1.1,1.2.1,0.835552
1,1.1.1,11.1.1,0.789279
2,1.1.1,8.5.2,0.70251
3,1.1.1,11.6.2,0.689911
4,1.1.1,3.3.1,0.675195
5,1.1.1,10.2.1,0.672549
6,1.1.1,3.3.2,0.66641
7,1.1.1,3.3.3,0.661915
8,1.1.1,3.3.5,0.650812
9,1.1.1,10.4.2,0.644448


In [77]:
X_train, X_test, y_train, y_test = train_test_split(document_embeddings, df['goal_num'], test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.5855555555555556, 0.46, 0.4632307692307693, None)

## some try out code
## clean indicator column

In [365]:
#remove \n
df['Indicator'] = df['Indicator'].replace(r'\n','', regex=True) 
#remove multiple whitespace
df['Indicator'] = df['Indicator'].replace('\s+', ' ', regex=True)
#remove white space in the beginning and end
df['Indicator'] = df['Indicator'].str.strip()

df['Indicator']=df['Indicator'].apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]','',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )
#remove white space 
df['Indicator'] = df['Indicator'].replace('\s+', ' ', regex=True)


In [236]:
# select the target index out of index eg 1.1, 1.2
index_num = df['Index']
df['target_num'] = [num[:-2] for num in index_num]

In [239]:
tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(df['Indicator'])]

In [240]:
len(tagged_data)

246

In [244]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [260]:
f1score = []
model = []
for vector_size in [500]:
    for alpha in[0.025]:
        for min_count in [5]:
            for dm in [1]:
                for epochs in [500]:
                    #record the model parameter
                    model.append(f'vector_size:{vector_size},alpha:{alpha},min_count:{min_count},dm:{dm},epochs:{epochs}')
                    #initiate the model
                    model_d2v = Doc2Vec(vector_size = vector_size ,alpha = alpha, min_count = min_count, dm = dm, epochs = epochs) #alpha learning rate, 
                    model_d2v.build_vocab(tagged_data)
                    #train the model
                    model_d2v.train(tagged_data,
                        total_examples=model_d2v.corpus_count,# number of documents 246
                        epochs=model_d2v.epochs)
                    #gain the word embeddings from the model
                    document_embeddings=np.zeros((df.shape[0],vector_size))
                    for i in range(len(document_embeddings)):
                        document_embeddings[i]= model_d2v.docvecs[i]
                    # evaluate the model
                    X_train, X_test, y_train, y_test = train_test_split(document_embeddings, df['goal_num'], test_size=0.33, random_state=42)
                    clf = RandomForestClassifier(max_depth=100, random_state=123)
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict(X_test)
                    f1score.append(f1_score(y_test, y_pred, average='weighted'))

model_score = pd.DataFrame({'model' : model,'f1score' : f1score})
model_score

  document_embeddings[i]= model_d2v.docvecs[i]


Unnamed: 0,model,f1score
0,"vector_size:500,alpha:0.025,min_count:5,dm:1,epochs:500",0.178189


In [192]:
f1score = []
model = []
for vector_size in [100,200,300]:
    for alpha in[0.020, 0.025]:
        for min_count in [1,3,5]:
            for dm in [0, 1]:
                for epochs in [20,100,500,1000]:
                    #record the model parameter
                    model.append(f'vector_size:{vector_size},alpha:{alpha},min_count:{min_count},dm:{dm},epochs:{epochs}')
                    #initiate the model
                    model_d2v = Doc2Vec(vector_size = vector_size ,alpha = alpha, min_count = min_count, dm = dm, epochs = epochs) #alpha learning rate, 
                    model_d2v.build_vocab(tagged_data)
                    #train the model
                    model_d2v.train(tagged_data,
                        total_examples=model_d2v.corpus_count,# number of documents 246
                        epochs=model_d2v.epochs)
                    #gain the word embeddings from the model
                    document_embeddings=np.zeros((df.shape[0],vector_size))
                    for i in range(len(document_embeddings)):
                        document_embeddings[i]= model_d2v.docvecs[i]
                    # evaluate the model
                    X_train, X_test, y_train, y_test = train_test_split(document_embeddings, df['goal_num'], test_size=0.33, random_state=42)
                    clf = RandomForestClassifier(max_depth=2, random_state=123)
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict(X_test)
                    f1score.append(f1_score(y_test, y_pred, average='weighted'))

model_score = pd.DataFrame({'model' : model,'f1score' : f1score})
model_score

  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_em

  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_em

  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_em

  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]
  document_embeddings[i]= model_d2v.docvecs[i]


Unnamed: 0,model,f1score
0,"vector_size:100,alpha:0.02,min_count:1,dm:0,epochs:20",0.0
1,"vector_size:100,alpha:0.02,min_count:1,dm:0,epochs:100",0.000938
2,"vector_size:100,alpha:0.02,min_count:1,dm:0,epochs:500",0.00061
3,"vector_size:100,alpha:0.02,min_count:1,dm:0,epochs:1000",0.000787
4,"vector_size:100,alpha:0.02,min_count:1,dm:1,epochs:20",0.0
5,"vector_size:100,alpha:0.02,min_count:1,dm:1,epochs:100",0.0
6,"vector_size:100,alpha:0.02,min_count:1,dm:1,epochs:500",0.000542
7,"vector_size:100,alpha:0.02,min_count:1,dm:1,epochs:1000",0.00053
8,"vector_size:100,alpha:0.02,min_count:3,dm:0,epochs:20",0.0
9,"vector_size:100,alpha:0.02,min_count:3,dm:0,epochs:100",0.000678


In [193]:
model_score

Unnamed: 0,model,f1score
0,"vector_size:100,alpha:0.02,min_count:1,dm:0,epochs:20",0.0
1,"vector_size:100,alpha:0.02,min_count:1,dm:0,epochs:100",0.000938
2,"vector_size:100,alpha:0.02,min_count:1,dm:0,epochs:500",0.00061
3,"vector_size:100,alpha:0.02,min_count:1,dm:0,epochs:1000",0.000787
4,"vector_size:100,alpha:0.02,min_count:1,dm:1,epochs:20",0.0
5,"vector_size:100,alpha:0.02,min_count:1,dm:1,epochs:100",0.0
6,"vector_size:100,alpha:0.02,min_count:1,dm:1,epochs:500",0.000542
7,"vector_size:100,alpha:0.02,min_count:1,dm:1,epochs:1000",0.00053
8,"vector_size:100,alpha:0.02,min_count:3,dm:0,epochs:20",0.0
9,"vector_size:100,alpha:0.02,min_count:3,dm:0,epochs:100",0.000678
