In this notebook, we embed review sentences and compute their similarity to the **reference** sentences (ODI scales).

In [None]:
# Root folders for input data and cached sentence embeddings (relative to this notebook).
DATAROOT = '../data/'
EMBEDDINGROOT = DATAROOT + 'embeddings/'

In [None]:
import pandas as pd
import numpy as np

# utils has several different functions we use throughout the notebook
from utils import read_config

In [None]:
# Construct under study. Only 'odi' has a reference file mapped in `filename`;
# any other value raises KeyError at the lookup below until an entry is added.
construct = 'odi'

# Reference file (relative to DATAROOT, without the ".tsv" extension) per construct.
filename = dict(odi="ODI/ODI")

reference = pd.read_csv(
    DATAROOT + "%s.tsv" % filename[construct],
    sep="\t",
)

# Category prefix: blank for sustainability, "odi_boredom_" for the ODI/boredom runs.
category = "odi_boredom_"

In [None]:
# Pull the per-construct settings out of the project configuration.
config = read_config()
final_goals, category_shorthands, construct_references = (
    config['COUNTS'],
    config['SHORTHANDS'],
    config['REFERENCES'],
)
# Column of `reference` holding the text to embed, and the column used for display.
definition = config['DEFINITIONS'][construct]
reference_name = construct_references[construct]
final_goals

In [None]:
"""
embed_params is a Python module that provides the functions imported below
to embed sentences and find the cosine similarity between embeddings
"""
from embed_params import embed, find_similarity

In [None]:
# Embed the reference texts (the `definition` column of `reference`) with the 'sbert' embedding type.
reference_embeddings = embed(reference[definition].values, embedding_type = 'sbert')

In [None]:
# example: embed a single hand-written sentence so it can be compared
# against every reference embedding in the next cell
random_sentences = ['i\'m really sad about everything']
random_embeddings = embed(random_sentences, embedding_type = 'sbert')

In [None]:
# Print each reference text next to its similarity with the example sentence.
for i, reference_embedding in enumerate(reference_embeddings):
    print(
        reference[definition].values[i],
        find_similarity(random_embeddings, reference_embedding),
    )

In [None]:
# Name of the review dataset; it is also the prefix of the derived sentence files.
review = "review_us_master"
data = pd.read_csv("".join([DATAROOT, review, ".csv"]))

In [None]:
# number of rows in the review table loaded above
len(data)

In [None]:
"""
Reload the pre-split review sentences, one table per review side ('pros' / 'cons').

Expected structure of each file:
pro_sent_id, pro, pro_sent, company_id

(the sentence splitting was done once and saved, so the files are only read here)
"""

import nltk.data

# NOTE(review): the tokenizer is loaded but not used in this cell, since the
# sentences are read from disk; confirm whether it is still needed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

sents = {}
sent_data = {}

for text in ['pros', 'cons']:
    sents[text] = []
    # Read the tab-separated sentence file, drop rows with missing values and
    # renumber the rows (the previous index is kept as an 'index' column).
    sent_data[text] = (
        pd.read_csv(DATAROOT + "%s_%s_sentences.csv" % (review, text), sep="\t")
        .dropna()
        .reset_index()
    )

In [None]:
# (number of 'pros' sentences, number of 'cons' sentences)
tuple(map(len, (sent_data['pros'], sent_data['cons'])))

In [None]:
# preview the first ten rows of the 'pros' sentence table
sent_data['pros'].head(10)

In [None]:
## do the embedding (no need to do it again)
# for text in ['pros', 'cons']:
#     sent_data[text]["%s_sent_embedded" %text] = list(embed(sent_data[text]['%s_sent' %text],
#                                                            embedding_type = "sbert"))

In [None]:
# embed once and save

embedding_type = 'sbert'

# NOTE: the save path is built from `review` (not the `filename` dict, which would be
# formatted as a dict repr) so that it matches the
# '<review>_<text>_<embedding_type>_sent_embeddings.npy' names read when loading.
# EMBEDDINGROOT already ends with '/', so no extra separator is needed.
# for text in ['pros', 'cons']:
#     with open(EMBEDDINGROOT+"%s_%s_%s_sent_embeddings.npy" %(review, text, embedding_type), 'wb') as f:
#         np.save(f, sent_data[text]["%s_sent_embedded" %text])

In [None]:
# Load the cached sentence embeddings and attach them as a '<text>_sent_embedded' column.
# The file name is derived from `review`, matching how the sentence files are located,
# instead of repeating the dataset name as a literal.
# SECURITY NOTE: allow_pickle=True makes np.load unpickle arbitrary objects, which can
# execute code; only load embedding files that were produced by this project.
for text in ['pros', 'cons']:
    sent_data[text]["%s_sent_embedded" % text] = list(
        np.load(
            EMBEDDINGROOT + "%s_%s_%s_sent_embeddings.npy" % (review, text, embedding_type),
            allow_pickle=True,
        )
    )


In [None]:
"""
Find the similarity with the reference data, also already done, so no need to run this, simply load the similarity
"""

# from utils import find_similarity

# # do once and save
# for text in ['pros', 'cons']:
#     for i in final_goals[construct]:
#         sent_data[text]['%d_sim_1' %(i)] = find_similarity(sent_data[text]["%s_sent_embedded" %text].values,
#                                                reference_embeddings[i])
        
# # drop embeddings and save
# for text in ['pros', 'cons']:
#     sent_data[text] = sent_data[text].drop("%s_sent_embedded" %text, axis = 1)
#     sent_data[text].to_csv(DATAROOT+"intermediate/%s_%s_sent_embedded.csv" %(construct, text),
#                            sep = "\t", index = False)        

In [None]:
# Replace each sentence table with the saved version that already carries the
# per-goal similarity columns (tab-separated, one file per review side).
for text in ['pros', 'cons']:
    sent_data[text] = pd.read_csv(
        DATAROOT + "intermediate/%s_%s_sent_embedded.csv" % (construct, text),
        sep="\t",
    )

In [None]:
# one similarity column name ('<goal>_sim_1') per goal index listed for the construct in config['COUNTS']
cols = ['%d_sim_1' %i for i in final_goals[construct]]


In [None]:
# show the similarity column names
cols

In [None]:
# mean of each similarity column over the 'pros' sentences
sent_data['pros'][cols].mean()

In [None]:
# mean of each similarity column over the 'cons' sentences
sent_data['cons'][cols].mean()

In [None]:
# Inspect one reference item: print its text, then list the ten sentences most similar to it.
i = 2
text = 'cons'

print(reference[reference_name].values[i])

# Keep only the first occurrence of each sentence so the ranking has no repeats.
sent_data[text] = sent_data[text].drop_duplicates('%s_sent' % text, keep='first')

(
    sent_data[text]
    .sort_values('%d_sim_1' % i, ascending=False)
    .loc[:, ['%s_sent' % text, '%d_sim_1' % i]]
    .head(10)
    .values
)

In [None]:
#from utils import plot_sim_dist_odi

import numpy as np
import pandas as pd
import timeit
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

from datetime import datetime
from dateutil import parser

from sklearn.preprocessing import MinMaxScaler
from math import e



In [None]:
# display the 'cons' sentence table
sent_data['cons']

In [None]:
from utils import plot_sim_dist_odi
import math

# Three rows in the plot grid; the column count is the goal count divided by three, rounded up.
ncols = -(-len(final_goals[construct]) // 3)

plot_sim_dist_odi(
    sent_data,
    category_shorthands,
    construct,
    reference,
    reference_name,
    nrows=3,
    ncols=ncols,
    which_goals=final_goals[construct],
)

In [None]:
# Keep the 'cons' sentences whose similarity to item 3 lies above the 95th percentile.
data = sent_data['cons'].copy()
threshold = data['3_sim_1'].quantile(0.95)
data_ = data.loc[data["3_sim_1"].gt(threshold)]

In [None]:
# rows above the threshold, total rows, their ratio, and the threshold value itself
len(data_), len(data), len(data_)/len(data), threshold

In [None]:
# What is the average, across all similarity columns, of the 95th-percentile score
# for the 'cons' sentences?
# The review side is named explicitly here: the cell previously read the leftover
# global `text`, so its result depended on which cell had last assigned it.
# A dedicated loop name also avoids overwriting the integer `i` with a column name.
threshold_sum = 0
for sim_col in cols:
    threshold_sum += sent_data['cons'][sim_col].quantile(0.95)
threshold_sum/len(cols)

In [None]:
# FIRST MANUAL VALIDATION: make a table with the top ten reviews and their sim score for each ODI and boredom sentence

In [None]:
# Remove repeated sentences on both review sides, keeping the first occurrence of each.
for text in ['pros', 'cons']:
    sent_data[text] = sent_data[text].drop_duplicates(
        subset=['%s_sent' % text],
        keep='first',
    )

In [None]:
# Collect, for every goal, the TOP_N most similar 'pros' and 'cons' sentences and
# their scores as parallel lists (TOP_N entries per goal in every list).
texts = ['pros', 'cons']
TOP_N = 10  # number of top-ranked sentences kept per goal and review side

categories = []
refs = []
sents = {text: [] for text in texts}
sent_scores = {text: [] for text in texts}

for i in final_goals[construct]:
    sim_col = '%d_sim_1' % i
    categories.extend([construct] * TOP_N)
    refs.extend([reference[reference_name].values[i]] * TOP_N)
    for text in texts:
        # Sort once and take sentences and scores from the same ranked slice,
        # instead of sorting the whole table separately for each of them.
        # Assumes each side has at least TOP_N sentences; otherwise the lists
        # end up with different lengths.
        top_rows = sent_data[text].sort_values(sim_col, ascending=False).head(TOP_N)
        sents[text].extend(top_rows['%s_sent' % text].values)
        sent_scores[text].extend(top_rows[sim_col].values)
    


In [None]:
# lengths of the collected lists; they must agree for the table built in the next cell
len(categories), len(refs), len(sents['pros']), len(sents['cons'])

In [None]:
# Assemble the manual-validation table: one row per (goal, rank) with the reference
# text, the ranked 'cons' sentence and score, and the ranked 'pros' sentence and score.
manual_data = pd.DataFrame({"category": categories,
             "reference text": refs,
             "con" : sents['cons'],
             "con score" : sent_scores['cons'],
             "pro" : sents['pros'],
             "pro score" : sent_scores['pros']}
             )

In [None]:
# show the manual-validation table
manual_data

In [None]:
#manual_data.to_csv(DATAROOT + "intermediate/pre_manual_validation_ODI_BOREDOM_stress_top_10.tsv", sep = "\t", index = False)

In [None]:
# SECOND MANUAL VALIDATION: make a table, for each sentence, with 5 sentences sampled from different threshold bands

In [None]:
# get data by thresholds
# the results are stored as: percentile_dict[text][threshold][simfield] -> list indexed by goal

def shortlist_by_threshold(data, threshold, simfield = '_sim_1', num = 17, width = 0.05):
    """
    Select, for each goal, the rows whose similarity lies inside one threshold band.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one similarity column per goal, named ``str(goal) + simfield``
        for ``goal`` in ``0 .. num - 1``.
    threshold : float
        Lower bound of the band.
    simfield : str
        Suffix of the similarity column names.
    num : int
        Number of goals; columns ``0`` to ``num - 1`` are inspected.
    width : float
        Width of the band; the upper bound is ``threshold + width``.

    Returns
    -------
    list of pandas.DataFrame
        One filtered frame per goal, in goal order. Both bounds are exclusive, so a
        score exactly equal to ``threshold`` or ``threshold + width`` is not selected.

    Raises
    ------
    KeyError
        If one of the expected similarity columns is missing from ``data``.
    """
    lower_threshold = threshold
    upper_threshold = threshold + width
    thresholded_data = []
    # A separate loop name keeps the `num` argument from being shadowed.
    for goal in range(num):
        scores = data[str(goal) + simfield]
        in_band = (scores < upper_threshold) & (scores > lower_threshold)
        thresholded_data.append(data[in_band])
    return thresholded_data

In [None]:
# Lower bounds of the similarity bands. Despite the name, these values are passed to
# shortlist_by_threshold as raw similarity thresholds, not as quantiles.
percentiles = [0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1]
sim1_only = ["_sim_1"]

# Nested layout: percentile_dict[text][percentile][sim] -> list of frames, one per goal.
percentile_dict = {
    side: {lower_bound: {} for lower_bound in percentiles}
    for side in texts
}

for text in ['pros', 'cons']:
    for percentile in percentiles:
        for sim in sim1_only:
            percentile_dict[text][percentile][sim] = shortlist_by_threshold(
                sent_data[text],
                percentile,
                sim,
                num=len(final_goals[construct]),
            )

In [None]:
# For one goal, draw a fixed-size sample of sentences from every threshold band and
# stack the samples into one table per review side with the following columns:
# company_id, sentence, similarity score, upper bound, lower bound.
goal = 0
to_save = {}

SAMPLES_PER_BAND = 5
BAND_WIDTH = 0.05  # mirrors the 0.05 that shortlist_by_threshold adds to the threshold

for text in ['pros', 'cons']:
    sampled_bands = []
    for percentile in percentiles:
        band = percentile_dict[text][percentile]['_sim_1'][goal]
        if len(band) < SAMPLES_PER_BAND:
            # Not enough sentences in this band to sample from; skip it explicitly
            # rather than hiding every possible error behind a bare `except`.
            continue
        # NOTE: the sample is unseeded, so the selected rows differ between runs.
        sample = band.sample(SAMPLES_PER_BAND)[['company_id', text+"_sent", "%d_sim_1" %goal]]
        # .assign returns a new frame, so no column is written onto a slice.
        sampled_bands.append(sample.assign(**{'upper bound': percentile + BAND_WIDTH,
                                              'lower bound': percentile}))
    # DataFrame.append was removed in pandas 2.0; concatenate the samples once instead.
    to_save[text] = pd.concat(sampled_bands) if sampled_bands else pd.DataFrame()
#     to_save[text].to_csv(DATAROOT + "intermediate/manual_validation_by_threshold_%s_%s.csv" %(text, sustainability['Goal'][goal]), 
#                          sep = "\t", index = False)

In [None]:
# sampled 'cons' sentences per threshold band for the goal selected above
to_save['cons']