In [1]:
import pandas as pd
import os
import numpy as np
import openai
import matplotlib.pyplot as plt
openai.api_key = os.getenv('OPENAI_API_KEY')
from openai.embeddings_utils import cosine_similarity
%matplotlib inline

In [2]:
#Read data with embeddings
mylist = []
for chunk in  pd.read_csv('E:\Masters backup\data_with_embeddings.csv', chunksize=20000, index_col=0):
    mylist.append(chunk)

big_data = pd.concat(mylist, axis= 0)
del mylist

In [3]:
big_data.head()

Unnamed: 0,text,Company,year,n_tokens,embeddings
0,Augmented reality makes the fascination of Por...,Porsche,2016,75,"[-0.004225385375320911, 0.01583165116608143, 0..."
1,"and encourage you, to confront, challenge and ...",Porsche,2016,89,"[0.001987969968467951, -0.019381854683160782, ..."
2,"For inspiration, consider the information re g...",Porsche,2016,45,"[-0.006319205742329359, 0.0064370157197117805,..."
3,The second volume is called Performance and co...,Porsche,2016,49,"[-0.0010906282113865018, -0.025131287053227425..."
4,"The two volumes may be different, but they hav...",Porsche,2016,54,"[0.009442666545510292, 0.004147251136600971, 0..."


In [5]:
df=pd.read_csv('..\Well_formed\\formness_data.csv', index_col=0)
df.head()

Unnamed: 0,text,Company,year,Label,sim_score,Formness_score
0,Augmented reality makes the fascination of Por...,Porsche,2016,E1,0.721638,0.671054
1,"and encourage you, to confront, challenge and ...",Porsche,2016,G1,0.802026,0.1776
2,"For inspiration, consider the information re g...",Porsche,2016,G1,0.779666,0.17696
3,The second volume is called Performance and co...,Porsche,2016,E1,0.797681,0.474855
4,"The two volumes may be different, but they hav...",Porsche,2016,E1,0.750238,0.359458


In [7]:
df["embeddings"]=big_data["embeddings"]
df.head()

Unnamed: 0,text,Company,year,Label,sim_score,Formness_score,embeddings
0,Augmented reality makes the fascination of Por...,Porsche,2016,E1,0.721638,0.671054,"[-0.004225385375320911, 0.01583165116608143, 0..."
1,"and encourage you, to confront, challenge and ...",Porsche,2016,G1,0.802026,0.1776,"[0.001987969968467951, -0.019381854683160782, ..."
2,"For inspiration, consider the information re g...",Porsche,2016,G1,0.779666,0.17696,"[-0.006319205742329359, 0.0064370157197117805,..."
3,The second volume is called Performance and co...,Porsche,2016,E1,0.797681,0.474855,"[-0.0010906282113865018, -0.025131287053227425..."
4,"The two volumes may be different, but they hav...",Porsche,2016,E1,0.750238,0.359458,"[0.009442666545510292, 0.004147251136600971, 0..."


In [8]:
del big_data

In [14]:
# Convert list of strings embeddings into list of floats
def list_to_float(embedding):
    return [float(x.strip(' []')) for x in embedding.split(',')]

In [10]:
df=df[df["Formness_score"]>=0.4].reset_index(drop=True)
df["embeddings"]=df["embeddings"].apply(list_to_float)

In [16]:
# For each label select 1000 entries with highest sim_score
df_E1=df[df["Label"]=="E1"].reset_index(drop=True)
df_E1=df_E1.nlargest(1000,"sim_score").reset_index(drop=True)
df_E2=df[df["Label"]=="E2"].reset_index(drop=True)
df_E2=df_E2.nlargest(1000,"sim_score").reset_index(drop=True)
df_E3=df[df["Label"]=="E3"].reset_index(drop=True)
df_E3=df_E3.nlargest(1000,"sim_score").reset_index(drop=True)
df_E4=df[df["Label"]=="E4"].reset_index(drop=True)
df_E4=df_E4.nlargest(1000,"sim_score").reset_index(drop=True)
df_E5=df[df["Label"]=="E5"].reset_index(drop=True)
df_E5=df_E5.nlargest(1000,"sim_score").reset_index(drop=True)
df_S1=df[df["Label"]=="S1"].reset_index(drop=True)
df_S1=df_S1.nlargest(1000,"sim_score").reset_index(drop=True)
df_S2=df[df["Label"]=="S2"].reset_index(drop=True)
df_S2=df_E1.nlargest(1000,"sim_score").reset_index(drop=True)
df_S3=df[df["Label"]=="S3"].reset_index(drop=True)
df_S3=df_S3.nlargest(1000,"sim_score").reset_index(drop=True)
df_S4=df[df["Label"]=="S4"].reset_index(drop=True)
df_S4=df_S4.nlargest(1000,"sim_score").reset_index(drop=True)
df_G1=df[df["Label"]=="G1"].reset_index(drop=True)
df_G1=df_G1.nlargest(1000,"sim_score").reset_index(drop=True)

In [17]:
# finds centroid of list of embeddings
def find_centroid(embeddings):
    centroid=[]
    coordinate=0
    for j in range(0,len(embeddings[0])):
        for i in range(0,len(embeddings)):
            coordinate+=embeddings[i][j]
        coordinate=coordinate/len(embeddings)
        centroid.append(coordinate)
    return centroid

In [18]:
# Find centroid for each label
E1_centroid=find_centroid(df_E1.embeddings)
E2_centroid=find_centroid(df_E2.embeddings)
E3_centroid=find_centroid(df_E3.embeddings)
E4_centroid=find_centroid(df_E4.embeddings)
E5_centroid=find_centroid(df_E5.embeddings)
S1_centroid=find_centroid(df_S1.embeddings)
S2_centroid=find_centroid(df_S2.embeddings)
S3_centroid=find_centroid(df_S3.embeddings)
S4_centroid=find_centroid(df_S4.embeddings)
G1_centroid=find_centroid(df_G1.embeddings)

In [19]:
centroid_list=[E1_centroid,E2_centroid,E3_centroid,E4_centroid,E5_centroid,S1_centroid,S2_centroid,S3_centroid,S4_centroid,G1_centroid]
labels_list=["E1","E2","E3","E4","E5","S1","S2","S3","S4","G1"]

In [21]:
# function takes similarity scores and returns sorted list of labels and coresponding scores
def sorted_embedings(sim):
    results_list=[]       
    for i in range(0,10):
        results_list.append([labels_list[i],sim[i]])
    results_list.sort(key = lambda i: i[1],reverse = True)
    return results_list

In [22]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [23]:
#find cosine similarity to each label centroid
centroid_labeling=[]
for i in range(0,len(df.embeddings)):
    sim = [cosine_similarity(df.embeddings.at[i], j) for j in centroid_list]
    centroid_labeling.append(sorted_embedings(sim))
    if i % 1000 == 0:
        logging.info("Rows done= {a} , {c} % done ".format(a=i,c=100*i/(len(df))))

INFO:root:Rows done= 0 , 0.0 % done 


INFO:root:Rows done= 1000 , 1.0578428468666694 % done 
INFO:root:Rows done= 2000 , 2.115685693733339 % done 
INFO:root:Rows done= 3000 , 3.1735285406000084 % done 
INFO:root:Rows done= 4000 , 4.231371387466678 % done 
INFO:root:Rows done= 5000 , 5.289214234333348 % done 
INFO:root:Rows done= 6000 , 6.347057081200017 % done 
INFO:root:Rows done= 7000 , 7.404899928066686 % done 
INFO:root:Rows done= 8000 , 8.462742774933355 % done 
INFO:root:Rows done= 9000 , 9.520585621800025 % done 
INFO:root:Rows done= 10000 , 10.578428468666695 % done 
INFO:root:Rows done= 11000 , 11.636271315533364 % done 
INFO:root:Rows done= 12000 , 12.694114162400034 % done 
INFO:root:Rows done= 13000 , 13.751957009266704 % done 
INFO:root:Rows done= 14000 , 14.809799856133372 % done 
INFO:root:Rows done= 15000 , 15.867642703000042 % done 
INFO:root:Rows done= 16000 , 16.92548554986671 % done 
INFO:root:Rows done= 17000 , 17.98332839673338 % done 
INFO:root:Rows done= 18000 , 19.04117124360005 % done 
INFO:root:R

In [24]:
df["centroid_label"]=centroid_labeling

In [25]:
# extract top label and top score
centriod_labeling_top_label=[]
centriod_labeling_top_score=[]
for i in range(0,len(centroid_labeling)):
    centriod_labeling_top_label.append(centroid_labeling[i][0][0])
    centriod_labeling_top_score.append(centroid_labeling[i][0][1])

In [26]:
df["centriod_top_label"]=centriod_labeling_top_label
df["centriod_labeling_top_score"]=centriod_labeling_top_score

In [27]:
df.head()

Unnamed: 0,text,Company,year,Label,sim_score,Formness_score,embeddings,centroid_label,centriod_top_label,centriod_labeling_top_score
0,Augmented reality makes the fascination of Por...,Porsche,2016,E1,0.721638,0.671054,"[-0.004225385375320911, 0.01583165116608143, 0...","[[S3, 0.8203358062811105], [S4, 0.817445893605...",S3,0.820336
1,The second volume is called Performance and co...,Porsche,2016,E1,0.797681,0.474855,"[-0.0010906282113865018, -0.025131287053227425...","[[S4, 0.8579721080134736], [S3, 0.854925281773...",S4,0.857972
2,"For the first time, Porsche has published a co...",Porsche,2016,E1,0.781439,0.600305,"[-0.004911450203508139, -0.010176289826631546,...","[[S4, 0.863545253905563], [S3, 0.8587255541630...",S4,0.863545
3,"Over the course of the last financial year, Po...",Porsche,2016,G1,0.746018,0.455402,"[-0.006464285310357809, -0.004015290178358555,...","[[S4, 0.8436447845652316], [S3, 0.839527532175...",S4,0.843645
4,power of new technology. New variants of the 9...,Porsche,2016,E1,0.752404,0.439552,"[-0.0030389458406716585, -0.02243526093661785,...","[[E4, 0.8275159167501034], [S3, 0.823429067262...",E4,0.827516


In [37]:
df = df.rename(columns={'Label': 'adda_Label', 'centroid_label': 'cnetroid_label_list', 'centriod_top_label': 'centroid_label','centriod_labeling_top_score': 'centroid_score'})
df.head()

Unnamed: 0,text,Company,year,adda_Label,sim_score,Formness_score,embeddings,cnetroid_label_list,centroid_label,centroid_score
0,Augmented reality makes the fascination of Por...,Porsche,2016,E1,0.721638,0.671054,"[-0.004225385375320911, 0.01583165116608143, 0...","[[S3, 0.8203358062811105], [S4, 0.817445893605...",S3,0.820336
1,The second volume is called Performance and co...,Porsche,2016,E1,0.797681,0.474855,"[-0.0010906282113865018, -0.025131287053227425...","[[S4, 0.8579721080134736], [S3, 0.854925281773...",S4,0.857972
2,"For the first time, Porsche has published a co...",Porsche,2016,E1,0.781439,0.600305,"[-0.004911450203508139, -0.010176289826631546,...","[[S4, 0.863545253905563], [S3, 0.8587255541630...",S4,0.863545
3,"Over the course of the last financial year, Po...",Porsche,2016,G1,0.746018,0.455402,"[-0.006464285310357809, -0.004015290178358555,...","[[S4, 0.8436447845652316], [S3, 0.839527532175...",S4,0.843645
4,power of new technology. New variants of the 9...,Porsche,2016,E1,0.752404,0.439552,"[-0.0030389458406716585, -0.02243526093661785,...","[[E4, 0.8275159167501034], [S3, 0.823429067262...",E4,0.827516


In [38]:
df.to_csv("centroid_data.csv",columns=["text","Company","year","adda_Label","sim_score","centroid_label","centroid_score","Formness_score"])