# Grouping similar words

In [1]:
%%time
from difflib import get_close_matches
from core.utils import get_closest_vector

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

CPU times: user 816 ms, sys: 1.04 s, total: 1.85 s
Wall time: 742 ms


### Reading data

In [12]:
def clean_alt_list(list_):
    """Turn the string form of a token list into a list of raw tokens.

    The argument is converted with ``str``; every ``[``, ``]`` and ``'``
    character is removed and the remainder is split on commas.

    Tokens are returned as-is: they are not stripped, so whitespace that
    followed a comma (or sat inside the quotes) is preserved.
    """
    # One translation table drops the three wrapper characters in a single pass
    unwanted = str.maketrans('', '', "[]'")
    return str(list_).translate(unwanted).split(',')

In [13]:
# Load the survey responses; low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk
emotions = pd.read_csv(filepath_or_buffer='./data/emotions.csv', low_memory=False)

In [14]:
# The token lists are stored as text in the CSV; turn each cell back into a Python list
emotions['name_tokens'] = emotions['name_tokens'].apply(clean_alt_list)

### Identifying emotions

The final `emo_vector_unicode` has a lot of words which are not related to emotions or sentiments. This is because some entries in the survey were filled in with free text. In an attempt to capture all possible emotions, we split that free text into single words (assuming that any of them could be a potential emotion).

Now it is time to discriminate between them. We use the [Spanish Emotion Lexicon](http://www.cic.ipn.mx/~sidorov/#SEL) (SEL) to filter words.

In [15]:
# Load the Spanish Emotion Lexicon (SEL) workbook
SEL_df = pd.read_excel(io='./files/SEL.xlsx', engine='openpyxl')

# Lexicon words, used below as the candidates for fuzzy matching
standard_emotions = SEL_df['Palabra']

# Display the distinct emotion categories present in the lexicon
SEL_df['Categoría'].unique()

array(['Alegría', 'Enojo', 'Miedo', 'Aversión', 'Sorpresa', 'Tristeza'],
      dtype=object)

In order to find matches between survey words and the standard ones, we use `get_close_matches`, which compares words using a similarity criterion.

In [16]:
%%time
emo_vector_matched = []
final_category = []
# iterate over the list of tokens
for word in emotions['name_tokens']:
    closest = []
    categories = []
    # for each word within the list (some people wrote sentences instead of a single word)
    for w in word:
        # get the closest emotion from SEL dictonary
        closest_word = get_close_matches(w.strip(), standard_emotions,n=1, cutoff=0.7)
        if closest_word != []:
            # if we match some emotion then save its category
            cat = SEL_df[SEL_df['Palabra']==closest_word[0]]['Categoría']
            categories.append(cat.values[0])
            closest.append(closest_word[0])
        else:
            continue
    
    # at the end of the process... check if the response has a category
    if closest == []:
        final_category.append('')        
        emo_vector_matched.append('')
    else:
        final_category.append(categories[0])        
        emo_vector_matched.append(closest[0])

CPU times: user 13min 11s, sys: 278 ms, total: 13min 11s
Wall time: 13min 11s


In [17]:
# Share of responses for which no SEL emotion was matched.
# Computed from `final_category` ('' marks "no match") rather than from
# emotions['macro']: that column is only assigned in a later cell, and a
# 'macro' column loaded from CSV would hold NaN for empty cells, never ''.
uncategorized = sum(category == '' for category in final_category)
print('Non categorized values: {:.1f} %'.format(uncategorized / len(final_category) * 100))

Non categorized values: 0.0 %


In [18]:
# Store the SEL category found for each response in the 'macro' column
# (final_category has one entry per row; '' means no lexicon word matched)
emotions['macro'] = final_category

In [20]:
# Eyeball three random rows to check the assigned categories
emotions.sample(n=3)

Unnamed: 0,id,diag_id,ind_id,name,name_tokens,macro,exp,exp_tokens,is_online
50738,50738,enc_u_4560273938808714862,,ayudar y defender,"[ayudar, defender]",Aversión,ayudar a los derechos de los demas. defender ...,"['ayudar', 'derechos', 'demas', 'defender', 'v...",True
9324,9324,enc_u_5222441,,molestos,[molestos],Enojo,"ya que las autoridades prometen cosas, que des...","['autoridades', 'prometen', 'cosas', 'despues'...",True
72309,72309,,4255586.0,invadida,[invadida],Enojo,"con los disturbios, pesimo","['disturbios', 'pesimo']",False


In [21]:
# Persist the categorized responses, leaving the DataFrame index out of the file
emotions.to_csv(path_or_buf='./out/emotions_2_sept.csv', index=False)

## TO CSV

In [11]:
%%time
emotions = emotions.to_csv('./out/emotions.csv', index=False)

CPU times: user 326 ms, sys: 8 ms, total: 334 ms
Wall time: 333 ms


In [22]:
# Read the exported file back as a sanity check.
# low_memory=False matches the initial load: dtypes are inferred from the
# whole file, which avoids the mixed-type DtypeWarning of chunked parsing.
pd.read_csv('./out/emotions_2_sept.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,diag_id,ind_id,name,name_tokens,macro,exp,exp_tokens,is_online
0,0,enc_u_4602780640112847056,,rabia,['rabia'],Enojo,por el abandono del gobierno a su gente,"['abandono', 'gobierno', 'gente']",True
1,1,enc_u_4602778880117363308,,intranquilidad,['intranquilidad'],Aversión,porque uno no sabe lo que viene mas adelante,"['sabe', 'viene', 'mas', 'adelante']",True
2,2,enc_u_4602777200118509403,,incertidumbre,['incertidumbre'],Miedo,se encontraba en santiago y no sabia si podia ...,"['encontraba', 'santiago', 'sabia', 'si', 'pod...",True
3,3,enc_u_4602666791193242253,,confuso,['confuso'],Miedo,"por inestabilidad laboral, economica, y social...","['inestabilidad', 'laboral', 'economica', 'soc...",True
4,4,enc_u_4602651983012551467,,rabia/impotencia,"['rabia', ' impotencia']",Enojo,por funcionamiento de los servicios ya que se ...,"['funcionamiento', 'servicios', 'paralizan', '...",True
...,...,...,...,...,...,...,...,...,...
83740,83740,,0509e2ef02a033fc0efab1214e68be09,triste,['triste'],Tristeza,"la situacion actual solo suma tristeza, sin im...","['situacion', 'actual', 'solo', 'suma', 'trist...",True
83741,83741,,c4960481a3875eb1cfbe32c33dc362c0,miedo,['miedo'],Miedo,la posibilidad que le pase algo a mi familia e...,"['posibilidad', 'pase', 'familia', 'alta']",True
83742,83742,,2eac9b65b2b1ec5d134d6b45c83315fe,triste,['triste'],Tristeza,,,True
83743,83743,,ddc84f9d6d278cf3d32a0df820e01688,triste,['triste'],Tristeza,triste de ver como el pais donde creci se quema,"['triste', 'ver', 'pais', 'creci', 'quema']",True
