# Grouping similar words

In [11]:
import pandas as pd
import numpy as np

In [4]:
emo = pd.read_csv('./out/emotions.csv', low_memory=False)

In [9]:
emo = emo.replace(np.nan:'')
emo[emo['macro'] == ''].shape

(0, 9)

In [10]:
emo['macro'].unique()

array(['Enojo', 'Repulsión', 'Miedo', 'Tristeza', 'Alegría', nan,
       'Sorpresa'], dtype=object)

In [3]:
%%time
from gensim.models.wrappers import FastText
from nltk.stem import PorterStemmer
from gensim import corpora, models

from core.utils import get_closest_vector
from sklearn.cluster import DBSCAN

import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import numpy as np
import unidecode
import umap
import nltk

%load_ext autoreload
%autoreload 2

CPU times: user 13.4 s, sys: 2.67 s, total: 16.1 s
Wall time: 15.1 s


### Reading fast-text bin file

In [4]:
wordvectors_file = './files/fasttext-sbwc.bin'

In [None]:
%%time
wordvector_bin = FastText.load_fasttext_format(wordvectors_file)

### Reading data

In [4]:
emotions = pd.read_excel('./data/emotions.xlsx', engine='openpyxl')

In [5]:
emo_vector = emotions['name'].values

### Initial Clustering

In [6]:
mapper = umap.UMAP()
transform = wordvector_bin[emo_vector]
reduced = mapper.fit_transform(transform)

In [1]:
clustering = DBSCAN(eps=0.3, min_samples=2).fit(reduced)

NameError: name 'DBSCAN' is not defined

In [2]:
fig = px.scatter(x=reduced[:, 0], y=reduced[:, 1], hover_name=emo_vector, color=clustering.labels_)
fig.show()

NameError: name 'px' is not defined

## Similar words

In [12]:
standard_emotions = pd.read_csv('./files/word_emo.csv')

In [13]:
emotions_groups = standard_emotions.groupby('clase')

In [14]:
standard_emotions['clase'].unique()

array(['tristeza', 'alegria', 'enfado', 'amor', 'miedo', 'sorpresa',
       'asco', 'asco '], dtype=object)

In [None]:
wordvector_bin.most_similar_cosmul('amor')

#### Steamming

In [11]:
from nltk import SnowballStemmer
from difflib import get_close_matches
spanishstemmer=SnowballStemmer('spanish')

In [12]:
roots = [spanishstemmer.stem(x) for x in emo_vector]

In [13]:
wordemo['emoroot'] = wordemo['emocion'].apply(lambda x: spanishstemmer.stem(x))

In [None]:
wordemo

In [27]:
labels = []
for r in emo_vector:
    closest = get_close_matches(r, wordemo['emocion'].values, cutoff=0.9, n=1)
    if len(closest)>0:
        labels.append(wordemo[wordemo['emocion'] == closest[0]]['clase'].values[0])
    else:
        labels.append('NR')

In [29]:
df = pd.DataFrame()

df['name'] = emo_vector
df['macro'] = labels

In [33]:
df[df['macro'] == 'NR']

Unnamed: 0,name,macro
7,incertidumbre,NR
11,confuso,NR
15,perder,NR
16,empleo,NR
17,molesto,NR
...,...,...
4071,especialistas,NR
4072,comparacion,NR
4073,meses,NR
4074,saliendo,NR


In [17]:
df2 = df[df['macro'] != 'NR']

In [18]:
mapper = umap.UMAP()
nr_names = df[df['macro'] == 'NR']['name'].values 
nr_names = df['name']

transformed_nr = wordvector_bin[nr_names]
reduced = mapper.fit_transform(transformed_nr)

In [19]:
clustering = DBSCAN(eps=0.2, min_samples=2).fit(reduced)

In [20]:
fig = px.scatter(x=reduced[:, 0], y=reduced[:, 1], hover_name=nr_names, color=df['macro'])#clustering.labels_)
fig.show()

In [21]:
from sklearn.decomposition import KernelPCA

In [22]:
transformer = KernelPCA(n_components=2, kernel='rbf')
reduced_2 = transformer.fit_transform(transformed_nr)

In [23]:
fig = px.scatter(x=reduced_2[:, 0], y=reduced_2[:, 1], hover_name=nr_names, color=clustering.labels_)
fig.show()

#### Characterizing the emotions

In [54]:
pivots = []
names = []
for name, frame in emogroups:
    vector = wordvector_bin[frame['emocion'].values]
    mean = np.mean(vector, 0)
    pivots.append(mean)
    names.append(name)

#### Getting closest pivot vector

In [80]:
rearranged = get_closest_vector(pivots, names, emo_vector, wordvector_bin)

In [84]:
words_by_emo = rearranged.groupby('label')

In [93]:
reduced_spaces = dict()
for k, frame in words_by_emo:
    vector = wordvector_bin[frame['word'].values]
    mapper = umap.UMAP()
    reduced = mapper.fit_transform(vector)
    reduced_spaces[k] = {'embedding':reduced, 'words':frame['word'].values}

In [94]:
reduced_spaces.keys()

dict_keys(['alegria', 'amor', 'asco ', 'enfado', 'miedo', 'sorpresa', 'tristeza'])

In [97]:
re = reduced_spaces['alegria']['embedding']
wo = reduced_spaces['alegria']['words']
fig = px.scatter(x=re[:, 0], y=re[:, 1], hover_name=wo)
fig.show()