In [None]:
# Record the interpreter version used to run this notebook.
! python --version

### Import libs

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import word2vec
import multiprocessing
import sklearn
import spacy

import pandas as pd
import random
import numpy as np
import unicodedata
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import logging

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from sklearn.model_selection import StratifiedShuffleSplit
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Record the installed gensim version.
# NOTE(review): later cells read `model_wc.wv.vocab`, which presumably targets gensim < 4.0 — confirm.
print(gensim.__version__)

### Functions to Clean Text

In [None]:
# NLTK's Portuguese stop-word list, customised for this notebook:
# 'não' is taken out of the list (so it is kept in cleaned text) and a few
# extra words are added to it.
stop_words = stopwords.words('portuguese')
stop_words.remove('não')
new_stopwords = ('bom', 'dia', 'ola', 'eu')
stop_words.extend(new_stopwords)

# Small Portuguese spaCy pipeline.
nlp = spacy.load("pt_core_news_sm")

def anonymizer(text, stop_words):
    """Strip identifying tokens from ``text`` and return a cleaned, space-joined string.

    Processing steps, in order:

    1. Fold accents to ASCII and lowercase.
    2. Remove e-mail addresses and ``@handle`` mentions. This runs *before*
       URL removal; otherwise the domain pattern eats ``example.com`` out of
       ``user@example.com`` and the local part ``user`` leaks into the output.
    3. Remove URLs (with a scheme, then bare ``www.``/``name.tld`` forms).
    4. Replace digits and every remaining non-alphanumeric character by a space.
    5. Tokenise with NLTK (Portuguese) and drop stop words.

    Parameters
    ----------
    text : str
        Raw sentence.
    stop_words : iterable of str
        Words to drop. They are accent-folded and lowercased the same way as
        ``text`` so that accented entries (e.g. ``'você'``) still match the
        accent-stripped tokens.

    Returns
    -------
    str
        Remaining tokens joined by single spaces.
    """
    def _fold(value):
        # Decompose accented characters and drop the combining marks / non-ASCII bytes.
        return unicodedata.normalize('NFD', value).encode('ascii', 'ignore').decode('utf-8').lower()

    text = _fold(text)
    # E-mails and handles first (see docstring).
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    # Full URLs with a scheme, including any query string.
    text = re.sub(r'(?:https?|ftp|smtp)://\S+', ' ', text)
    # Bare domains. NOTE: this also matches any "word.word" sequence.
    text = re.sub(r'(?:www\.)?[a-z0-9]+\.[a-z]+(?:/[a-zA-Z0-9#]+/?)*', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'[^a-z0-9 ]+', ' ', text)

    tokens = nltk.tokenize.word_tokenize(text, language='portuguese')
    blocked = {_fold(word) for word in stop_words}
    return ' '.join(token for token in tokens if token not in blocked)

### Análise de score do texto

### Converter as frases em um dataset pandas

In [None]:
# One row per comparison: the same reference question paired with a different paraphrase.
dados = {
    'indices_id': [str(position) for position in range(4)],
    'Frase original': [
        'Olhando para a escala na parede, qual valor indicaria melhor a sua dor hoje?',
    ] * 4,
    'Frases comparativas': [
        'De acordo com a escala de dor ali na parede',
        'qual valor você acha que mais representa a sua dor?',
        'De 0 a 10, qual o nível de intensidade da sua dor atualmente?',
        'Qual a intensidade da sua dor?',
    ],
}

In [None]:
# Build the working frame from the sentence dictionary.
dataset = pd.DataFrame(dados)

In [None]:
# Show the raw sentence pairs.
dataset

In [None]:
# (rows, columns) of the frame.
dataset.shape

In [None]:
# Run both sentence columns through the cleaner with the shared stop-word list.
dataset['frase_original_clean'] = [
    anonymizer(sentence, stop_words) for sentence in dataset['Frase original']
]
dataset['Frases_comparativas_clean'] = [
    anonymizer(sentence, stop_words) for sentence in dataset['Frases comparativas']
]

### Compare the original columns with their cleaned versions

In [None]:
# Raw sentences side by side with their cleaned counterparts.
dataset[['Frase original','frase_original_clean', 'Frases comparativas','Frases_comparativas_clean']]

### Creating tokens and training the Word2Vec model

In [None]:
def build_corpus(data):
    """Tokenise every sentence in ``data`` by splitting on single spaces.

    Parameters
    ----------
    data : iterable of str
        Sentences to split.

    Returns
    -------
    list of list of str
        One token list per input sentence, in input order. Because the split
        is on a literal ``" "``, an empty sentence yields ``['']`` and
        consecutive spaces yield empty-string tokens.
    """
    return [sentence.split(" ") for sentence in data]

def createModel_word2vec(text, size=None, min_count=None, window=None):
    """Train a gensim Word2Vec model on tokenised sentences.

    Only the hyper-parameters the caller actually supplies are forwarded, so
    leaving one as ``None`` falls back to gensim's own default instead of
    passing ``None`` into the constructor.

    Parameters
    ----------
    text : iterable of list of str
        Tokenised sentences.
    size : int, optional
        Embedding dimensionality. Forwarded as ``vector_size`` when the
        installed Word2Vec constructor accepts that keyword, otherwise as
        ``size``.
    min_count : int, optional
        Forwarded to Word2Vec as ``min_count``.
    window : int, optional
        Forwarded to Word2Vec as ``window``.

    Returns
    -------
    The trained ``Word2Vec`` instance.
    """
    import inspect  # local import: only used to detect the constructor's keyword name

    word2vec_cls = gensim.models.word2vec.Word2Vec
    options = {'workers': 10}
    if min_count is not None:
        options['min_count'] = min_count
    if window is not None:
        options['window'] = window
    if size is not None:
        accepted = inspect.signature(word2vec_cls.__init__).parameters
        options['vector_size' if 'vector_size' in accepted else 'size'] = size
    return word2vec_cls(text, **options)

In [None]:
# Token-list versions of the two cleaned sentence columns (one list of words per row).
dataset["Frases_comparativas_clean_wc"] = build_corpus(dataset['Frases_comparativas_clean'])
dataset["frase_original_clean_wc"] = build_corpus(dataset['frase_original_clean'])

In [None]:
# Inspect the token lists.
dataset[['frase_original_clean_wc',"Frases_comparativas_clean_wc"]]

### Concatenate the token columns and convert all words to vectors

In [None]:
# Both operands hold Python lists (built by build_corpus), so `+` joins the two
# token lists of each row into one longer list.
dataset["concat"] = dataset["frase_original_clean_wc"] + dataset["Frases_comparativas_clean_wc"]

In [None]:
# Inspect the combined token lists used as training sentences.
dataset['concat']

### Training model

In [None]:
# Train on the combined token lists. No `size` is passed, and createModel_word2vec
# does not forward `size` to Word2Vec anyway.
model_wc = createModel_word2vec(dataset['concat'],min_count=1,window=10)

In [None]:
# Confirm what createModel_word2vec returned.
type(model_wc)

### Visualize Vocabulary

In [None]:
# Vocabulary mapping held by the model's keyed vectors.
# NOTE(review): `.wv.vocab` presumably requires gensim < 4.0 — confirm against the version printed above.
model_wc.wv.vocab

In [None]:
# Just the vocabulary words, as a list.
list(model_wc.wv.vocab)

### Get similar words

In [None]:
# Nearest neighbours of 'dor' with scores rounded to two decimals.
# Queried through `model_wc.wv`: the model-level `most_similar` shortcut is
# deprecated in gensim 3.x and removed in 4.0.
[(word, round(score, 2)) for word, score in model_wc.wv.most_similar('dor')]

In [None]:
# For every vocabulary word, collect its nearest neighbours (up to 30) and the
# neighbours' embedding vectors. Lookups go through `model_wc.wv` because the
# model-level shortcuts are deprecated in gensim 3.x and removed in 4.0.
keys = list(model_wc.wv.vocab)
embedding_clusters = []
word_clusters = []
for word in keys:
    words = [similar_word for similar_word, _ in model_wc.wv.most_similar(word, topn=30)]
    embeddings = [model_wc.wv[similar_word] for similar_word in words]
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

### Word Embedding

In [None]:
from sklearn.manifold import TSNE

# Stack the clusters into one (n, m, k) array, flatten it to (n * m, k) so
# t-SNE receives one row per neighbour vector, then fold the 2-D projection
# back into (n, m, 2) so rows stay grouped by cluster.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(
    perplexity=10,
    n_components=2,
    init='pca',
    n_iter=3500,
    random_state=32,
)
flat_embeddings = embedding_clusters.reshape(n * m, k)
flat_projection = tsne_model_en_2d.fit_transform(flat_embeddings)
embeddings_en_2d = np.array(flat_projection).reshape(n, m, 2)

### Plot with similar words

In [None]:
import matplotlib.cm as cm

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D word clusters, one colour per cluster, annotating every point.

    Parameters
    ----------
    title : str
        Figure title.
    labels : sequence
        One legend label per cluster.
    embedding_clusters : sequence of 2-D arrays
        For each cluster, an array whose columns 0 and 1 are the x and y coordinates.
    word_clusters : sequence of sequence of str
        For each cluster, the words to annotate, aligned with the coordinate rows.
    a : float
        Alpha (opacity) of the scatter markers.
    filename : str, optional
        When given, the figure is also saved there as a 150-dpi PNG.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Wrap the RGBA row in a list so it is always read as ONE colour for the
        # whole cluster; a bare 4-element row is ambiguous when a cluster has 4 points.
        plt.scatter(x, y, c=[color], alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


# Top-level call: it used to be indented into the function body, where it never
# ran (and would have recursed without end if the function had been called).
tsne_plot_similar_words('Similar words from text', keys, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')

In [None]:
# Set of vocabulary words, used below for membership tests.
w2v_vocab = set(model_wc.wv.vocab)
print("Loaded {} words in vocabulary".format(len(w2v_vocab)))

### Make a similarity matrix for words and visualise

In [None]:
# Pairwise similarity between every two vocabulary words, as a square frame
# labelled by word on both axes.
words = list(model_wc.wv.vocab)
# np.float64 instead of the np.float_ alias, which was removed in NumPy 2.0.
similarities = np.zeros((len(words), len(words)), dtype=np.float64)
for idx1, word1 in enumerate(words):
    for idx2, word2 in enumerate(words):
        # Both words come from the vocabulary, so the lookup cannot miss.
        similarities[idx1, idx2] = model_wc.wv.similarity(word1, word2)

df = pd.DataFrame(similarities, index=words, columns=words)

In [None]:
# Show the word-by-word similarity table.
df

In [None]:
# Heatmap of the similarity table. The mask is set on the upper triangle
# (diagonal included), so each word pair is drawn only once.
f, ax=plt.subplots(1, 1, figsize=(14,8))
cmap = plt.cm.Blues
mask = np.zeros_like(df)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(df, cmap=cmap, mask=mask, square=True, ax=ax)
# Assigning to `_` keeps the return values out of the cell output.
_=plt.yticks(rotation=90)
plt.xlabel('Words')
_=plt.xticks(rotation=45)
_=plt.title("Similarities between words")

### Score each target sentence against the model vocabulary

In [None]:
#target_sentence = "You'd love to drink a cool refreshing Coke"
# Sentences to score: the reference question followed by the four paraphrases.
target_sentence = ["Olhando para a escala na parede, qual valor indicaria melhor a sua dor hoje?",
                  "De acordo com a escala de dor ali na parede",
                  'qual valor você acha que mais representa a sua dor?',
                                'De 0 a 10, qual o nível de intensidade da sua dor atualmente?',
                                'Qual a intensidade da sua dor?']

In [None]:
# use n_similarity to compute a cosine similarity (should be reasonably robust)

# Rank every vocabulary word by n_similarity (cosine) to each target sentence.
sentences = list(model_wc.wv.vocab)  # loop-invariant: computed once

for i in target_sentence:
    # Clean the target with the same pipeline used for the training sentences;
    # a raw split leaves tokens such as "parede," or "dor?" that never match
    # the vocabulary.
    target_sentence_words = [w for w in anonymizer(i, stop_words).split() if w in w2v_vocab]

    # Report the sentence being scored (not the whole list of targets).
    print("Target:", i)
    if not target_sentence_words:
        # n_similarity cannot score an empty word list.
        print("No in-vocabulary words; skipping.")
        continue

    sentences_similarity = np.zeros(len(sentences))
    for idx, sentence in enumerate(sentences):
        # Each "sentence" here is a single vocabulary word.
        sentences_similarity[idx] = model_wc.wv.n_similarity(target_sentence_words, [sentence])

    result = sorted(zip(sentences_similarity, sentences), key=lambda item: item[0], reverse=True)
    print(result)

### Test some word relationships

In [None]:
# Analogy-style query: words close to "acordo" and "melhor" and far from "dor" and "intensidade".
# Queried through `model_wc.wv`: the model-level shortcut is deprecated in gensim 3.x and removed in 4.0.
model_wc.wv.most_similar(positive=["acordo", "melhor"], negative=['dor', 'intensidade'])

### Project a set of words (via their embedding vectors) using t-SNE

In [None]:
from sklearn.manifold import TSNE

# Candidate words: the nearest neighbours of every vocabulary word.
raw_words_of_interest = list(model_wc.wv.vocab)

words_of_interest = []
for woi in raw_words_of_interest:
    for word, _ in model_wc.wv.most_similar(woi):
        words_of_interest.append(word)

# De-duplicate into a sorted list. A fixed order keeps the vector rows aligned
# with the labels (the vectors used to be built from a second, separately
# constructed set) and makes the layout repeatable between runs.
words_of_interest = sorted(set(words_of_interest))

# One row per word, in exactly the order of `words_of_interest`: <words x embedding dim>.
vectors = np.vstack([model_wc.wv[word] for word in words_of_interest])

model = TSNE(n_components=2, perplexity=10, random_state=0)
X_tsne = model.fit_transform(vectors)
df_after_tsne = pd.DataFrame.from_records(X_tsne, columns=['x', 'y'])
df_after_tsne['labels'] = words_of_interest

# Similarity from a target word to every plotted word, clamped to [0, 1], used as the colour.
target_word = "dor"
similarities = []
for woi in words_of_interest:
    similarity = min(max(0, model_wc.wv.similarity(target_word, woi)), 1.0)
    similarities.append(similarity)

# Plot the t-SNE layout; darker words are more similar to the target.
plt.figure(figsize=(12,8))
plt.xlim((min(X_tsne[:,0]), max(X_tsne[:,0])))
plt.ylim((min(X_tsne[:,1]), max(X_tsne[:,1])))
for idx in range(X_tsne.shape[0]):
    x, y = X_tsne[idx]
    label = words_of_interest[idx]
    # Matplotlib reads a string "0.0".."1.0" as a grey level; capped at 0.6 so
    # the least similar words stay legible. float() keeps the string a plain number.
    color = str(float(min(0.6, 1.0 - similarities[idx])))
    # Label passed positionally: the `s=` keyword was renamed to `text` in newer matplotlib.
    plt.annotate(label, xy=(x, y), color=color)
plt.title("Word similarity (T-SNE) using vectors from {} words\nColoured by similarity to '{}'".format(len(words_of_interest), target_word))
# Lay out after the title is set so the two-line title is accounted for.
plt.tight_layout()