In [None]:
import pandas as pd
import numpy as np

import re

import matplotlib.pyplot as plt, mpld3

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import spacy

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize an iterable of documents.

    Every item is coerced with ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``. One token list is yielded per
    input item, in input order.
    """
    for raw_sentence in sentences:
        # deacc=True strips accent marks from tokens; punctuation is already
        # discarded by simple_preprocess's tokenizer regardless of this flag.
        token_list = simple_preprocess(str(raw_sentence), deacc=True)
        yield token_list
        
def remove_stopwords(texts):
    """Drop stop words from every document.

    Each document is passed through ``simple_preprocess(str(doc))`` again and
    every resulting token that appears in the module-level ``stop_words``
    collection is removed.

    Args:
        texts: iterable of documents (token lists or strings).

    Returns:
        list of token lists, one per input document.
    """
    # Build the lookup once per call: testing membership against a list is a
    # linear scan for every single token.
    blocked = frozenset(stop_words)
    return [
        [token for token in simple_preprocess(str(doc)) if token not in blocked]
        for doc in texts
    ]

def make_bigrams(texts, bigram_mod):
    """Apply ``bigram_mod`` to every document by indexing it with the document.

    Returns a list holding ``bigram_mod[doc]`` for each ``doc`` in ``texts``,
    in the same order.
    """
    merged_docs = []
    for token_list in texts:
        merged_docs.append(bigram_mod[token_list])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Tag inventory: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists. Each list is joined with single spaces
            and run through the module-level spaCy pipeline ``nlp``.
        allowed_postags: values of ``token.pos_`` to keep. The default list is
            only read, never mutated.

    Returns:
        list of lemma lists. A document that fails to process is skipped with
        a ``RuntimeWarning``, so the result may be shorter than ``texts``.

    Raises:
        NameError: if ``nlp`` has not been defined before the call.
    """
    import warnings

    # Resolve the pipeline outside the try block: a missing ``nlp`` is a setup
    # error and must not be mistaken for a per-document failure.
    pipeline = nlp

    texts_out = []
    for position, sent in enumerate(texts):
        try:
            doc = pipeline(" ".join(sent))
            lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        except Exception as exc:
            # Stay best-effort (skip the document) but say so instead of
            # silently dropping it.
            warnings.warn(
                "lemmatization: skipping document {}: {!r}".format(position, exc),
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        texts_out.append(lemmas)
    return texts_out


In [None]:
def get_cleaned_data(csv_source_dict):
    """Load text columns from CSV sources and return cleaned, lemmatized tokens.

    Pipeline: read every configured column, strip e-mail-like tokens, collapse
    whitespace, drop single quotes, tokenize, remove stop words, merge
    bigrams, lemmatize.

    Args:
        csv_source_dict: mapping of source name -> dict with the keys
            ``'path_to_csv'`` (path handed to ``pd.read_csv``) and
            ``'text_columns'`` (comma-separated column names).

    Returns:
        The result of ``lemmatization`` on the processed documents
        (a list of lemma lists).

    Raises:
        KeyError: if a named column is missing from its CSV.
    """
    all_texts = []
    for source in csv_source_dict.values():
        source_df = pd.read_csv(source['path_to_csv'])
        for column in source['text_columns'].split(','):
            name = column.strip()
            if not name:
                # Tolerate trailing or doubled commas in the column list.
                continue
            # dropna(): without it, empty cells pass through str() below and
            # inject the literal token "nan" into the corpus.
            all_texts.extend(source_df[name].dropna().unique().tolist())

    # Raw strings: '\S' and '\s' in a normal literal are invalid escape
    # sequences that newer Python versions warn about.
    email_pattern = re.compile(r'\S*@\S*\s?')
    whitespace_pattern = re.compile(r'\s+')

    data = []
    for sent in all_texts:
        cleaned = email_pattern.sub('', str(sent))      # remove e-mail-like tokens
        cleaned = whitespace_pattern.sub(' ', cleaned)  # newlines/tabs -> single space
        cleaned = cleaned.replace("'", "")              # remove distracting single quotes
        data.append(cleaned)

    data_words = list(sent_to_words(data))

    # The phrase model is fitted on the tokens *before* stop-word removal and
    # applied afterwards. Higher threshold -> fewer phrases.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

    # Keep only nouns, adjectives, verbs and adverbs.
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    return data_lemmatized

In [None]:
def check_vocab_for_keywords(keywords, model):
    """Remove keywords that are missing from the model's vocabulary.

    Args:
        keywords: mutable list of keyword strings. It is updated in place, as
            before, and the same list object is returned.
        model: object exposing ``model.wv.index_to_key`` (the vocabulary).

    Returns:
        ``keywords``, now holding only the entries found in the vocabulary,
        in their original order. A message is printed for each removed one.
    """
    vocabulary = set(model.wv.index_to_key)

    # Build the survivors separately: popping from the list while enumerating
    # it skips the element that follows each removal, so consecutive unknown
    # keywords used to slip through.
    kept = []
    for keyword in keywords:
        if keyword in vocabulary:
            kept.append(keyword)
        else:
            print(keyword, "is not in the vocabulary and has been removed from the keyword array")

    keywords[:] = kept
    return keywords

In [None]:
def get_plot_values(keywords, model, plot_type='pca'):
    """Project keyword vectors to two dimensions for plotting.

    Args:
        keywords: keywords to look up via ``model.wv[keyword]``.
        model: object exposing the word vectors as ``model.wv``.
        plot_type: ``'pca'`` (default) or ``'tsne'``.

    Returns:
        ``(x, y, labels)``: two lists of coordinates and the list of keywords,
        all the same length. Three empty lists when ``keywords`` is empty.

    Raises:
        ValueError: if ``plot_type`` is neither ``'pca'`` nor ``'tsne'``.
    """
    # Fail fast: an unknown type used to return empty coordinates next to a
    # non-empty label list.
    if plot_type not in ('pca', 'tsne'):
        raise ValueError("plot_type must be 'pca' or 'tsne', got {!r}".format(plot_type))

    labels = list(keywords)
    if not labels:
        return [], [], []

    tokens = [model.wv[keyword] for keyword in labels]

    if plot_type == 'pca':
        reducer = PCA(random_state=23, n_components=2)
    else:
        # scikit-learn requires perplexity < n_samples; the default of 30
        # fails for small keyword lists, so clamp it. Lists with more than
        # 30 keywords keep the default value.
        perplexity = min(30.0, max(len(labels) - 1, 1))
        reducer = TSNE(random_state=23, n_components=2, perplexity=perplexity)

    plot_values = reducer.fit_transform(tokens)

    x = plot_values[:, 0].tolist()
    y = plot_values[:, 1].tolist()
    return x, y, labels
        

In [None]:
def view_plot_in_notebook(x, y, labels):
    """Draw one labelled scatter point per keyword and show the figure.

    ``x``, ``y`` and ``labels`` are parallel sequences; point ``i`` is drawn
    at ``(x[i], y[i])`` and annotated with ``labels[i]``.
    """
    # Annotation placement shared by every label.
    label_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )

    plt.figure(figsize=(16, 16))
    for idx, x_val in enumerate(x):
        y_val = y[idx]
        plt.scatter(x_val, y_val)
        plt.annotate(labels[idx], xy=(x_val, y_val), **label_style)

    plt.show()

In [None]:
def save_plot_to_html(x, y, labels, tooltip='no', filename='keyword-graph.html'):
    """Render the keyword scatter plot and save it as HTML through mpld3.

    Args:
        x, y: parallel coordinate sequences.
        labels: one label per point.
        tooltip: ``'yes'`` draws a single scatter with a point-label tooltip
            plugin; any other value draws each point separately with a text
            annotation.
        filename: target passed to ``mpld3.save_html``.
    """
    fig, ax = plt.subplots(figsize=(15,7.5))

    if tooltip == 'yes':
        points = ax.scatter(x, y, s=80)
        hover_plugin = mpld3.plugins.PointLabelTooltip(points, labels=labels)
        mpld3.plugins.connect(fig, hover_plugin)
    else:
        for idx, x_val in enumerate(x):
            y_val = y[idx]
            ax.scatter(x_val, y_val)
            label = labels[idx]
            # Text position is given in data coordinates, shifted by the
            # label length on x and by 4 on y.
            ax.annotate(label,
                xy=(x_val, y_val),
                xytext=(x_val - len(label), y_val + 4))

    mpld3.save_html(fig, filename)
    

## Supply the CSV path and the text columns (comma-separated)

In [None]:
# One entry per CSV source. 'text_columns' is a comma-separated list of the
# column names whose text should be loaded from 'path_to_csv'.
csv_source_dict = {
    'source': {
        'path_to_csv': '<path-to-csv>',  # the missing comma here was a SyntaxError
        'text_columns': '<text-column>',
    }
}

# Keywords to look up in the trained model and plot.
keywords = ['keyword1', 'keyword2']




In [None]:
# According to https://stackoverflow.com/questions/54573853/nltk-available-languages-for-stopwords
# As of 2020 these languages are supported:

# 'arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek',
# 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian',
# 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish'

# Replace '<language>' with one of the names above.
# remove_stopwords() reads this module-level variable.
stop_words = stopwords.words('<language>')
# stop_words = stopwords.words('portuguese')


# Official documentation: https://spacy.io/models
# Supporting the following languages:

# 'Chinese', 'Danish', 'Dutch', 'English', 'French', 'German', 'Greek', 'Italian', 'Japanese',
# 'Lithuanian', 'Norwegian Bokmål', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Spanish'

# Replace '<language-model-name>' with a spaCy model name (see the example below).
# lemmatization() reads this module-level variable.
nlp = spacy.load('<language-model-name>')
# nlp = spacy.load('pt_core_news_lg')

In [None]:
# Load the configured CSV columns and run the full cleaning pipeline;
# `texts` is the list of lemmatized token lists it returns.
texts = get_cleaned_data(csv_source_dict)

In [None]:
# Train word vectors on the lemmatized documents.
# NOTE(review): training uses workers=3 and sets no explicit seed, so results
# are presumably not reproducible run-to-run — confirm against the gensim docs.
model = Word2Vec(texts,min_count=1,workers=3,window=3,sg=1)

In [None]:
# Keep only the keywords that exist in the trained model's vocabulary.
keywords = check_vocab_for_keywords(keywords, model)

## The following cell can use PCA instead (pass 'pca') if t-SNE takes up too many resources to run on your local machine.

In [None]:
# Reduce the keyword vectors to 2-D coordinates; pass 'pca' instead of 'tsne' to use PCA.
x, y, labels = get_plot_values(keywords, model, 'tsne')

In [None]:
# Display the labelled scatter plot.
view_plot_in_notebook(x, y, labels)

## You can use the `filename` parameter to change the HTML filename; otherwise the plot is saved as keyword-graph.html

In [None]:
# Save the plot as HTML using the defaults (tooltip='no', filename='keyword-graph.html').
save_plot_to_html(x, y, labels)

## Use the cell below if you want to look at all of the vocabulary in the supplied text

In [None]:
# Print every vocabulary entry together with its position in model.wv.index_to_key.
for index, word in enumerate(model.wv.index_to_key):
    print(index, word)