In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.plotly as py
import plotly.tools as tls

In [None]:
# Location of the source CSV export.
# NOTE(review): this is an absolute path on one machine; consider making it configurable.
DATA_PATH = "//Users/carolinetroude/Desktop/DEFINITIVE-BDD-DCJ.csv"

# The file is semicolon-delimited and UTF-8 encoded; it is parsed with the Python engine.
data = pd.read_csv(
    DATA_PATH,
    sep=";",
    encoding="utf-8",
    engine="python",
)

In [None]:
# Preview the first 50 rows of the loaded table.
data.head(50)

In [None]:
# Load the English stop-word list from the stop_words package.
# clean_text reads it through the notebook-level name stop_en.
from stop_words import get_stop_words

stop_en = get_stop_words('en')
# Display the list so the filtered words can be inspected.
print(stop_en)

In [None]:
# Erase rows with missing comments.
# Note the flag's polarity: "VALID" is True where COMMENT is *missing*
# (it is the result of isnull()), so the rows kept are those where it is False.
data["VALID"] = data["COMMENT"].isnull()
# .copy() makes the filtered frame independent of the original, so that adding
# columns to it later (e.g. CLEANED_TEXT) does not raise SettingWithCopyWarning.
data = data[~data["VALID"]].copy()

In [None]:
# Sanity check: the distinct Python types found in the COMMENT column.
{type(comment) for comment in data["COMMENT"]}

In [None]:
#function to clean the text
def clean_text(x, stop_words=None):
    """Lower-case a comment and split it into filtered word tokens.

    Args:
        x: Comment text (a ``str``).
        stop_words: Optional iterable of words to discard. When omitted,
            the notebook-level ``stop_en`` list is used, which matches the
            behaviour of the one-argument call.

    Returns:
        The whitespace-separated, lower-cased tokens of ``x`` in their
        original order, without stop words and without one-character tokens.
    """
    if stop_words is None:
        stop_words = stop_en
    # Set membership avoids rescanning the stop-word list for every token.
    excluded = frozenset(stop_words)
    return [mot for mot in x.lower().split() if mot not in excluded and len(mot) > 1]

# Tokenise every comment: CLEANED_TEXT holds the word list returned by clean_text.
data["CLEANED_TEXT"] = data["COMMENT"].apply(clean_text)

In [None]:
# Show the first cleaned comments of each GRADE group (head() defaults to 5 rows per group).
data.groupby(["GRADE"])["CLEANED_TEXT"].head()

In [None]:
# Preview the comments of each GRADE concatenated into one text.
# Comments are joined with a space so that the last word of one comment
# does not fuse with the first word of the next.
data.groupby(["GRADE"])["COMMENT"].apply(lambda x: " ".join(x)).reset_index()

In [None]:
# Build one row per GRADE: COMMENT is the concatenation of all comments with
# that grade, CLEANED_TEXT is the token list clean_text extracts from it.
# The separator is a space; joining with "" glued the last word of each comment
# to the first word of the next, producing bogus tokens.
commentaires_par_grade = data.groupby(["GRADE"])["COMMENT"].apply(lambda x: " ".join(x)).reset_index()
commentaires_par_grade["CLEANED_TEXT"] = commentaires_par_grade["COMMENT"].apply(clean_text)

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Text to draw: the concatenated comments stored at index label 0 of
# commentaires_par_grade.
cloud_text = commentaires_par_grade["COMMENT"][0]

# Rendering options: the package's built-in stop-word set, a white
# background and a 1700 x 1000 canvas.
cloud_options = dict(
    stopwords=STOPWORDS,
    background_color='white',
    width=1700,
    height=1000,
)
wordcloud = WordCloud(**cloud_options).generate(cloud_text)

# Show the generated cloud as an image, without axes.
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Array of per-comment token lists (the CLEANED_TEXT column); input for the topic model.
sentences = data["CLEANED_TEXT"].values

In [None]:
#create LDA model using 30 topics
import gensim

# Map every distinct token to an integer id.
dictionary = gensim.corpora.Dictionary(sentences)
# Bag-of-words representation of each comment: a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(sentence) for sentence in sentences]
# random_state fixes the model's random initialisation so that topic numbering
# and topic contents are reproducible from one run of the notebook to the next.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=30,
    random_state=42,
)

In [None]:
# Print every topic of the model with its 10 highest-weighted words.
# The loop bound comes from the model itself so it stays in sync with the
# num_topics value used at training time.
for i in range(lda.num_topics):
    print("**** Topic ",i," *****")
    print(lda.print_topic(i, topn=10))

In [None]:
# Topic distribution of every comment: one list of (topic_id, probability) pairs per document.
# `corpus` already holds dictionary.doc2bow(sentence) for each sentence, in the same order,
# so it is reused here instead of converting every comment a second time.
topic_distrib = [lda.get_document_topics(bow) for bow in corpus]

In [None]:
# Dense document x topic matrix built from the sparse per-document topic lists.
nbre_docs = len(topic_distrib)

# NOTE(review): the matrix is 35 columns wide while the LDA model above is trained
# with num_topics=30, so the last 5 columns stay at zero. The width is kept as-is
# because the heat-map cell indexes 35 columns; confirm and align both.
doc_topic_matrix = np.zeros((nbre_docs, 35))

# Topics missing from a document's list keep the initial value 0.
for doc_row, doc_topics in enumerate(topic_distrib):
    for topic_id, proba in doc_topics:
        doc_topic_matrix[doc_row, topic_id] = proba

In [None]:
#create the coordinates using t-SNE

# `sample` was used without ever being imported, which raises NameError on a fresh kernel.
from random import sample

from bhtsne import tsne

# Draw document row indices at random, without replacement. The requested size is
# capped at the number of documents because random.sample raises ValueError when
# asked for more items than the population contains.
select_idx = sample(range(nbre_docs), min(19522, nbre_docs))

# Project the topic vectors of the selected documents; columns 0 and 1 of the
# result are used as plot coordinates further down.
tsne_proj = tsne(doc_topic_matrix[select_idx,:],perplexity=30.0)

In [None]:
# Pair each projected point with the comment it was computed from.
# select_idx holds row positions, hence the positional .iloc lookup.
sampled_comments = data.iloc[select_idx]["COMMENT"].values
proj_df = pd.DataFrame(
    data={
        "X_TSNE": tsne_proj[:, 0],
        "Y_TSNE": tsne_proj[:, 1],
        "COMMENT": sampled_comments,
    }
)

In [None]:
#plot the heat map

import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
%matplotlib inline

for num_topic in range(35):
    print ("**** Topic ", num_topic,np.mean(doc_topic_matrix[:,num_topic]))
    colormap = get_cmap('inferno')

    plt.style.use('ggplot')

    plt.figure(figsize=(5,5))
    plt.scatter(tsne_proj[:,0],tsne_proj[:,1],s=10,c=[colormap(x/np.max(doc_topic_matrix[select_idx,num_topic])) for x in doc_topic_matrix[select_idx,num_topic]])
    plt.show()