In [1]:
import pandas as pd
import os
import re
import spacy
from gensim.models.phrases import Phrases, Phraser
from time import time 
import multiprocessing
from gensim.models import Word2Vec
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import scale
import keras 
from keras.models import Sequential, Model 
from keras import layers
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
from keras.layers.merge import Concatenate
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt

Using TensorFlow backend.


In [2]:
# Load the scraped French-Wikipedia pages; the preview below shows the
# Name, Link and Content columns of the first rows.
csv_path = "../data/Suicides_wikipedia_scrapping_fr.csv"
data = pd.read_csv(csv_path, sep=",", encoding="utf-8-sig")
data.head()

Unnamed: 0,Name,Link,Content
0,Adolfo_Couve,https://fr.wikipedia.org/wiki/Adolfo_Couve,"['id=""Biographie"">Biographie[modifier', '|', '..."
1,Boaz_Arad,https://fr.wikipedia.org/wiki/Boaz_Arad,"['id=""Biographie"">Biographie[modifier', '|', '..."
2,Ralph_Barton,https://fr.wikipedia.org/wiki/Ralph_Barton,"['id=""Biographie"">Biographie[modifier', '|', '..."
3,Mark_Campos,https://fr.wikipedia.org/wiki/Mark_Campos,"['id=""Biographie"">Biographie[modifier', '|', '..."
4,Michel_Caron_(artiste),https://fr.wikipedia.org/wiki/Michel_Caron_(ar...,"['id=""Biographie"">Biographie[modifier', '|', '..."


In [4]:
# Flatten every scraped `Content` value into one plain-text string.
# Judging by the preview of `data`, each value appears to be the string repr of
# a token list ("['tok', 'tok', ...]"), which is why the commas are turned into
# the spaces that separate words again at the end.
#
# The patterns are raw strings: '\S' and '\s' in a normal string literal are
# invalid escape sequences (DeprecationWarning, SyntaxWarning on recent Pythons).
EMAIL_PATTERN = re.compile(r'\S*@\S*\s?')
WHITESPACE_PATTERN = re.compile(r'\s+')

Biography = []
for sent in data.Content.values.tolist():
    # Remove e-mail-like tokens (anything around an "@").
    sent = EMAIL_PATTERN.sub('', sent)
    # Remove ALL whitespace (spaces, tabs, new lines) - not only new lines.
    sent = WHITESPACE_PATTERN.sub('', sent)
    # Remove distracting single quotes.
    sent = sent.replace("'", "")
    # Replace the commas with spaces.
    sent = sent.replace(",", " ")
    Biography.append(sent)

In [5]:
# Drop a fixed-width prefix and suffix from every cleaned biography.
# A slice replaces the former character-by-character concatenation: same
# result, linear time, and the two offsets are named instead of being inline
# magic numbers.
# NOTE(review): the offsets are fixed for all rows; the preview of the result
# shows some rows starting mid-word, so the prefix length probably does not fit
# every article - confirm against the scraped data.
PREFIX_LEN = 69   # characters removed at the start of each text
SUFFIX_LEN = 7    # characters removed at the end of each text

Biography_clean_brackets = [
    # max(..., 0) keeps very short texts mapping to '' rather than letting a
    # negative end index wrap around.
    text[PREFIX_LEN:max(len(text) - SUFFIX_LEN, 0)]
    for text in Biography
]

In [6]:
# Spot-check a single trimmed biography to eyeball the cleaning result.
sample_row = 2
Biography_clean_brackets[sample_row]

' Kansas City (Missouri)  né "dun" père avocat et "dune" mère artiste-peintre  Ralph Barton se révèle très jeune doué pour le dessin.Il débute comme caricaturiste pour The Kansas City Star et The Kansas City Journal-Post  puis abandonne ses études  et part pour Chicago en 1909 tenter "lInstitut" "dart" de Chicago. Il renonce à vivre dans cette ville et revient à Kansas City pour se remarier avec Marie Jennings  sa première épouse.En 1910  il commence à contribuer au magazine satirique Puck ; il déménage à New York avec sa femme et leur fille  poursuit son travail pour Puck et "McCalls." Il loue ensuite un atelier "quil" partage avec le peintre Thomas Hart Benton "cest" Benton qui sert de premier modèle aux caricatures de Barton qui vont connaître un certain succès.En 1915  Puck décide "denvoyer" Barton en mission de reportage dans le Paris en guerre il "sattache" au pays  à la Ville lumière  et durant toute sa vie  y fera de fréquents séjours. Le 30 juin 1927  il est nommé chevalier de

In [8]:
# Wrap the trimmed texts in a one-column DataFrame. No column name is given,
# so the text column is addressed as `df_clean[0]` in the following cells.
df_clean = pd.DataFrame(data=Biography_clean_brackets)
df_clean.head(n=5)

Unnamed: 0,0
0,""" Couve Braga et Clemencia Rioseco Fernández ..."
1,stique[modifier | modifier le code]Artiste mul...
2,"Kansas City (Missouri) né ""dun"" père avocat ..."
3,est vide insuffisamment détaillée ou incompl...
4,Michel Caron veut devenir comédien. Il entre...


In [12]:
t = time()

# Make sure the text column holds str values before it is handed to the tokenizer.
df_clean[0] = df_clean[0].astype('str')
# Tokenizer whose tokens are the runs of word characters matched by \w+.
tokenizer = RegexpTokenizer(r'\w+')
# Not the last expression of the cell, so this value is not displayed.
df_clean.dtypes

# NOTE(review): nothing is tokenized in this cell (that happens in the next
# one); the timing below only covers the setup above.
elapsed_mins = round((time() - t) / 60, 2)
print('Time to tokenize everything: {} mins'.format(elapsed_mins))

Time to tokenize everything: 0.0 mins


In [13]:
t = time()

# Tokenize every biography and keep the token list next to the source text.
text_column = df_clean[0]
df_clean["tokens"] = text_column.apply(tokenizer.tokenize)
# TODO: stop words are not removed yet.

elapsed_mins = round((time() - t) / 60, 2)
print('Time to tokenize everything: {} mins'.format(elapsed_mins))
df_clean.head()

Time to tokenize everything: 0.0 mins


Unnamed: 0,0,tokens
0,""" Couve Braga et Clemencia Rioseco Fernández ...","[Couve, Braga, et, Clemencia, Rioseco, Fernánd..."
1,stique[modifier | modifier le code]Artiste mul...,"[stique, modifier, modifier, le, code, Artiste..."
2,"Kansas City (Missouri) né ""dun"" père avocat ...","[Kansas, City, Missouri, né, dun, père, avocat..."
3,est vide insuffisamment détaillée ou incompl...,"[est, vide, insuffisamment, détaillée, ou, inc..."
4,Michel Caron veut devenir comédien. Il entre...,"[Michel, Caron, veut, devenir, comédien, Il, e..."


In [14]:
# Number of CPU cores of this machine; used to size the training thread pool.
cores = multiprocessing.cpu_count()

# Untrained Word2Vec model (vocabulary and training happen in the next cells).
w2v_model = Word2Vec(min_count=20,       # ignore words seen fewer than 20 times
                     window=2,           # context words considered on each side
                     size=300,           # dimensionality of the word vectors
                     sample=6e-5,        # down-sampling threshold for frequent words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate reached at the end of training
                     negative=20,        # negative samples drawn per positive example
                     # Leave one core free, but never request 0 workers: on a
                     # single-core machine `cores - 1` would start no training
                     # thread at all.
                     workers=max(1, cores - 1))

In [15]:
t = time()
# Scan the tokenized biographies once to build the model vocabulary.
token_lists = df_clean["tokens"]
w2v_model.build_vocab(token_lists, progress_per=10000)
elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

Time to build vocab: 0.0 mins


In [16]:
t = time()
# Train on the same tokenized corpus that was used to build the vocabulary.
token_lists = df_clean["tokens"]
w2v_model.train(
    token_lists,
    total_examples=w2v_model.corpus_count,
    epochs=5,
    report_delay=1,
)
elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

Time to train the model: 0.06 mins


In [22]:
# Sanity check of the embedding: words ranked closest to "meurt".
query_words = ["meurt"]
w2v_model.wv.most_similar(positive=query_words)

[('2006', 0.9998602867126465),
 ('2', 0.9998579025268555),
 ('1992', 0.9998505711555481),
 ('1968', 0.9998490810394287),
 ('13', 0.9998484253883362),
 ('marie', 0.9998481273651123),
 ('1969', 0.9998478889465332),
 ('16', 0.9998430013656616),
 ('1998', 0.9998419284820557),
 ('décès', 0.9998412728309631)]

In [23]:
# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [w2v_model[w] for w in list(w2v_model.wv.vocab.keys())[:5000]]

# dimensionality reduction. converting the vectors to 2d vectors
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(w2v_model.wv.vocab.keys())[:5000]

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

  


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 4103 samples in 0.090s...
[t-SNE] Computed neighbors for 4103 samples in 10.713s...
[t-SNE] Computed conditional probabilities for sample 1000 / 4103
[t-SNE] Computed conditional probabilities for sample 2000 / 4103
[t-SNE] Computed conditional probabilities for sample 3000 / 4103
[t-SNE] Computed conditional probabilities for sample 4000 / 4103
[t-SNE] Computed conditional probabilities for sample 4103 / 4103
[t-SNE] Mean sigma: 0.008104
[t-SNE] KL divergence after 250 iterations with early exaggeration: 73.176750
[t-SNE] KL divergence after 1000 iterations: 1.317510
