In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.feature_selection import SelectPercentile, f_classif

In [2]:
# Load politicals speechs in a dictionary
SPEACH_PATH = "./dataset/tous/"
speach_names = [file_name for file_name in listdir(SPEACH_PATH) if isfile(join(SPEACH_PATH, file_name))]

speaches = dict()
for file_name in speach_names:
    with open(SPEACH_PATH + file_name, encoding='utf8') as txt:
        speaches[file_name] = txt.read().replace("\n", " ")

In [3]:
# Display 40 first chars of all speeches
preview_speaches = dict()
for key, value in speaches.items():
    preview_speaches[key] = value[0:40]
    
print(preview_speaches)

{'François_Hollande_286.txt': ' Mesdames, Messieurs, chers amis, je che', 'François_Bayrou_798.txt': ' Comment douter, en vous voyant, de la r', 'Philippe_Poutou_416.txt': ' http://www.dailymotion.com/video/xq3a30', 'Arlette_Laguiller_97.txt': ' Travailleuses, travailleurs, camarades ', 'Nicolas_Sarkozy_0.txt': ' Mes chers amis, Je salue la Touraine, t', 'Nicolas_Sarkozy_937.txt': ' Mes chers amis, À quatre jours du vote,', 'Arlette_Laguiller_201.txt': ' Travailleuses, Travailleurs, camarades ', 'Ségolène_Royal_324.txt': ' Bonsoir, amis de Lyon et de toute la ré', 'François_Hollande_709.txt': ' Mes chers amis ! Merci d’être venus aus', 'Jacques_Chirac_139.txt': ' Mes chers amis, Mes chers compatriotes,', 'Nicolas_Sarkozy_402.txt': ' Mes chers amis, Toulouse, capitale de l', 'Jacques_Chirac_624.txt': ' Cher Jean-Pierre RAFFARIN, Mes chers Am', 'Ségolène_Royal_732.txt': ' Bonsoir. Chers amis, chers amis de Lorr', 'Ségolène_Royal_890.txt': ' Je vous salue ! Je vous salue peuple de', 'Ségo

In [4]:
# Count word occurences with CountVectorizer
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(speaches.values())
count_array = count_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = coun_vect.get_feature_names_out())

df

Unnamed: 0,000,04,07,10,100,1000,1001,1002,1003,1004,...,événements,évêque,évêques,ééquilibré,êtes,êtr,être,êtres,île,îles
0,1,0,0,2,0,0,0,0,0,0,...,0,0,0,0,3,0,27,0,0,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,23,0,0,0
2,38,0,0,5,2,0,0,0,0,0,...,0,0,0,0,1,0,10,1,0,0
3,3,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,8,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,40,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,4,1,0,0
91,13,0,0,2,2,0,0,0,0,0,...,0,0,0,0,1,0,16,0,0,0
92,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,0,0,0
93,4,0,0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,23,3,0,0


In [5]:
# Count word occurences with TfidfVectorizer
# Source : https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(speaches.values())
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()

df

Unnamed: 0,000,04,07,10,100,1000,1001,1002,1003,1004,...,événements,évêque,évêques,ééquilibré,êtes,êtr,être,êtres,île,îles
0,1,0,0,2,0,0,0,0,0,0,...,0,0,0,0,3,0,27,0,0,1
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,23,0,0,0
2,38,0,0,5,2,0,0,0,0,0,...,0,0,0,0,1,0,10,1,0,0
3,3,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,8,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,40,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,4,1,0,0
91,13,0,0,2,2,0,0,0,0,0,...,0,0,0,0,1,0,16,0,0,0
92,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,0,0,0
93,4,0,0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,23,3,0,0


In [6]:
# Sort speaches by politician name
sorted_speaches = dict()
for key in sorted(speaches.keys()):
    sorted_speaches[key] = speaches[key]

In [7]:
# Define function to get speach author with filename
def get_author(filename):
    split = filename.split("_")
    politician_name = " ".join(split[:-1]).title()
    return politician_name

In [8]:
# Get all politicians
politicians = set([])
for index, key in enumerate(sorted_speaches.keys()):
    politicians.add(get_author(key))
    
politicians = list(politicians)

In [9]:
# Unsupervised learning
# Source : https://realpython.com/k-means-clustering-python/

# Preprocess data - count word frequencies with stopwords
french_stop_words = stopwords.words('french')
vectorizer = TfidfVectorizer(stop_words=french_stop_words)
vectors = vectorizer.fit_transform(sorted_speaches.values())
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
scaled_features = pd.DataFrame(denselist, columns=feature_names)

# Init KMeans
kmeans = KMeans(init="random",
                n_clusters=2, 
                n_init=10, 
                max_iter=300, 
                random_state=42)
kmeans.fit(scaled_features)

# Classify politicians by clusters
report_clusters = [[],[]]
kmeans_outputs = kmeans.labels_
for index, key in enumerate(sorted_speaches.keys()):   
    cluster_type = kmeans_outputs[index]
    cluster_list = report_clusters[cluster_type]
    cluster_list.append(get_author(key))
    report_clusters[cluster_type] = cluster_list
    
# Display cluster part for all politicians
for politician in sorted(politicians):
    part_cluster_1 = report_clusters[0].count(politician)
    part_cluster_2 = report_clusters[1].count(politician)
    if (part_cluster_1 < part_cluster_2):
        percent = part_cluster_2 / (part_cluster_1 + part_cluster_2) * 100
        print("{0} : {1}% of cluster 2".format(politician, percent)) 
    else:
        percent = part_cluster_1 / (part_cluster_1 + part_cluster_2) * 100
        print("{0} : {1}% of cluster 1".format(politician, percent))
        
# Conclusion :
# - one cluster seems to group politicians of extreme left and communist
# - other cluster seems to group all other politicians

Alain Madelin : 100.0% of cluster 2
Arlette Laguiller : 100.0% of cluster 1
Bruno Mégret : 100.0% of cluster 2
Corinne Lepage : 100.0% of cluster 2
Dominique Voynet : 100.0% of cluster 2
Eva Joly : 100.0% of cluster 2
François Bayrou : 100.0% of cluster 2
François Hollande : 100.0% of cluster 2
Gérard Schivardi : 100.0% of cluster 2
Jacques Chirac : 100.0% of cluster 2
Jean-Luc Mélenchon : 100.0% of cluster 2
Jean-Marie Le Pen : 100.0% of cluster 2
Jean-Pierre Chevènement : 100.0% of cluster 2
José Bové : 100.0% of cluster 2
Lionel Jospin : 100.0% of cluster 2
Marie-George Buffet : 100.0% of cluster 2
Marine Le Pen : 100.0% of cluster 2
Nathalie Arthaud : 100.0% of cluster 1
Nicolas Sarkozy : 100.0% of cluster 2
Olivier Besancenot : 50.0% of cluster 1
Philippe Poutou : 100.0% of cluster 2
Robert Hue : 100.0% of cluster 1
Ségolène Royal : 100.0% of cluster 2


In [10]:
# Supervised learning
# Source : https://towardsdatascience.com/training-a-naive-bayes-model-to-identify-the-author-of-an-email-or-document-17dc85fa630a

# Load train dataset
TRAIN_PATH = "./dataset/naive_bayes/nicolas_sarkozy/"
train_speach_names = [file_name for file_name in listdir(TRAIN_PATH) if isfile(join(TRAIN_PATH, file_name))]

# Sarkozy index in politicians list
sarkozy_index = politicians.index("Nicolas Sarkozy")

# Load all train speaches for Sarkozy
X_train = ()
for file_name in train_speach_names:
    with open(TRAIN_PATH + file_name, encoding='utf8') as txt:
        text = txt.read().replace("\n", " ")
        X_train = X_train + (text,)
y_train = (sarkozy_index,) * len(X_train)

In [11]:
# Load test dataset
TEST_PATH = "./dataset/naive_bayes/"
test_speach_names = [file_name for file_name in listdir(TEST_PATH) if isfile(join(TEST_PATH, file_name))]

# Load all test speaches
X_test = ()
y_test = ()
for file_name in test_speach_names:
    with open(TEST_PATH + file_name, encoding='utf8') as txt:
        text = txt.read().replace("\n", " ")
        X_test = X_test + (text,)
        y_test = y_test + (politicians.index(get_author(file_name)),)

In [12]:
french_stop_words = stopwords.words('french')
vectorizer = TfidfVectorizer(stop_words=french_stop_words)
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

In [13]:
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

score_test = model.score(X_test, y_test)
print("Accuracy: {0}".format(score_test))
print(y_pred)
print(y_test)

Accuracy: 0.12658227848101267
[10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
 10 10 10 10 10 10 10]
(8, 1, 0, 10, 0, 2, 3, 3, 2, 2, 2, 9, 16, 8, 6, 10, 17, 11, 15, 5, 22, 3, 4, 15, 10, 20, 3, 3, 19, 21, 8, 10, 2, 0, 15, 10, 8, 19, 3, 10, 11, 4, 8, 17, 16, 0, 0, 4, 0, 19, 10, 15, 19, 18, 13, 8, 10, 7, 15, 10, 4, 11, 3, 0, 10, 14, 8, 18, 15, 8, 3, 17, 2, 0, 21, 8, 12, 0, 8)
