<h1><U>Initialisation environnement et chargement des données</U></h1>

<h3>Importer les modules à utiliser</h3>

In [None]:
from collections import Counter

import emoji as em
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

<h3>Charger les données d'apprentissage</h3>

In [None]:
# Parse the training XML and keep only the rows that have no missing value.
file = pd.read_xml("./data/apprentissage/train.xml").dropna()
file

<h3>Remplacer les virgules par des points afin de convertir la colonne "note" de string à float</h3>

In [None]:
# The notes are written with a decimal comma; swap it for a point and parse
# the column as numbers.
# Casting to str first keeps the cell re-runnable: once the column is numeric,
# a plain Python str.replace on each value would fail.
file["note"] = pd.to_numeric(
    file["note"].astype(str).str.replace(",", ".", regex=False)
)

<h1><U>Analyse de la colonne USER_ID</U></h1>

In [None]:
# Summary statistics of the user identifier column.
file.loc[:, "user_id"].describe()

In [None]:
# Per-user statistics: mean note and number of comments.
# A single vectorized groupby replaces the hand-written accumulation loops.
# sort=False keeps the users in order of first appearance in `file`.
users = (
    file.groupby("user_id", sort=False)["note"]
    .agg(note_mean="mean", nbr_commentaires="size")
    .reset_index()
)
users


In [None]:
# Comment volume against mean note, one translucent dot per user.
fig, ax = plt.subplots()
ax.scatter(users['note_mean'], users['nbr_commentaires'], alpha=0.1)
ax.set_title("Nombre de commentaires en fonction de la note moyenne par utilisateur")
ax.set_xlabel("Note moyenne")
ax.set_ylabel("Nombre de commentaires")
plt.show()

<h1><U>Analyse de la colonne MOVIE</U></h1>

In [None]:
# Summary statistics of the movie column.
file.loc[:, "movie"].describe()

In [None]:
# Mean note of every film: one row per movie, a single "note" column.
test222 = file.groupby("movie")[["note"]].mean()
print(test222)
# Histogram of those per-film means over 100 bins.
test222.plot.hist(bins=100, title="Nombre de films en fonction de la note moyenne")
print("Il y a plus de 100 films qui ont une note moyenne d'environ 3,2")

<h1><U>Analyse de la colonne NOTE</U></h1>

In [None]:
# Summary statistics of the note column.
file.loc[:, "note"].describe()

In [None]:
# Number of occurrences of each note value, ordered by note.
note_counts = file["note"].value_counts().sort_index()
note_counts.plot.bar(title="Occurrences des notes presentes dans le corpus")

In [None]:
# One violin per film showing how its notes are distributed.
# The frame is passed through the `data` keyword so the call does not depend on
# which parameter seaborn accepts positionally.
ax = sns.violinplot(data=file[["movie", "note"]], x="note", y="movie")
ax.set_title("Distribution des notes pour chaque film présent dans le corpus")
ax.set_xlabel("note");

<h1><U>Analyse de la colonne COMMENTAIRE</U></h1>

In [None]:
# Summary statistics of the comment column.
file.loc[:, "commentaire"].describe()

<h3>Analyse des émoticônes</h3>

In [None]:
# Table of per-emoji statistics (emoji, count, mean and std of the notes);
# it starts out empty.
result = pd.DataFrame(
    columns=["emoji", "count", "note_mean", "note_std"],
)

def get_emoji_from_corpus(commentaires):
    """Count, for each emoji, the number of comments in which it appears.

    Args:
        commentaires: iterable of comment texts.

    Returns:
        A pair ``(emoticons, emoticons_count)`` of parallel lists: the distinct
        emoji in order of first appearance, and for each of them the number of
        comments that contain it.
    """
    # A dict keeps insertion order and gives constant-time lookups, replacing
    # the linear `in` / `.index()` scans over two parallel lists.
    occurrences = {}
    for text in commentaires:
        # get_emoji_from_text yields each emoji at most once per comment, so
        # the counter below counts comments, not individual characters.
        for emoticon in get_emoji_from_text(text):
            occurrences[emoticon] = occurrences.get(emoticon, 0) + 1
    return list(occurrences), list(occurrences.values())

def get_emoji_from_text(text):
    """Return the distinct emoji characters of ``text``, in order of first appearance.

    The text is scanned one character at a time; a character is kept when it is
    a key of ``em.EMOJI_DATA``.
    """
    # dict.fromkeys de-duplicates while preserving the order of first occurrence.
    return list(dict.fromkeys(char for char in text if char in em.EMOJI_DATA))

# Emoji found in the comments, with the number of comments containing each one.
list_of_emoticons, count = get_emoji_from_corpus(file["commentaire"])

# Note statistics for every emoji that appears in more than one comment.
records = []
for emoticon, nb_count in zip(list_of_emoticons, count):
    if nb_count <= 1:
        continue
    # regex=False matches the emoji literally instead of compiling it as a
    # regular expression; na=False treats missing comments as non-matching.
    mask = file["commentaire"].str.contains(emoticon, regex=False, na=False)
    notes = file.loc[mask, "note"]
    records.append((emoticon, nb_count, notes.mean(), notes.std()))

# Build the frame once from the collected rows rather than growing it row by
# row, so pandas can infer numeric dtypes for the count and note columns.
result = pd.DataFrame(records, columns=['emoji', 'count', 'note_mean', 'note_std'])

print(result)

result.to_csv("./data/processed/emoticons.csv", index=False, sep=" ", mode='w')

In [None]:
# Mean note against note standard deviation, one point per emoji.
result.plot(kind="scatter", x="note_mean", y="note_std")

In [None]:
import spacy
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus that is used further down to filter tokens.
nltk.download("stopwords")

# French spaCy pipeline, loaded without the parser, attribute ruler and NER.
nlp = spacy.load(
    "fr_core_news_md",
    exclude=["parser", "attribute_ruler", "ner"],
)

# TODO: ask Lorenzo for the code

In [None]:
# Load the lemmatized comments from the processed-data CSV.
file_lemmed = pd.read_csv(filepath_or_buffer="./data/processed/lemmatized.csv")
file_lemmed

In [None]:
import re

def _split_comment(raw):
    """Lower-case a lemmatized comment and split it on single spaces.

    Runs of spaces are collapsed first, then commas, periods and both quote
    characters are replaced by a space. Because the punctuation is replaced
    after the collapse, the returned list can contain empty strings.
    """
    cleaned = re.sub(' +', ' ', str(raw))
    for mark in (",", ".", "'", '"'):
        cleaned = cleaned.replace(mark, " ")
    return cleaned.lower().split(" ")


comments_split = file_lemmed["lemmatized"].apply(_split_comment)
comments_split

In [None]:
# Build the French stop-word set once instead of once per comment.
french_stopwords = set(stopwords.words("french"))

# Keep every distinct token that is not a stop word. dict.fromkeys removes
# duplicates in order of first appearance, so the token order is the same on
# every run (a set difference gives no such guarantee).
comments_stop = comments_split.apply(
    lambda tokens: [tok for tok in dict.fromkeys(tokens) if tok not in french_stopwords]
)
comments_stop[0]


In [None]:
# Overwrite the "lemmatized" column in place with the contents of
# `comments_stop`, then display the frame.
# NOTE(review): after this cell the column no longer holds the values read from
# the CSV, so cells that read it give different results when re-run.
file_lemmed["lemmatized"]=comments_stop
file_lemmed

In [None]:
from wordcloud import WordCloud

# Word cloud over all comments, plus the matching word-frequency table.
cloud_generator = WordCloud(random_state=1,
                            height=1000, width=1000,
                            background_color='salmon',
                            colormap='Pastel2',
                            collocations=False)

# Flatten the per-comment token collections into one space-separated string.
text = " ".join(token for tokens in file_lemmed["lemmatized"] for token in tokens)
wordcloud_image = cloud_generator.generate(text)

plt.figure(figsize=(7, 7))
plt.imshow(wordcloud_image, interpolation='bilinear')
plt.axis("off")

# Count every token in one pass.
words = text.split(" ")
words_freq = Counter(words)
# Drop the empty token if there is one; pop with a default does not raise when
# it is absent, unlike `del`.
words_freq.pop("", None)
df_words_all = (
    pd.DataFrame(list(words_freq.items()), columns=["word", "freq"])
    .sort_values(by="freq", ascending=False)
)

In [None]:
# Word cloud and word-frequency table restricted to the comments whose note is
# strictly below 3.0.
negatif = file_lemmed.loc[file_lemmed["note"] < 3.0]
cloud_generator = WordCloud(random_state=1,
                            height=1000, width=1000,
                            background_color='white',
                            collocations=False)

# Flatten the per-comment token collections into one space-separated string.
text = " ".join(token for tokens in negatif["lemmatized"] for token in tokens)
wordcloud_image = cloud_generator.generate(text)

plt.figure(figsize=(7, 7))
plt.imshow(wordcloud_image, interpolation='bilinear')
plt.axis("off")

# Count every token in one pass.
words = text.split(" ")
words_freq = Counter(words)
# Drop the empty token if there is one; pop with a default does not raise when
# it is absent, unlike `del`.
words_freq.pop("", None)
df_words_negatif = (
    pd.DataFrame(list(words_freq.items()), columns=["word", "freq"])
    .sort_values(by="freq", ascending=False)
)

In [None]:
# Word cloud and word-frequency table restricted to the comments whose note is
# strictly above 3.0.
positif = file_lemmed.loc[file_lemmed["note"] > 3.0]
cloud_generator = WordCloud(random_state=1,
                            height=1000, width=1000,
                            background_color='white',
                            collocations=False)

# Flatten the per-comment token collections into one space-separated string.
text = " ".join(token for tokens in positif["lemmatized"] for token in tokens)
wordcloud_image = cloud_generator.generate(text)

plt.figure(figsize=(7, 7))
plt.imshow(wordcloud_image, interpolation='bilinear')
plt.axis("off")

# Count every token in one pass.
words = text.split(" ")
words_freq = Counter(words)
# Drop the empty token if there is one; pop with a default does not raise when
# it is absent, unlike `del`.
words_freq.pop("", None)
df_words_positif = (
    pd.DataFrame(list(words_freq.items()), columns=["word", "freq"])
    .sort_values(by="freq", ascending=False)
)

In [None]:
# Rows at positions 20 to 29 of the overall word-frequency table.
print(df_words_all.iloc[20:30])

In [None]:
# Rows at positions 20 to 29 of the word-frequency table for the low notes.
print(df_words_negatif.iloc[20:30])

In [None]:
# Rows at positions 20 to 29 of the word-frequency table for the high notes.
print(df_words_positif.iloc[20:30])