# 1 - Préparation du traitement des données

In [None]:
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [None]:
# Number of sentiment140 rows drawn by data() when it samples the training set.
NB_DATA_140 = 100_000

On a enregistré plusieurs versions des deux jeux de données, préprocessés de manières différentes selon les combinaisons des arguments *stop_words*, *lemmatization* et *negation*. On va créer une fonction *data* qui, en fonction des paramètres de préprocessing choisis par l'utilisateur, chargera les deux dataframes correspondants (sentiment140 et tweets scrapés).

In [None]:
def _preprocessed_name(prefix, stop_words, lemmatization, negation):
    """Return prefix followed by the suffix of every enabled preprocessing step.

    The suffixes are appended in a fixed order: "_stop", "_lemm", "_neg".
    """
    suffixes = (
        (stop_words, "_stop"),
        (lemmatization, "_lemm"),
        (negation, "_neg"),
    )
    return prefix + "".join(tag for enabled, tag in suffixes if enabled)


def data(stop_words, lemmatization, negation):
    """Load the sentiment140 sample and the scraped tweets for one preprocessing variant.

    Args:
        stop_words: truthy to load the files whose name carries the "_stop" suffix.
        lemmatization: truthy to load the files whose name carries the "_lemm" suffix.
        negation: truthy to load the files whose name carries the "_neg" suffix.

    Returns:
        A tuple (X_140, y_140, X_web, df_web):
        - X_140: list built from the `text` column of the sampled sentiment140 frame,
        - y_140: list built from its `sentiment` column,
        - X_web: list built from the `Text` column of the scraped-tweets frame,
        - df_web: the scraped-tweets dataframe itself.

    NOTE(review): pd.read_pickle unpickles the file contents, so these files
    must come from a trusted source.
    """
    # Both file names follow the same "<prefix>[_stop][_lemm][_neg].bz2" scheme.
    path_140 = os.path.join(
        "data",
        "sentiment140",
        _preprocessed_name("train", stop_words, lemmatization, negation) + ".bz2",
    )
    # Fixed random_state keeps the NB_DATA_140-row sample reproducible.
    df_140 = (
        pd.read_pickle(path_140)
        .sample(NB_DATA_140, random_state=1234)
        .reset_index(drop=True)
    )

    path_web = os.path.join(
        "data",
        "web",
        _preprocessed_name("web", stop_words, lemmatization, negation) + ".bz2",
    )
    df_web = pd.read_pickle(path_web)

    X_140 = df_140.text.to_list()
    y_140 = df_140.sentiment.to_list()
    X_web = df_web.Text.to_list()

    return X_140, y_140, X_web, df_web

On va sélectionner le modèle pertinent ainsi que la méthode de feature extraction.

In [None]:
# Sentiment classifier: TF-IDF features over unigrams and bigrams
# (max_features=6000) feeding a logistic regression (max_iter=500).
model = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2), max_features=6000),
    LogisticRegression(max_iter=500),
)

# 2 - Application du modèle

On choisit nos arguments de préprocessing et on charge les jeux de données correspondants à l'aide de la fonction *data*.

In [None]:
# Preprocessing variant to load: lemmatized text only.
STOP_WORDS = False
LEMMATIZATION = True
NEGATION = False

# Load the matching sentiment140 sample and scraped-tweets frame.
X_140, y_140, X_web, df_web = data(
    stop_words=STOP_WORDS,
    lemmatization=LEMMATIZATION,
    negation=NEGATION,
)

Application du modèle pour labelliser les tweets scrapés

In [None]:
# Train the pipeline on the labelled sentiment140 sample first: calling
# predict() on a freshly built, unfitted pipeline raises NotFittedError.
model.fit(X_140, y_140)

# Label the scraped tweets with the trained model.
y_web = model.predict(X_web)

Ajout de la prédiction au dataframe des tweets

In [None]:
# Store the predictions next to the tweets they were computed from
# (the predictions live in y_web; the previous name `resul` was never defined).
df_web['sentiment'] = y_web

# 3 - Visualisation

In [None]:
# Number of scraped tweets per value of the 'sentiment' column.
sentiment_counts_ax = sns.countplot(data=df_web, x='sentiment')
plt.show()

In [None]:
# Tweet counts per value of 'Film', split by the 'sentiment' column.
film_sentiment_ax = sns.histplot(data=df_web, x='Film', hue='sentiment')
plt.show()

In [None]:
def _date_list_label(first_day, last_day):
    """Return "[YYYY-MM-DD, YYYY-MM-DD, ...]" listing every day from first_day to last_day inclusive."""
    days = pd.date_range(first_day, last_day).strftime("%Y-%m-%d")
    return "[" + ", ".join(days) + "]"


# The raw 'Time Range' values are textual lists of consecutive days. Each one
# is rebuilt from its first and last day and mapped to a readable event name,
# so the key/label pairing is explicit instead of positional.
_TIME_RANGE_LABELS = {
    _date_list_label("2021-10-08", "2021-11-13"): "Sortie Dune",
    _date_list_label("2021-07-03", "2021-08-08"): "Sortie Space Jam",
    _date_list_label("2020-09-09", "2020-09-23"): "Premier trailer Dune",
    _date_list_label("2021-07-22", "2021-08-05"): "Second trailer Dune",
    _date_list_label("2021-04-03", "2021-04-17"): "Premier trailer Space Jam",
    _date_list_label("2021-06-09", "2021-06-23"): "Second trailer Space Jam",
}

df_web['Time Range'] = df_web['Time Range'].replace(_TIME_RANGE_LABELS)

In [None]:
# Sentiment split per 'Time Range' value, restricted to rows whose 'Film' is 'dune'.
dune_tweets = df_web.loc[df_web['Film'] == 'dune']
sns.histplot(data=dune_tweets, x='Time Range', hue='sentiment')

In [None]:
# Sentiment split per 'Time Range' value, restricted to rows whose 'Film' is 'space jam'.
space_jam_tweets = df_web.loc[df_web['Film'] == 'space jam']
sns.histplot(data=space_jam_tweets, x='Time Range', hue='sentiment')

In [None]:
# Take the .day attribute of every 'Datetime' value
# (presumably the day of the month of each tweet — confirm the column holds datetimes).
df_web['day'] = [stamp.day for stamp in df_web['Datetime']]

In [None]:
# Sentiment per 'day' value for 'dune' rows in the "Sortie Dune" time range.
# Both conditions are combined into one mask built on df_web, instead of
# applying a full-length mask to an already filtered frame.
dune_release_mask = (df_web['Film'] == 'dune') & (df_web['Time Range'] == "Sortie Dune")
sns.histplot(data=df_web.loc[dune_release_mask], x='day', hue='sentiment')

In [None]:
# Sentiment per 'day' value for 'space jam' rows in the "Sortie Space Jam" time range.
# Both conditions are combined into one mask built on df_web, instead of
# applying a full-length mask to an already filtered frame.
space_jam_release_mask = (df_web['Film'] == 'space jam') & (df_web['Time Range'] == "Sortie Space Jam")
sns.histplot(data=df_web.loc[space_jam_release_mask], x='day', hue='sentiment')