In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movie-review/movie_review.csv


### Importation des bibliothèques / fonctions nécessaires 

In [2]:
import pandas as pd 
import string 
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

## Pré-traitement du texte 

#### Prétraitement du feature text

In [3]:
# Importing the dataset
file_path = '/kaggle/input/movie-review/movie_review.csv'
data = pd.read_csv(file_path)

# Importing the columns I wanna work with
Text = data['text']
Tag = data['tag']

# Creating a new dataframe where I'll put my work 
Preprocessing_data = data[['text', 'tag']].copy()

#### Step 1: Normalize #####
Preprocessing_data['text'] = data['text'].str.lower()

#### Step 2: Remove stop words and tokenize ####
stop_words = set(stopwords.words('english'))
def remove_stopwords_and_tokenize(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words and not word.startswith('@')]
    return ' '.join(filtered_text)

Preprocessing_data['text'] = Preprocessing_data['text'].apply(remove_stopwords_and_tokenize)

#### Step 3: Stem ####
stemmer = PorterStemmer()
def stem_text(text):
    stemmed_text = [stemmer.stem(word) for word in word_tokenize(text)]
    return stemmed_text

Preprocessing_data['text'] = Preprocessing_data['text'].apply(stem_text)

# Preprocessed data 
print(Preprocessing_data)

                                                    text  tag
0      [film, adapt, comic, book, plenti, success, ,,...  pos
1      [starter, ,, creat, alan, moor, (, eddi, campb...  pos
2      [say, moor, campbel, thoroughli, research, sub...  pos
3      [book, (, ``, graphic, novel, ,, ``, ), 500, p...  pos
4                [word, ,, n't, dismiss, film, sourc, .]  pos
...                                                  ...  ...
64715   [lack, inspir, trace, back, insipid, charact, .]  neg
64716  [like, mani, skit, current, incarn, _saturday_...  neg
64717  [watch, one, ``, roxburi, ``, skit, snl, ,, co...  neg
64718        [bump, unsuspect, women, ,, ., ., ., 's, .]  neg
64719  [watch, _a_night_at_the_roxbury_, ,, 'll, left...  neg

[64720 rows x 2 columns]


#### Prétraitement du target tag (Convertir les valeurs catégorielles en valeurs numériques)

In [4]:
# Transformer les valeurs de la colonne 'tag' en valeurs numériques
le = LabelEncoder()
Preprocessing_data['tag'] = le.fit_transform(Preprocessing_data['tag'])

# Afficher les données prétraitées
Preprocessing_data.head()

Unnamed: 0,text,tag
0,"[film, adapt, comic, book, plenti, success, ,,...",1
1,"[starter, ,, creat, alan, moor, (, eddi, campb...",1
2,"[say, moor, campbel, thoroughli, research, sub...",1
3,"[book, (, ``, graphic, novel, ,, ``, ), 500, p...",1
4,"[word, ,, n't, dismiss, film, sourc, .]",1


## Entraînement du modèle Word2Vec

In [5]:
import gensim
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# Tokenizer les données
tokenized_data = Preprocessing_data['text'].tolist()

# Construire le modèle Word2Vec
model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, workers=4)

# Entraîner le modèle Word2Vec
model.train(tokenized_data, total_examples=len(tokenized_data), epochs=10)

# Sauvegarder le modèle
model.save("modele_word2vec.model")

## Vectorisation des reviews des movies 

In [6]:
# Fonction pour obtenir l'embedding moyen des mots d'un commentaire
def get_average_word_embedding(commentaire, model, taille_vecteur):
    # Tokeniser le commentaire
    tokens = word_tokenize(commentaire)
    # Obtenir les embeddings de chaque mot dans le commentaire
    embeddings = [model.wv[mot] for mot in tokens if mot in model.wv]
    if embeddings:
        # Calculer l'embedding moyen
        return np.mean(embeddings, axis=0)
    else:
        # Retourner des zéros si aucun embedding n'est trouvé
        return np.zeros(taille_vecteur)

# Vectoriser les commentaires
taille_vecteur = model.vector_size
Preprocessing_data['vectorized_text'] = Preprocessing_data['text'].astype(str).apply(lambda x: get_average_word_embedding(x, model, taille_vecteur))

# Afficher les données vectorisées
print(Preprocessing_data['vectorized_text'])


0        [-1.1120574, 0.5487967, -0.84684944, -0.460104...
1        [-1.1345445, 0.5565828, -0.8483506, -0.4552571...
2        [-1.1278533, 0.5140112, -0.9623763, -0.4581380...
3        [-1.2645186, 0.5945389, -0.9256986, -0.5126380...
4        [-1.0650451, 0.47696635, -0.58653253, -0.20748...
                               ...                        
64715    [-1.100257, 0.47778407, -0.8095651, -0.4457729...
64716    [-1.0771482, 0.49123654, -0.88001746, -0.41208...
64717    [-1.1907177, 0.530403, -0.9080313, -0.44727468...
64718    [-1.1888479, 0.45145863, -0.6661015, -0.363230...
64719    [-1.076462, 0.47772917, -0.597988, -0.28583235...
Name: vectorized_text, Length: 64720, dtype: object


## Division du dataset

In [7]:
from sklearn.model_selection import train_test_split
# Séparer les features (vecteurs de mots) et les labels
X = Preprocessing_data['vectorized_text'].tolist()
y = Preprocessing_data['tag'].tolist()

# Diviser le dataset en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Implémentation du modèle de régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Mettre à l'échelle les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Créer et entraîner le modèle de régression logistique avec le solveur 'saga'
log_reg = LogisticRegression(max_iter=8000, solver='saga')
log_reg.fit(X_train_scaled, y_train)

# Prédire les labels pour l'ensemble de test
y_pred = log_reg.predict(X_test_scaled)

## Calcul des métriques d'évaluation

In [None]:
# Calculer les métriques d'évaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f2_score = f1_score(y_test, y_pred, average='weighted')

# Afficher les résultats
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F2 Score: {f2_score}")