In [1]:
import numpy as np 
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Importation data

In [2]:
data = pd.read_csv('../input/movie-review/movie_review.csv')
data

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos
...,...,...,...,...,...,...
64715,9,cv999,14636,20,that lack of inspiration can be traced back to...,neg
64716,9,cv999,14636,21,like too many of the skits on the current inca...,neg
64717,9,cv999,14636,22,"after watching one of the "" roxbury "" skits on...",neg
64718,9,cv999,14636,23,"bump unsuspecting women , and . . . that's all .",neg


# Prétraitement des données textuelles

In [3]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.lower() not in STOP_WORDS and token not in string.punctuation]
    return tokens

data['text'] = data['text'].apply(preprocess_text)
data

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,"[films, adapted, comic, books, plenty, success...",pos
1,0,cv000,29590,1,"[starters, created, alan, moore, eddie, campbe...",pos
2,0,cv000,29590,2,"[moore, campbell, thoroughly, researched, subj...",pos
3,0,cv000,29590,3,"[book, ``, graphic, novel, ``, 500, pages, lon...",pos
4,0,cv000,29590,4,"[words, dismiss, film, source]",pos
...,...,...,...,...,...,...
64715,9,cv999,14636,20,"[lack, inspiration, traced, insipid, characters]",neg
64716,9,cv999,14636,21,"[like, skits, current, incarnation, _saturday_...",neg
64717,9,cv999,14636,22,"[watching, ``, roxbury, ``, skits, snl, come, ...",neg
64718,9,cv999,14636,23,"[bump, unsuspecting, women]",neg


# Entraînement du modèle Word2Vec

In [4]:
model = Word2Vec(sentences=data['text'], vector_size=10, window=5, min_count=1)

# Vectorisation des reviews de movies

In [5]:
def average_word2vec(tokens, model, vector_size):
    feature_vector = np.zeros((vector_size,), dtype="float32")
    num_words = 0
    
    for token in tokens:
        if token in model.wv.key_to_index:
            feature_vector = np.add(feature_vector, model.wv[token])
            num_words += 1
    
    if num_words != 0:
        feature_vector = np.divide(feature_vector, num_words)
        
    return feature_vector

data['Vector'] = data['text'].apply(lambda x: average_word2vec(x, model, 10))

# Division des données

In [6]:
x = data['Vector'].to_numpy()
y = data['tag']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Construction d'un classificateur par 'Logistic Regression'

In [7]:
# On doit convertir les listes de vecteurs en une matrice numpy 2D pour x_train
x_train = np.array([vec for vec in x_train])

# On doit convertir les listes de vecteurs en une matrice numpy 2D pour x_test
x_test = np.array([vec for vec in x_test])

logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(x_train, y_train)

# Évaluation du modèle

In [8]:
y_pred = logreg_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy :", accuracy)
print("Precision :", precision)
print("Recall    :", recall)
print("F1 score  :", f1)


Accuracy : 0.5383189122373301
Precision : 0.539234283131808
Recall    : 0.5383189122373301
F1 score  : 0.5287508721113761
