In [4]:
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Téléchargement des ressources nécessaires de NLTK
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Charger les données à partir du fichier CSV
df = pd.read_csv('movie_review.csv')

# Définir les colonnes pour la classification
features_column = 'text'  # Colonnes de caractéristiques
labels_column = 'tag'     # Colonnes d'étiquettes

## Avant pretraitement

In [9]:
df['text']

0        films adapted from comic books have had plenty...
1        for starters , it was created by alan moore ( ...
2        to say moore and campbell thoroughly researche...
3        the book ( or " graphic novel , " if you will ...
4        in other words , don't dismiss this film becau...
                               ...                        
64715    that lack of inspiration can be traced back to...
64716    like too many of the skits on the current inca...
64717    after watching one of the " roxbury " skits on...
64718     bump unsuspecting women , and . . . that's all .
64719    after watching _a_night_at_the_roxbury_ , you'...
Name: text, Length: 64720, dtype: object

In [7]:
# Prétraitement des données textuelles
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenization des mots
    tokens = word_tokenize(text)
    
    # Conversion en minuscules
    tokens = [word.lower() for word in tokens]
    
    # Suppression de la ponctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    
    # Suppression des mots vides
    tokens = [word for word in tokens if word not in stop_words]
    
    # Reconstitution du texte prétraité
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Appliquer le prétraitement à chaque texte de la colonne "Text"
df['Preprocessed_Text'] = df[features_column].apply(preprocess_text)


## Apres pretraitement

In [8]:
df['Preprocessed_Text']

0        films adapted comic books plenty success wheth...
1        starters created alan moore eddie campbell bro...
2        say moore campbell thoroughly researched subje...
3        book `` graphic novel `` 500 pages long includ...
4                            words n't dismiss film source
                               ...                        
64715      lack inspiration traced back insipid characters
64716    like many skits current incarnation _saturday_...
64717    watching one `` roxbury `` skits snl come away...
64718                           bump unsuspecting women 's
64719    watching _a_night_at_the_roxbury_ 'll left exa...
Name: Preprocessed_Text, Length: 64720, dtype: object

In [10]:
# Entraînement du modèle Word2Vec
tokenized_texts = [word_tokenize(text) for text in df['Preprocessed_Text']]
model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

In [11]:
df['Preprocessed_Text']

0        films adapted comic books plenty success wheth...
1        starters created alan moore eddie campbell bro...
2        say moore campbell thoroughly researched subje...
3        book `` graphic novel `` 500 pages long includ...
4                            words n't dismiss film source
                               ...                        
64715      lack inspiration traced back insipid characters
64716    like many skits current incarnation _saturday_...
64717    watching one `` roxbury `` skits snl come away...
64718                           bump unsuspecting women 's
64719    watching _a_night_at_the_roxbury_ 'll left exa...
Name: Preprocessed_Text, Length: 64720, dtype: object

In [12]:
# Vectorisation des reviews de movies
def get_review_vector(review):
    tokens = word_tokenize(review)
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        return [0] * 100  # Vecteur nul si aucun mot ne correspond dans le modèle

df['Review_Vector'] = df['Preprocessed_Text'].apply(get_review_vector)

In [13]:
df['Review_Vector']

0        [-0.49658206, 0.31141835, 0.23791857, 0.056281...
1        [-0.26951024, 0.30265602, 0.10224354, 0.188763...
2        [-0.40534353, 0.37222615, 0.1986156, 0.0743688...
3        [-0.6344722, 0.29854515, 0.21892644, 0.2255277...
4        [-0.5824431, 0.6223749, -0.07175214, -0.206473...
                               ...                        
64715    [0.033346962, 0.39561346, 0.011281236, -0.2208...
64716    [-0.4257373, 0.2883243, 0.22346158, 0.05913018...
64717    [-0.85216767, 0.38365075, 0.4281632, 0.2064081...
64718    [-0.33112565, 0.23932604, 0.33529422, 0.084547...
64719    [-0.5578683, 0.2947772, 0.18137416, -0.2490791...
Name: Review_Vector, Length: 64720, dtype: object

In [14]:
# Division des données
X = pd.DataFrame(df['Review_Vector'].tolist())  # Caractéristiques
y = df[labels_column]                           # Étiquettes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Construction d'un classificateur (logistic regression)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

In [16]:
# Évaluation du modèle
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.5709981458590853
Precision: 0.5718303233874755
Recall: 0.5709981458590853
F1 Score: 0.5675461155425728
