#*classification du texte*


In [86]:
#importation
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec #import de la classe Word2Vec depuis le module gensim.models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


**Etape 0: lire le fichier "movie_review.csv" et extraction des colonnes Text (features) et Tag (label)**

In [55]:


# Lecture du fichier CSV
data = pd.read_csv('drive/MyDrive/tp3_ressources/movie_review.csv')
#affichage
print(data)

# Extraction des colonnes pour la classification
new_data = data[['text', 'tag']]

#affichage du new_data
print("\n")
print("data après extraction de 'text' et 'tag' :\n")
new_data

       fold_id cv_tag  html_id  sent_id  \
0            0  cv000    29590        0   
1            0  cv000    29590        1   
2            0  cv000    29590        2   
3            0  cv000    29590        3   
4            0  cv000    29590        4   
...        ...    ...      ...      ...   
64715        9  cv999    14636       20   
64716        9  cv999    14636       21   
64717        9  cv999    14636       22   
64718        9  cv999    14636       23   
64719        9  cv999    14636       24   

                                                    text  tag  
0      films adapted from comic books have had plenty...  pos  
1      for starters , it was created by alan moore ( ...  pos  
2      to say moore and campbell thoroughly researche...  pos  
3      the book ( or " graphic novel , " if you will ...  pos  
4      in other words , don't dismiss this film becau...  pos  
...                                                  ...  ...  
64715  that lack of inspiration can

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos
...,...,...
64715,that lack of inspiration can be traced back to...,neg
64716,like too many of the skits on the current inca...,neg
64717,"after watching one of the "" roxbury "" skits on...",neg
64718,"bump unsuspecting women , and . . . that's all .",neg


**Etape 1: Pre-processing des données textuelles**

In [66]:
# Téléchargement des ressources nécessaires pour NLTK
nltk.download('punkt')
nltk.download('stopwords')

# Définition des stop words
stop_words = set(stopwords.words('english'))

#définition de la fct preprocessing
def preprocessing(text):
  text=text.lower()  #Convertit le texte en minuscules.
  text=''.join([word for word in text if word not in string.punctuation]) #supprimer la ponctuation du texte.
  tokens=word_tokenize(text)
  tokens=[word for word in tokens if word not in stop_words] #supprimer les topwords
  return ' '.join(tokens)

# appliquer la fonction preprocessing  à chaque élément du dataFrame new_data
new_data = new_data.applymap(lambda x: preprocessing(x) if isinstance(x, str) else x)

#affichage du dataframe après preprocessing
print("new_data après preprocessing :\n")
new_data


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


new_data après preprocessing :



Unnamed: 0,text,tag
0,films adapted comic books plenty success wheth...,pos
1,starters created alan moore eddie campbell bro...,pos
2,say moore campbell thoroughly researched subje...,pos
3,book graphic novel 500 pages long includes nea...,pos
4,words dont dismiss film source,pos
...,...,...
64715,lack inspiration traced back insipid characters,neg
64716,like many skits current incarnation saturdayni...,neg
64717,watching one roxbury skits snl come away chara...,neg
64718,bump unsuspecting women thats,neg


**Etape 2 : Entraînement du modèle Word2Vec**

In [74]:
# Diviser chaque phrase en mots et stocker dans une liste
phrases=[phrase.split() for phrase in new_data['text']]

# Entraîner le modèle Word2Vec
w2v_model=Word2Vec(phrases,vector_size=100,window=5,min_count=5,workers=4)

# fonction pour vectoriser une phrase :
def vectoriser(phrase):
  mots=phrase.split()
  mots_vecs = [w2v_model.wv[mot] for mot in mots if mot in w2v_model.wv]
  if len(mots_vecs)==0:
    return np.zeros(100)
  mots_vecs=np.array(mots_vecs)
  return mots_vecs.mean(axis=0) # Calculer la moyenne des vecteurs de mots pour obtenir le vecteur de la phrase

# Vectorisation des reviews de movies  : appliquer la fonction vectoriser_review à chaque élément de la colonne 'text'
comments_vecs=np.array([vectoriser(phrase) for phrase in new_data['text']])
print(comments_vecs)


[[-0.31897467  0.33555189  0.15150923 ... -0.44357625 -0.019327
   0.01475222]
 [-0.24236088  0.13565718  0.08976796 ... -0.23055761  0.1759131
  -0.16195998]
 [-0.2948215   0.31733182  0.15595937 ... -0.46245873 -0.06118005
   0.01337652]
 ...
 [-0.28766271  0.15839337  0.1213825  ... -0.08499939 -0.14536399
   0.10729872]
 [-0.2341058   0.22140408  0.13705389 ... -0.26863402 -0.05572414
   0.0952722 ]
 [-0.55543226  0.1913057   0.04504279 ... -0.46640897 -0.26271909
   0.19888054]]


**Etape 3 : Division des données**

In [82]:
# Diviser le dataset
x = new_data['text']  # Features
y = new_data['tag']   # Label

# Diviser les données en ensembles d'entraînement et de test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# la vectorisation des reviews de films dans les ensembles d'entraînement x_train et de test x_test
x_train=np.array([vectoriser(phrase) for phrase in x_train])
x_test=np.array([vectoriser(phrase) for phrase in x_test])


**Etape 4 : Construction d un classificateur**

In [83]:
# Initialiser le modèle logistic regression
model = LogisticRegression()

# Entraîner le modèle sur l'ensemble d'entraînement x_train
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Etape 5 : Évaluation du modèle**

In [88]:
# prédiction de y_test à partir de x_test
y_pred = model.predict(x_test)

# Calculer l'accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Calculer la precision
print("Precision:", precision_score(y_test, y_pred, pos_label='pos'))

# Calculer le recall
print("Recall:", recall_score(y_test, y_pred, pos_label='pos'))

# Calculer le F1-score
print("F1-score:", f1_score(y_test, y_pred, pos_label='pos'))

Accuracy: 0.5764833127317676
Precision: 0.5700166859196509
Recall: 0.6756427810740909
F1-score: 0.6183514341409079
