In [1]:
import pandas as pd
import numpy as np


# Load the raw data set. `full_df` keeps every column of the CSV (including
# `tag`, which is used later for the labels); `df` is an independent
# one-column working frame whose `text` values are forced to `str`.
full_df = pd.read_csv("/kaggle/input/movie-review/movie_review.csv")
df = full_df.loc[:, ["text"]].astype({"text": str})
full_df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


# Prétraitement des données

Suppression des stop words

In [2]:
from nltk.corpus import stopwords
# A set makes the per-word membership test cheap. The NLTK English stop-word
# list is lowercase, so words are lowercased before being compared against it.
stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Return `text` with English stop words removed.

    The input is coerced to `str`, split on whitespace, and re-joined with
    single spaces. Matching is case-insensitive so that capitalised stop
    words ("The", "And") are dropped as well; the case of the words that are
    kept is left untouched.
    """
    return " ".join(
        word for word in str(text).split() if word.lower() not in stop_words
    )


df["text_wo_stop_words"] = df["text"].apply(remove_stop_words)
df.head()

Unnamed: 0,text,text_wo_stop_words
0,films adapted from comic books have had plenty...,"films adapted comic books plenty success , whe..."
1,"for starters , it was created by alan moore ( ...","starters , created alan moore ( eddie campbell..."
2,to say moore and campbell thoroughly researche...,say moore campbell thoroughly researched subje...
3,"the book ( or "" graphic novel , "" if you will ...","book ( "" graphic novel , "" ) 500 pages long in..."
4,"in other words , don't dismiss this film becau...","words , dismiss film source ."


Mise en minuscules

In [3]:
# Lowercase the stop-word-free text and store it in its own column.
lowered_text = df["text_wo_stop_words"].str.lower()
df["text_lower"] = lowered_text
df.head()

Unnamed: 0,text,text_wo_stop_words,text_lower
0,films adapted from comic books have had plenty...,"films adapted comic books plenty success , whe...","films adapted comic books plenty success , whe..."
1,"for starters , it was created by alan moore ( ...","starters , created alan moore ( eddie campbell...","starters , created alan moore ( eddie campbell..."
2,to say moore and campbell thoroughly researche...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...
3,"the book ( or "" graphic novel , "" if you will ...","book ( "" graphic novel , "" ) 500 pages long in...","book ( "" graphic novel , "" ) 500 pages long in..."
4,"in other words , don't dismiss this film becau...","words , dismiss film source .","words , dismiss film source ."


Suppression de ponctuation

In [4]:
import string
punctuation = string.punctuation
# Built once instead of on every call: maps each character of
# `string.punctuation` to "delete".
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted.

    Characters are removed, not replaced, so surrounding whitespace is kept
    as-is (e.g. "a , b" becomes "a  b") and apostrophes collapse the word
    ("don't" becomes "dont").
    """
    return text.translate(_PUNCTUATION_TABLE)
# Strip punctuation from the *lowercased* column so that the lowercase step is
# actually part of the final preprocessed text (the stop-word-only column
# would silently skip it).
df["preprocessed_text"] = df["text_lower"].apply(remove_punctuation)
df.head()

Unnamed: 0,text,text_wo_stop_words,text_lower,preprocessed_text
0,films adapted from comic books have had plenty...,"films adapted comic books plenty success , whe...","films adapted comic books plenty success , whe...",films adapted comic books plenty success whet...
1,"for starters , it was created by alan moore ( ...","starters , created alan moore ( eddie campbell...","starters , created alan moore ( eddie campbell...",starters created alan moore eddie campbell ...
2,to say moore and campbell thoroughly researche...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...
3,"the book ( or "" graphic novel , "" if you will ...","book ( "" graphic novel , "" ) 500 pages long in...","book ( "" graphic novel , "" ) 500 pages long in...",book graphic novel 500 pages long include...
4,"in other words , don't dismiss this film becau...","words , dismiss film source .","words , dismiss film source .",words dismiss film source


# Entrainement du modèle Word2Vec

In [5]:
from nltk.tokenize import word_tokenize

# Split each preprocessed review into a list of tokens with NLTK.
tokens_per_review = df["preprocessed_text"].map(word_tokenize)
df["tokenized_text"] = tokens_per_review
df.head()

Unnamed: 0,text,text_wo_stop_words,text_lower,preprocessed_text,tokenized_text
0,films adapted from comic books have had plenty...,"films adapted comic books plenty success , whe...","films adapted comic books plenty success , whe...",films adapted comic books plenty success whet...,"[films, adapted, comic, books, plenty, success..."
1,"for starters , it was created by alan moore ( ...","starters , created alan moore ( eddie campbell...","starters , created alan moore ( eddie campbell...",starters created alan moore eddie campbell ...,"[starters, created, alan, moore, eddie, campbe..."
2,to say moore and campbell thoroughly researche...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...,"[say, moore, campbell, thoroughly, researched,..."
3,"the book ( or "" graphic novel , "" if you will ...","book ( "" graphic novel , "" ) 500 pages long in...","book ( "" graphic novel , "" ) 500 pages long in...",book graphic novel 500 pages long include...,"[book, graphic, novel, 500, pages, long, inclu..."
4,"in other words , don't dismiss this film becau...","words , dismiss film source .","words , dismiss film source .",words dismiss film source,"[words, dismiss, film, source]"


In [6]:
from gensim.models import Word2Vec
# Train Word2Vec on the tokenized reviews (100-dimensional vectors, context
# window of 5, no minimum frequency cut-off, 4 worker threads).
model = Word2Vec(sentences=df["tokenized_text"], vector_size=100, window=5, min_count=1, workers=4)


def _embed_tokens(tokens):
    """Return the embedding of every token in `tokens` that `model` knows."""
    return [model.wv[token] for token in tokens if token in model.wv]


# `apply` keeps the result aligned with the frame's own index, so this does
# not depend on `df` having a default 0..n-1 index the way a positional
# `df.at[i, ...]` loop does.
df["word_vectors"] = df["tokenized_text"].apply(_embed_tokens)
df.head()

Unnamed: 0,text,text_wo_stop_words,text_lower,preprocessed_text,tokenized_text,word_vectors
0,films adapted from comic books have had plenty...,"films adapted comic books plenty success , whe...","films adapted comic books plenty success , whe...",films adapted comic books plenty success whet...,"[films, adapted, comic, books, plenty, success...","[[-1.0533507, 0.2111854, 0.7168706, 0.86349934..."
1,"for starters , it was created by alan moore ( ...","starters , created alan moore ( eddie campbell...","starters , created alan moore ( eddie campbell...",starters created alan moore eddie campbell ...,"[starters, created, alan, moore, eddie, campbe...","[[-0.027641581, 0.017273474, 0.030309223, -0.0..."
2,to say moore and campbell thoroughly researche...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...,"[say, moore, campbell, thoroughly, researched,...","[[-0.7298285, 0.9981885, -0.01141301, 0.341162..."
3,"the book ( or "" graphic novel , "" if you will ...","book ( "" graphic novel , "" ) 500 pages long in...","book ( "" graphic novel , "" ) 500 pages long in...",book graphic novel 500 pages long include...,"[book, graphic, novel, 500, pages, long, inclu...","[[-0.3657412, 0.301753, 0.65252155, -0.0525106..."
4,"in other words , don't dismiss this film becau...","words , dismiss film source .","words , dismiss film source .",words dismiss film source,"[words, dismiss, film, source]","[[-0.322295, 0.4877711, 0.33561867, -0.0959588..."


# Vectorisation des critiques de films

In [7]:
def _mean_vector(word_vectors):
    """Average a review's word vectors into one vector.

    Returns None when the review has no word vectors, so such rows can be
    identified (and excluded) before training.
    """
    if len(word_vectors) == 0:
        return None
    return np.mean(word_vectors, axis=0)


# Index-aligned replacement for the positional `df.at[i, ...]` loop; the
# unused `reviews_list` (a full re-split of every review) is dropped.
df["review_vector"] = df["word_vectors"].apply(_mean_vector)

df.head()

Unnamed: 0,text,text_wo_stop_words,text_lower,preprocessed_text,tokenized_text,word_vectors,review_vector
0,films adapted from comic books have had plenty...,"films adapted comic books plenty success , whe...","films adapted comic books plenty success , whe...",films adapted comic books plenty success whet...,"[films, adapted, comic, books, plenty, success...","[[-1.0533507, 0.2111854, 0.7168706, 0.86349934...","[-0.3573358, 0.43363076, 0.3361319, -0.0330491..."
1,"for starters , it was created by alan moore ( ...","starters , created alan moore ( eddie campbell...","starters , created alan moore ( eddie campbell...",starters created alan moore eddie campbell ...,"[starters, created, alan, moore, eddie, campbe...","[[-0.027641581, 0.017273474, 0.030309223, -0.0...","[-0.20687117, 0.15761329, 0.31639022, -0.18995..."
2,to say moore and campbell thoroughly researche...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...,say moore campbell thoroughly researched subje...,"[say, moore, campbell, thoroughly, researched,...","[[-0.7298285, 0.9981885, -0.01141301, 0.341162...","[-0.33312535, 0.486958, 0.32377997, -0.1588604..."
3,"the book ( or "" graphic novel , "" if you will ...","book ( "" graphic novel , "" ) 500 pages long in...","book ( "" graphic novel , "" ) 500 pages long in...",book graphic novel 500 pages long include...,"[book, graphic, novel, 500, pages, long, inclu...","[[-0.3657412, 0.301753, 0.65252155, -0.0525106...","[-0.2420425, 0.2460383, 0.26388946, -0.0148352..."
4,"in other words , don't dismiss this film becau...","words , dismiss film source .","words , dismiss film source .",words dismiss film source,"[words, dismiss, film, source]","[[-0.322295, 0.4877711, 0.33561867, -0.0959588...","[-0.274918, 0.23367628, 0.40009725, 0.27528965..."


# Division des données

In [8]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Encode the string labels as integers (a single fit_transform is enough;
# a separate fit beforehand is redundant).
le = preprocessing.LabelEncoder()
df['tag'] = le.fit_transform(full_df['tag'])

# Define the features and the target.
# Rows whose review has no word vectors hold None in `review_vector`; they are
# excluded, and the remaining per-review vectors are stacked into a 2-D
# (n_samples, n_features) float matrix, which is what scikit-learn estimators
# expect (a 1-D object array of arrays is rejected by `fit`).
has_vector = df["review_vector"].notna()
X = np.vstack(df.loc[has_vector, "review_vector"].tolist())
y = df.loc[has_vector, "tag"].to_numpy()


# Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



### Construction d'un classificateur

In [9]:
# Disabled cell: the classifier code below is wrapped in a string literal, so
# nothing is trained; the cell only echoes the string.
# NOTE(review): if re-enabled as written it would rebind `model`, which
# currently holds the Word2Vec model trained in an earlier cell.
'''from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)'''

'from sklearn.linear_model import LogisticRegression\n\nmodel = LogisticRegression()\nmodel.fit(X_train, y_train)'

# Évaluation du modèle

In [10]:
# Disabled cell: the evaluation code below is wrapped in a string literal, so
# no prediction or scoring is performed; the cell only echoes the string.
# NOTE(review): it refers to `logistic_model`, a name that is never assigned
# in the visible notebook (the disabled training cell assigns `model`).
'''y_pred=logistic_model.predict(X_test)
logistic_model.score(X_test,y_test)
'''

'y_pred=logistic_model.predict(X_test)\nlogistic_model.score(X_test,y_test)\n'