In [2]:
import pyprind
import pandas as pd
import os

In [3]:
basepath = '/home/ivan/data_science/ML/aclImdb'

In [4]:
#creiamo un dataframe con i file delle recensioni
labels = {'pos':1, 'neg':0}
pbar=pyprind.ProgBar(50000)
df=pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos','neg') :
        path=os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt=infile.read()
                df=df.append([[txt, labels[l]]], ignore_index=True)
                pbar.update()
df.columns=['review','sentiment']                
                
                

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:10:03


In [5]:
df

Unnamed: 0,review,sentiment
0,A recent re-issue of the French crime film (or...,1
1,I watched Cold Mountain and the English Patien...,1
2,For those who have enjoyed the Asterix books a...,1
3,are you crazy or what? this movie has talent w...,1
4,"The Stepford Children, besides being a very go...",1
5,BLACK WATER is a thriller that manages to comp...,1
6,Black Scorpion is a fun flick about a groovy f...,1
7,"Actually, I have more a question, than a comme...",1
8,This movie was never intended as a big-budget ...,1
9,"Being from eastern PA, right on the border of ...",1


In [6]:
#creiamo un file csv shuffolando i valori
import numpy as np
np.random.seed(0)
df=df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [7]:
#leggiamo il file csv
df=pd.read_csv('movie_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,review,sentiment
0,"Compared to the acclaimed Hollywood remake, th...",1
1,Worst Movie I Have Ever Seen! 90 Minutes of ex...,0
2,If this movie were more about Piper Perabo's c...,0
3,Gabe Ryan (Frankie Thomas) gets out of reform ...,1
4,Have you ever seen a movie made up entirely of...,0


In [8]:
#eliminiamo i caratteri markup, individuiamo le emoticon e le mettiamo alla fine del testo, convertiamo tutto il testo in minuscolo
import re
def preprocessor(text):
    text=re.sub('<[^>]*>','',text)
    emoticons=re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text=(re.sub('[\W]+', ' ',text.lower())+' '.join(emoticons).replace('-',''))
    return text
df['review']=df['review'].apply(preprocessor)

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,compared to the acclaimed hollywood remake thi...,1
1,worst movie i have ever seen 90 minutes of exc...,0
2,if this movie were more about piper perabo s c...,0
3,gabe ryan frankie thomas gets out of reform sc...,1
4,have you ever seen a movie made up entirely of...,0


In [10]:
#dividiamo le 50000 recensioni in train e test
X_train=df.loc[:25000, 'review'].values
y_train=df.loc[:25000, 'sentiment'].values
X_test=df.loc[25000:, 'review'].values
y_test=df.loc[25000:, 'sentiment'].values

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer


In [12]:
#importiamo il vocabolario inglese di stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop=stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/ivan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
#separiamo le varie parole delle recensioni
def tokenizer(text):
    return text.split()

In [14]:
#definiamo il Porter stemmer per trasformare le varie parole separae nelle loro radici
from nltk.stem.porter import PorterStemmer
porter=PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [15]:
#calcoliamo la term frequency - inverse document frequency (vettorizzazione delle recensioni)
tfidf=TfidfVectorizer(strip_accents=None,
                     lowercase=False,
                     preprocessor=None)

In [16]:
#prepariamo i valori dei parametri per grid search
param_grid=[{'vect__ngram_range':[(1,1)], #bag of words con una solo parola per token
            'vect__stop_words': [stop, None], #proviamo sia ad elimnare le stop words (parole ricorrenti e non influenti)che a non eliminarle
            'vect__tokenizer': [tokenizer, tokenizer_porter], #applichiamo lo stemming: le parole vengono trasformate nella loro radice
             'clf__penalty': ['l1','l2'],
             'clf__C': [1.0, 10.0, 100.0]},
            {'vect__ngram_range':[(1,1)], #bag of words con una solo parola per token
            'vect__stop_words': [stop, None], #proviamo sia ad elimnare le stop words (parole ricorrenti e non influenti)che a non eliminarle
            'vect__tokenizer': [tokenizer, tokenizer_porter], #applichiamo lo stemming: le parole vengono trasformate nella loro radice
             'vect__use_idf': [False],
             'vect__norm': [None],
             'clf__penalty': ['l1','l2'],
             'clf__C': [1.0, 10.0, 100.0]}      
           ] 

In [17]:
#impostiamo la pipeline con vettorizzazione e classificazione
lr_tfidf=Pipeline([('vect', tfidf),
                   ('clf', LogisticRegression(random_state=0))])

In [18]:
#Eseguiamo il grid search con i parametri definiti sopra
gs_lr_tfidf=GridSearchCV(lr_tfidf, param_grid,
                        scoring='accuracy',
                        cv=5,
                        verbose=1,
                        n_jobs=-1)

In [19]:
#addestriamo il modello
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 100.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 137.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
    

In [20]:
print('Miglior set din parametri:  %s' %gs_lr_tfidf.best_params_)

Miglior set din parametri:  {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f084504b598>}


In [21]:
print('Accuratezza: %.3f' %gs_lr_tfidf.best_score_)

Accuratezza: 0.893


In [22]:
clf=gs_lr_tfidf.best_estimator_

In [23]:
print('Accuratezza sui dati test: %.3f' %clf.score(X_test, y_test))

Accuratezza sui dati test: 0.901
