In [3]:
import pandas as pd

In [4]:
# read the data into a pandas dataframe
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfneg = data2df('MoviePosNeg/neg/', 0) # NEG
dfpos = data2df('MoviePosNeg/pos/', 1) # POS

df = pd.concat([dfpos, dfneg], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
367,cv367_22792.txt,""" the end of the affair "" is a dark and moody...",1
858,cv858_20266.txt,"the haunting , a film so confusing that it for...",0
524,cv524_23627.txt,even if i did not know that director and co-wr...,1
102,cv102_8306.txt,"ever since wargames , the first real computer ...",0
467,cv467_25773.txt,catherine deane ( jennifer lopez ) is a psycho...,1
132,cv132_5423.txt,""" showgirls "" is the first big-budget , big-s...",0
351,cv351_15458.txt,"though it is a fine piece of filmmaking , ther...",1
77,cv077_23172.txt,ahh yes . \nthe teenage romance . \nan attract...,0
752,cv752_25330.txt,"bad movies described as "" a swift descent into...",0
729,cv729_10154.txt,"titantic , writer and director james cameron's...",1


In [14]:
# setup the data
X, y = df['text'], df['class']
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

In [15]:
# setup a custom preprocessor
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
def preprocess(text):
    # replace one or more white-space characters with a space
    regex = re.compile(r"\s+")                               
    text = regex.sub(' ', text)    
    # lower case
    text = text.lower()          
    # remove digits and punctuation
    regex = re.compile(r"[%s%s]" % (string.punctuation, string.digits))
    text = regex.sub(' ', text)           
    # remove stop words
    sw = stopwords.words('english')
    text = text.split()                                              
    text = ' '.join([w for w in text if w not in sw]) 
    # remove short words
    ' '.join([w for w in text.split() if len(w) >= 2])
    # lemmatize
    text = ' '.join([(WordNetLemmatizer()).lemmatize(w) for w in text.split()]) 
    return text

In [16]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tv = TfidfVectorizer(
#     preprocessor=preprocess,
#     #lowercase=True, stop_words='english', 
#     use_idf=True, smooth_idf=True, norm='l2',
#     min_df=1, max_df=1.0, max_features=None, 

#     ngram_range=(1, 1))
# XTtrain = pd.DataFrame(tv.fit_transform(Xtrain).toarray(), columns=tv.get_feature_names())
# XTtrain.head()

In [17]:
# setup the preprocessing->model pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
clf = Pipeline(steps=[
    ('pp', TfidfVectorizer(
        #preprocessor=preprocess,
        lowercase=True, stop_words='english', 
        use_idf=True, smooth_idf=True, norm='l2',
        min_df=1, max_df=1.0, max_features=None, 
        ngram_range=(1, 1))),
    ('mdl',     MultinomialNB())
    #('mdl',     RandomForestClassifier())
    ])

In [18]:
# setup grid search
from sklearn.model_selection import GridSearchCV
param_grid = {
    'mdl__alpha':[0.01, 0.1, 0.2, 0.5, 1]
    #'mdl__n_estimators':[500, 700, 1000]
}
gscv = GridSearchCV(clf, param_grid, iid=False, cv=4, return_train_score=False)

In [19]:
# search for best parameters/estimator
gscv.fit(Xtrain, ytrain)

#print(gscv.best_estimator_, "\n")
#print(gscv.best_score_, "\n")
print(gscv.best_params_, "\n")
#print(gscv.cv_results_, "\n")

{'mdl__alpha': 1} 



In [20]:
# evaluate best_estimator_ on test data
ypred = gscv.best_estimator_.predict(Xtest)
from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.8
[[162  43]
 [ 37 158]]
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       205
           1       0.79      0.81      0.80       195

   micro avg       0.80      0.80      0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400

