In [None]:
import pandas as pd
import numpy as np
import re
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Data Preparation

In [None]:
# Load the movie review data set into a data frame and preview its first three rows
df = pd.read_csv(filepath_or_buffer='movie_data.csv', encoding='utf-8')
df.head(n=3)

In [None]:
# How many reviews exist in the data set?
# df.shape is (number of rows, number of columns); presumably each row holds
# one review, so the first entry is the review count -- confirm against the CSV.
df.shape

In [None]:
# Are there any special characters in the reviews?
# Inspect the last 50 characters of the first review's text.
df.loc[0,'review'][-50:]

In [None]:
# Creating a function for removal of special characters and html while keeping emoticons
def preprocessor(text):
    """Clean a raw review while preserving its emoticons.

    HTML tags are stripped, the text is lower-cased and every run of
    non-word characters is collapsed into a single space. Emoticons
    (eyes ``:``, ``;`` or ``=``, an optional ``-`` nose, and a mouth ``)``,
    ``(``, ``D`` or ``P``) are collected before that clean-up and appended
    to the end of the cleaned text with the nose removed.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text followed by the space-separated emoticons.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being interpreted
    # as invalid string escapes, which newer Python versions warn about.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # The cleaned text only ends in a space when the input ended in a
    # non-word character; otherwise add one so the first emoticon is not
    # fused onto the last word (e.g. "movie:)").
    if emoticons and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + ' '.join(emoticons).replace('-', '')

In [None]:
# Testing the preprocessor on the last 50 characters of the first review
preprocessor(df.loc[0,'review'][-50:])

In [None]:
# Clean every review in the data set by running it through the preprocessor
df['review'] = df['review'].map(preprocessor)

In [None]:
# Creating a tokenizer function
def tokenizer(text):
    """Break ``text`` into tokens at runs of whitespace and return them as a list."""
    tokens = text.split()
    return tokens

In [None]:
# English stop-word list from NLTK; offered to the vectorizer as one of the
# 'vect__stop_words' options in the grid search below.
stop=stopwords.words('english')

# Model Selection

In [None]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=1,
)

In [None]:
# Comparing logistic regression and support vector machines for the
# classification task at hand and optimizing the respective hyperparameters

# lowercase=False: the reviews were already lower-cased by preprocessor.
# token_pattern=None: every grid below supplies a custom tokenizer, so the
# default token pattern is never used, and leaving it set makes scikit-learn
# emit an "unused parameter" warning on each of the many fits.
tfidf = TfidfVectorizer(lowercase=False, token_pattern=None)

# The classifier step is a placeholder that each grid fills in via 'clf'.
pipeline = Pipeline([
    ('vect', tfidf),
    ('clf', None),
])

parameters = [
    # Linear SVM, vectorizer with its default idf weighting and normalization
    {
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'clf': [LinearSVC(random_state=0, dual=False, max_iter=10000)],
        'clf__C': [0.1, 1, 10, 100],
    },
    # Linear SVM, idf weighting and normalization switched off
    {
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf': [False],
        'vect__smooth_idf': [False],
        'vect__norm': [None],
        'clf': [LinearSVC(random_state=0, dual=False, max_iter=10000)],
        'clf__C': [0.1, 1, 10, 100],
    },
    # Logistic regression, vectorizer with its default idf weighting and normalization
    {
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'clf': [LogisticRegression(random_state=0, solver='liblinear')],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    },
    # Logistic regression, idf weighting and normalization switched off
    {
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf': [False],
        'vect__smooth_idf': [False],
        'vect__norm': [None],
        'clf': [LogisticRegression(random_state=0, solver='liblinear')],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    },
]

# Exhaustive search over all grids with 5-fold cross-validation, scored by accuracy
gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=1, scoring='accuracy', verbose=3)
gscv.fit(X_train, y_train)

In [None]:
# Which parameter combination did the grid search select as the best model?
print('best model: %s ' % (gscv.best_params_,))

In [None]:
# Report two accuracy figures for the best model found by the grid search:
#   - its mean accuracy over the fivefold cross-validation
#   - its accuracy on the held-out test data set

clf = gscv.best_estimator_
print('CV-accuracy: %.3f' % (gscv.best_score_,))
print('accuracy test: %.3f' % (clf.score(X_test, y_test),))


In [None]:
# Utilizing the model in a hypothetical example
label = {0: 'negative', 1: 'positive'}
example = ["I didn't like the movie. It didn't follow the story of the book that it's based on"]

# The model was trained on reviews cleaned by preprocessor, so new text has to
# go through the same cleaning to line up with the learned vocabulary.
example_clean = [preprocessor(text) for text in example]

print('Prediction: %s' % label[clf.predict(example_clean)[0]])
# Not every candidate in the grid search exposes predict_proba (LinearSVC does
# not), so only report a probability when the selected model supports it and
# fall back to the decision score otherwise.
if hasattr(clf, 'predict_proba'):
    print('Probability: %.2f%%' % (np.max(clf.predict_proba(example_clean)) * 100))
else:
    print('Decision score: %.2f' % clf.decision_function(example_clean)[0])