# Text Classification - IMDB

# Imports 

In [None]:
import numpy as np
import pandas as pd

#for nlp
import nltk
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

#text vectorisation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

#metrics
from sklearn.metrics import classification_report, accuracy_score

#import methods related to evaluation
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit, cross_val_score, GridSearchCV

#classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

#for graphs
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Exploring the Dataset

In [None]:
# CSV file holding the review data; later cells read its 'review' and 'sentiment' columns.
filename = 'movie_data_cat.csv'
# NOTE(review): despite the "SPAM" prefix, this frame holds the movie reviews used for
# sentiment classification; the name is kept because every later cell refers to it.
SPAM_dataframe = pd.read_csv(filename, encoding='utf-8')

In [None]:
SPAM_dataframe.head()

# Converting Class Index to int

In [None]:
# Give every distinct sentiment label an integer index, following the order
# in which np.unique returns the labels.
unique_labels = np.unique(SPAM_dataframe['sentiment'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))

print(class_mapping)
# Keep the original label names (the mapping's keys, in index order) for later.
class_labels = list(class_mapping)

In [None]:
# Use the mapping dictionary to transform the class labels into integers.
# NOTE(review): this overwrites the column in place. Re-running the cell would look the
# already-converted integers up in class_mapping (whose keys are the original labels),
# presumably producing missing values -- run it only once per load.

SPAM_dataframe["sentiment"] = SPAM_dataframe["sentiment"].map(class_mapping)


In [None]:
SPAM_dataframe.head()

# Cleaning up the Data

In [None]:
SPAM_dataframe.loc[42, 'review']#[-50:]

In [None]:
#import regular expressions to clean up the text
import re
def preprocessor(text):
    """Strip markup and punctuation from a review while keeping its emoticons.

    HTML tags are removed, the text is lower-cased, and every run of non-word
    characters is collapsed into a single space. Emoticons such as ``:)``,
    ``;-(`` or ``=D`` are collected before that clean-up and re-attached at
    the end of the text, with the ``-`` "nose" removed for consistency.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        The cleaned, lower-cased text. When emoticons were found they follow
        the text, separated from it and from each other by single spaces.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being treated as
    # (invalid) Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)  # remove all html markup
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # Lower-case and replace each run of non-word characters with one space.
    cleaned = re.sub(r'\W+', ' ', text.lower())
    if not emoticons:
        return cleaned

    # Always put exactly one space between the text and the emoticons;
    # otherwise an emoticon would fuse with the last word whenever the cleaned
    # text ends in a word character (e.g. "world:)") and would not survive as
    # its own whitespace-delimited token.
    smileys = ' '.join(emoticons).replace('-', '')
    return cleaned.rstrip(' ') + ' ' + smileys

In [None]:
preprocessor(SPAM_dataframe.loc[42, 'review'])#[-50:]

In [None]:
# Apply the preprocessor to every entry of the 'review' column,
# overwriting the raw text in place with its cleaned version.
SPAM_dataframe['review'] = SPAM_dataframe['review'].apply(preprocessor)

# Tokenisation, Stemming & Stop-Word Removal

In [None]:
from nltk.corpus import stopwords 

# Fetch the NLTK 'stopwords' corpus that stopwords.words() reads below.
nltk.download('stopwords')


# English stop words as a set: used for membership tests in stop_removal and
# offered as a candidate 'vect__stop_words' value in param_grid0.
stop = set(stopwords.words('english'))
# English Snowball stemmer, used by tokenizer_stemmer.
stemmer = SnowballStemmer("english")

def tokenizer(text):
    """Split *text* on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def tokenizer_stemmer(text):
    """Tokenise *text* with ``tokenizer`` and return the stem of every token.

    Stems are produced by the module-level ``stemmer`` and returned as a list
    in the same order as the tokens.
    """
    return list(map(stemmer.stem, tokenizer(text)))


def stop_removal(text):
    """Return the items of *text* that are not in the module-level ``stop`` set.

    *text* is iterated item by item, so callers presumably pass a list of
    tokens rather than a raw string (a string would be filtered per character).
    """
    return [token for token in text if token not in stop]
    

In [None]:
SPAM_dataframe.loc[42, 'review']

In [None]:
# Work on a random subset of at most 2500 rows (presumably to keep training time
# manageable). The draw is seeded so the subset -- and with it the seeded
# train/test split that follows -- is the same on every run, and the size is capped
# at the number of available rows so a smaller dataset does not raise.
SPAM_dataframe_subset = SPAM_dataframe.sample(
    n=min(2500, len(SPAM_dataframe)),
    random_state=42,
)

# Training for Sentiment Classification

In [None]:
# Hold out 20% of the subset for testing, with a fixed seed for the split.
# Column 0 supplies the inputs and column 1 the targets (presumably 'review'
# and 'sentiment' -- confirm against the CSV's column order).
X_train, X_test, y_train, y_test = train_test_split(
    SPAM_dataframe_subset.iloc[:, 0],
    SPAM_dataframe_subset.iloc[:, 1],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Hyper-parameter grid for the 'vect' (TfidfVectorizer) step of the pipeline.
param_grid0 = [
    {
        # Unigrams only; add (1, 2) to also extract 2-grams of words in
        # addition to the 1-grams (individual words).
        'vect__ngram_range': [(1, 1)],
        # Try the vectoriser with and without stop-word removal. The stop words
        # are passed as a list because scikit-learn's parameter validation
        # accepts only 'english', a list or None here -- not a set.
        'vect__stop_words': [sorted(stop), None],
        # Tokenise on whitespace and stem every token.
        'vect__tokenizer': [tokenizer_stemmer],
    },
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


# Vectoriser shared by all pipelines below. Lower-casing and pre-processing are
# switched off here, presumably because the reviews were already cleaned and
# lower-cased by `preprocessor` in an earlier cell.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# One dict per classifier: {'cv_acc', 'accuracy', 'name'}, accuracies in percent.
results_list = []

# Grid-search the vectoriser settings in param_grid0 for each classifier and
# record its cross-validated and held-out accuracy.
# NOTE: the variable names (mnb_tfidf, gs_mnb_tfidf, clf, ...) are kept as they
# were, even though they are reused for every classifier, because later cells
# read gs_mnb_tfidf (which ends up holding the search for the LAST classifier).
for func in [LogisticRegression(),
             MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
             SVC(kernel='rbf', gamma=0.7, C=1.0)]:
    results = {}

    print("Testing: " + func.__class__.__name__)

    mnb_tfidf = Pipeline([('vect', tfidf),
                          ('clf', func)])

    gs_mnb_tfidf = GridSearchCV(mnb_tfidf, param_grid0,
                                scoring='accuracy',
                                cv=2,
                                verbose=1,
                                n_jobs=-1)

    gs_mnb_tfidf.fit(X_train, y_train)

#     print('Best parameter set: %s ' % gs_mnb_tfidf.best_params_)
    clf = gs_mnb_tfidf.best_estimator_

    # Score the held-out set once and reuse the value; both accuracies are
    # converted to percent so the printed numbers and the stored results agree.
    test_accuracy = clf.score(X_test, y_test) * 100
    cv_accuracy = gs_mnb_tfidf.best_score_ * 100

    print('Test Accuracy: %.3f' % test_accuracy)
    print('CV Accuracy: %.3f' % cv_accuracy)

    results['cv_acc'] = cv_accuracy
    results['accuracy'] = test_accuracy
    results['name'] = func.__class__.__name__

    results_list.append(results)


In [None]:
# One row per classifier: cross-validation accuracy ('cv_acc') and test-set
# accuracy ('accuracy'), both in percent, plus the estimator's class name ('name').
df_results = pd.DataFrame(results_list) 
df_results

In [None]:
gs_mnb_tfidf.n_splits_

In [None]:
# gs_mnb_tfidf.cv_results_

In [None]:
gs_mnb_tfidf.cv_results_['split0_test_score']

In [None]:
# from plotnine import *
# (ggplot(df_results, aes(x='name', y='accuracy', fill='factor(cv)'))
#  + geom_col()
# )