# Text Classification - SMS Spam

# Imports 

In [None]:
import numpy as np
import pandas as pd

#for nlp
import nltk
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

#text vectorisation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

#metrics
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc

#import methods related to evaluation
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit, cross_val_score, GridSearchCV

#classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

#for graphs
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Global variables for the script

# Passed as `cv` to GridSearchCV and quoted in the box-plot title.
cross_validation_iterations = 30


# Exploring the Dataset

In [None]:
# Load the SMS data from a CSV file in the working directory.
# Later cells read the columns 'class' and 'sms_msg' from this frame.
filename = 'SMSSpamData.csv'
SPAM_dataframe = pd.read_csv(filename, encoding='utf-8')

In [None]:
# Preview the first rows of the freshly loaded frame.
SPAM_dataframe.head()

# Converting Class Index to int

In [None]:
# Give every distinct value of the 'class' column an integer index, in the
# order np.unique returns them.
unique_labels = np.unique(SPAM_dataframe['class'])
class_mapping = {label: position for position, label in enumerate(unique_labels)}

print(class_mapping)
class_labels = list(class_mapping)  # keep the original label names for later

In [None]:
# Use the mapping dictionary to replace the class labels with their integer
# indices, overwriting the 'class' column in place.
# NOTE(review): re-running this cell would look the already-converted integers
# up in class_mapping (whose keys are the original labels) -- presumably
# producing NaN; reload the data before running it again.

SPAM_dataframe["class"] = SPAM_dataframe["class"].map(class_mapping)


In [None]:
# Preview the frame again now that 'class' holds integer indices.
SPAM_dataframe.head()

# Cleaning up the Data

In [None]:
# Show one raw message (row label 42) before any cleaning is applied.
SPAM_dataframe.loc[42, 'sms_msg']#[-50:]

In [None]:
#import regular expressions to clean up the text
import re
def preprocessor(text):
    """Normalise one message for bag-of-words vectorisation.

    Steps, in order:
      1. strip HTML/XML tags;
      2. collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` from the
         tag-free text (before lower-casing, so ``D``/``P`` keep their case);
      3. lower-case the text and collapse every run of non-word characters
         into a single space;
      4. append the emoticons, space-separated and with the ``-`` nose
         removed so ``:-)`` and ``:)`` become the same token.

    Args:
        text: the message as a ``str``.

    Returns:
        The cleaned string; emoticons, if any, follow the words.
    """
    # Raw strings keep `\W`, `\)` and `\(` as regex escapes instead of
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)  # remove all html markup
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')

    # When the message ends in a word character there is no trailing space,
    # so add one; otherwise the first emoticon is glued onto the last word.
    if emoticon_suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [None]:
# Run the preprocessor on the same message (row label 42) to see its effect.
preprocessor(SPAM_dataframe.loc[42, 'sms_msg'])#[-50:]

In [None]:
# Apply the preprocessor to every message, overwriting the 'sms_msg' column.
SPAM_dataframe['sms_msg'] = SPAM_dataframe['sms_msg'].apply(preprocessor)

# Tokenise, Stemmer & Stop

In [None]:
from nltk.corpus import stopwords 

# Fetch the NLTK stop-word corpus; this must run before stopwords.words() below.
nltk.download('stopwords')


# English stop words as a set, used for membership tests in stop_removal and
# as a candidate value in the grid search.
stop = set(stopwords.words('english'))
# Shared English Snowball stemmer used by tokenizer_stemmer.
stemmer = SnowballStemmer("english")

def tokenizer(text):
    """Coerce `text` to ``str`` and split it on runs of whitespace.

    Returns a list of tokens; an empty or whitespace-only input gives ``[]``.
    """
    tokens = str(text).split()
    return tokens

def tokenizer_stemmer(text):
    """Tokenise `text` with `tokenizer` and stem each token.

    Uses the module-level `stemmer`; returns the stems as a list, in token order.
    """
    stems = []
    for token in tokenizer(text):
        stems.append(stemmer.stem(token))
    return stems


def stop_removal(text):
    """Return the items of `text` that are not in the module-level `stop` set.

    Despite its name, `text` is iterated element by element, so pass a list
    of tokens; a plain string would be filtered character by character.
    """
    return list(filter(lambda w: w not in stop, text))
    

In [None]:
# Show row label 42 again; by now the column holds the preprocessed text.
SPAM_dataframe.loc[42, 'sms_msg']

In [None]:
# Swap in the sampled line below for a quicker run on 1000 random rows:
# SPAM_dataframe_subset = SPAM_dataframe.sample(n=1000)
# Currently the full frame is used (same object, not a copy).
SPAM_dataframe_subset = SPAM_dataframe

# Training for Spam Classification

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is repeatable.
# Features are taken from the second column and targets from the first, by
# position -- assumes the CSV column order is (class, sms_msg); TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
     SPAM_dataframe_subset.iloc[:,1], SPAM_dataframe_subset.iloc[:,0], test_size=0.20, random_state=42)

In [None]:
# Hyper-parameter grid for the 'vect' (TF-IDF) step of the pipeline.
# The order of the 'vect__stop_words' candidates matters: the training loop
# reads candidate 0 as "with stop words" and candidate 1 as "without".
# The stop words are passed as a list because scikit-learn documents
# `stop_words` as 'english', a list, or None, and recent releases reject a set.
param_grid0 = [{'vect__ngram_range': [(1, 1)], #can also extract 2-grams of words in addition to the 1-grams (individual words)
               'vect__stop_words': [sorted(stop), None], # use the stop-word list or no stop words at all
               'vect__tokenizer': [tokenizer_stemmer]}, # use a tokeniser and the stemmer 
               ]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


# Shared TF-IDF vectoriser. Lower-casing and custom preprocessing are switched
# off here; the tokenizer and stop-word settings are supplied per candidate
# by the grid search through param_grid0.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Accumulators filled once per classifier and read by the later cells.
results_list = []    # summary dict per classifier (cv_acc, accuracy, name)
names = []           # box-plot labels, two per classifier
raw_names = []       # bare classifier class names
cv_results = []      # per-fold accuracies, parallel to `names`
predictions = []     # test-set predictions of each best estimator
best_settings = []   # 'vect__stop_words' value of each best estimator


for func in [LogisticRegression(solver='lbfgs', multi_class='multinomial'),
                   MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
                   SVC(kernel='rbf', gamma=0.7, C=1.0)]:
    model_name = func.__class__.__name__

    print("Testing: " + model_name)

    mnb_tfidf = Pipeline([('vect', tfidf),
                     ('clf',  func)])

    gs_mnb_tfidf = GridSearchCV(mnb_tfidf, param_grid0,
                               scoring='accuracy',
                               cv=cross_validation_iterations,
                               verbose=1,
                               n_jobs=-1)

    gs_mnb_tfidf.fit(X_train, y_train)

    # Best pipeline found by the search (refit on the whole training set by
    # GridSearchCV's default refit behaviour).
    clf = gs_mnb_tfidf.best_estimator_

    # Score the held-out set once and reuse the value for printing and storage.
    test_accuracy = clf.score(X_test, y_test) * 100
    cv_accuracy = gs_mnb_tfidf.best_score_ * 100

    print('Test Accuracy: %.3f' % test_accuracy)

    print('CV Accuracy: %.3f' % cv_accuracy)

    results = {}
    results['cv_acc'] = cv_accuracy
    results['accuracy'] = test_accuracy
    results['name'] = model_name

    results_list.append(results)

    # Per-fold accuracy of each candidate. Index 0 / 1 follow the order of
    # 'vect__stop_words' in param_grid0 (stop words first, then None).
    stop_results = []
    none_results = []

    for split in range(gs_mnb_tfidf.n_splits_):
        split_scores = gs_mnb_tfidf.cv_results_['split' + str(split) + '_test_score']

        print("CV split " + str(split + 1) + " results:")

        print(split_scores)

        stop_results.append(split_scores[0])
        none_results.append(split_scores[1])

    predictions.append(clf.predict(X_test))
    # get_params() takes only the boolean `deep` flag, so no argument is passed.
    best_settings.append(clf.get_params()['vect__stop_words'])

    raw_names.append(model_name)

    names.append(model_name + " StopWords")
    cv_results.append(stop_results)

    names.append(model_name + " None")
    cv_results.append(none_results)
    
    

In [None]:
# Tabulate the per-classifier summaries: one row per model, with the columns
# cv_acc, accuracy and name.
df_results = pd.DataFrame(results_list) 
df_results

In [None]:
# gs_mnb_tfidf.n_splits_

In [None]:
# stop_results

In [None]:
# Raw per-fold accuracy lists, two per classifier (with / without stop words).
cv_results

In [None]:
# Print each label followed by its list of per-fold accuracies.
for position, fold_scores in enumerate(cv_results):
    print(names[position])
    print(fold_scores)
    print("---")

In [None]:
# gs_mnb_tfidf is rebound on every pass of the training loop, so this shows
# the grid-search results of the last classifier only (the SVC).
gs_mnb_tfidf.cv_results_

In [None]:
# fig = plt.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# plt.boxplot(cv_results)
# ax.set_xticklabels(names)
# plt.show()

In [None]:
from matplotlib.ticker import MultipleLocator

# One figure, two panels: ROC curves (left) and CV accuracy box plot (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 10))

# Sample counts shown in both panel titles.
quantity_note = 'Quantity (Train: ' + str(X_train.count()) + '; Test:' + str(X_test.count())+') '

# NOTE(review): `predictions` holds class labels from predict(), so each curve
# has only one interior point; scores from decision_function / predict_proba
# would give a full ROC curve.
for model_name, model_predictions, stop_setting in zip(raw_names, predictions, best_settings):
    fpr, tpr, threshold = roc_curve(y_test, model_predictions)

    # Area under this model's ROC curve.
    the_auc = auc(fpr, tpr)

    # The legend states whether the winning configuration used the stop-word list.
    if stop_setting:
        plot_label = "`" + model_name + "` with StopWords (area = "+str(round(the_auc, 4))+")."
    else:
        plot_label = "`" + model_name + "` with no StopWords (area = "+str(round(the_auc, 4))+")."

    # The label is already complete. It has no %-placeholder, so applying
    # `% the_auc` to it raises TypeError for a plain float.
    axes[0].plot(fpr, tpr, label=plot_label)

axes[0].plot([0, 1], [0, 1], 'k--') # diagonal    

axes[0].set_title('ROC Curve \n'+
                  'Best accuracy StopWord paramiter from each model. \n' +
                  quantity_note)
axes[0].set_xlabel('False positive rate')
axes[0].set_ylabel('True positive rate')
axes[0].legend(loc='best')
axes[0].xaxis.grid(True)

spacing = 0.1 # This can be your user specified spacing. 

# A Locator keeps a reference to the Axis it is attached to, so each axis
# gets its own instance instead of sharing one.
axes[0].yaxis.set_minor_locator(MultipleLocator(spacing))
axes[0].xaxis.set_minor_locator(MultipleLocator(spacing))
axes[0].grid(which = 'minor')


# box plot

bplot2 = axes[1].boxplot(cv_results,
                         vert=True,  # vertical box alignment
                         patch_artist=True,  # fill with color
                         labels=names
                         )  # will be used to label x-ticks
axes[1].set_title('Model Accuracy \n'+ 
                  'Performed with ' + str(cross_validation_iterations) +' cross validation iterations. \n' + 
                  quantity_note)

axes[1].set_xlabel('Models used')
axes[1].set_ylabel('Accuracy Recorded (range(0 - 1) = 0 - 100%)')

# fill with colors; wrap around the palette so every box is coloured even
# when there are more boxes than colours
colors = ['#CBD9D6', '#8DA593', '#D9CBA3', '#F2A88C', '#D98282']
for box_index, patch in enumerate(bplot2['boxes']):
    patch.set_facecolor(colors[box_index % len(colors)])

# adding horizontal grid lines
for ax in axes:
    ax.yaxis.grid(True)

    
    
# Rotates the tick labels of the current axes (the box plot, created last).
plt.xticks(rotation=90)
plt.show()