# Text Classification - IMDB

# Imports 

In [1]:
import numpy as np
import pandas as pd

#for nlp
import nltk
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

#text vectorisation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

#metrics
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc

#import method releated to evaluation
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit, cross_val_score, GridSearchCV, RandomizedSearchCV

#classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

#for graphs
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Global vairiables for script

cross_validation_iterations = 30


# Exploring the Dataset

In [3]:
filename = 'SMSSpamData.csv'
SPAM_dataframe = pd.read_csv(filename, encoding='utf-8')

In [4]:
SPAM_dataframe.head()

Unnamed: 0,class,sms_msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Converting Class Index to int

In [5]:
class_mapping = {label:idx for idx,label in enumerate(np.unique(SPAM_dataframe['class']))}

print(class_mapping)
class_labels = [x for x in class_mapping] # store the class labels for later

{'ham': 0, 'spam': 1}


In [6]:
#use the mapping dictionary to transform the class labels into integers

SPAM_dataframe["class"] = SPAM_dataframe["class"].map(class_mapping)


In [7]:
SPAM_dataframe.head()

Unnamed: 0,class,sms_msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Cleaning up the Data

In [8]:
SPAM_dataframe.loc[42, 'sms_msg']#[-50:]

'07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow'

In [9]:
#import regular expressions to clean up the text
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text) # remove all html markup
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) # findall the emoticons
    
    # remove the non-word chars '[\W]+'
    # append the emoticons to end 
    #convert all to lowercase
    # remove nose char for consistency
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', '')) 
    return text

In [10]:
preprocessor(SPAM_dataframe.loc[42, 'sms_msg'])#[-50:]

'07732584351 rodger burns msg we tried to call you re your reply to our sms for a free nokia mobile free camcorder please call now 08000930705 for delivery tomorrow'

In [11]:
# apply the preprocessor to the entire dataframe (i.e. column review)
SPAM_dataframe['sms_msg'] = SPAM_dataframe['sms_msg'].apply(preprocessor)

# Tokenise, Stemmer & Stop

In [12]:
from nltk.corpus import stopwords 

nltk.download('stopwords')


stop = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def tokenizer(text):
    text = str(text)
    return text.split()

def tokenizer_stemmer(text):
    return [stemmer.stem(word) for word in tokenizer(text)]#text.split()]


def stop_removal(text):
       return [w for w in text if not w in stop]
    

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
SPAM_dataframe.loc[42, 'sms_msg']

'07732584351 rodger burns msg we tried to call you re your reply to our sms for a free nokia mobile free camcorder please call now 08000930705 for delivery tomorrow'

In [14]:
# SPAM_dataframe_subset = SPAM_dataframe.sample(n=1000)
SPAM_dataframe_subset = SPAM_dataframe

# Training for Sentiment Classification

In [15]:
X_val_train, X_test, y_val_train, y_test = train_test_split(
     SPAM_dataframe_subset.iloc[:,1], SPAM_dataframe_subset.iloc[:,0], test_size=0.20, random_state=42)

In [16]:
X_train, X_val, y_train, y_val = train_test_split(
     X_val_train, y_val_train, test_size=0.20, random_state=42)

In [17]:
print("Number of entries for Training of best models: " + str(len(X_train)))
print("Number of entries for Validating best models: " + str(len(X_val)))
print("Number of entries for Testing best models: " + str(len(X_test)))

Number of entries for Training of best models: 3567
Number of entries for Validating best models: 892
Number of entries for Testing best models: 1115


In [18]:
param_grid0 = [{'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], #can also extract 2-grams of words in addition to the 1-grams (individual words)
               'vect__stop_words': [stop, None], # use the stop dictionary of stopwords or not
               'vect__tokenizer': [tokenizer_stemmer]}, # use a tokeniser and the stemmer 
               ]

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)


my_models = []


for func in [LogisticRegression(),
                   MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
                   SVC(kernel='rbf', gamma=0.7, C=1.0)]:
    results = {}
    
    stop_results = []
    none_results = []
    
    print("Testing: " + func.__class__.__name__)
    
    mnb_tfidf = Pipeline([('vect', tfidf),
                     ('clf',  func)])
                   
    gs_mnb_tfidf = RandomizedSearchCV(mnb_tfidf, param_grid0,
                               scoring='accuracy',
                               cv=cross_validation_iterations,
                               verbose=1,
                               n_jobs=-1)
    
    gs_mnb_tfidf.fit(X_val, y_val)
    
         
    clf = gs_mnb_tfidf.best_estimator_
    
    if clf.get_params()['vect__stop_words']:
        stopwords_used = 'yes'
    else:
        stopwords_used = 'no'
    
    my_models.append((str(func.__class__.__name__) + ': w/stopwords removed = ' + stopwords_used, clf))
    
    
    print("Model cv score: " + str(gs_mnb_tfidf.best_score_* 100))
    
print('Best models created!')

Testing: LogisticRegression


AttributeError: 'list' object has no attribute 'values'

In [None]:
results = []
names = []
roc_prediction = []
for name, model in my_models:
    print('---------------------------------')
    print('Cross validation of model: ' + name + ' with settings: \n')
    print(str(model.get_params()['clf']) + '\n')
    
    cv = ShuffleSplit(n_splits=cross_validation_iterations, test_size=0.2, random_state=42)
    
    cv_results = model_selection.cross_val_score(model, X_train, y_train, 
                                                 cv=cv, scoring='accuracy',
                                                 verbose=1,
                                                 n_jobs=-1)
    roc_prediction.append(model.predict(X_test))
    print(cv_results)
    results.append(cv_results)
    names.append(name)
    print('---------------------------------')
    print('Complete')
    

In [None]:
from matplotlib.ticker import MultipleLocator

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 10))

for result in range(0, len(results)):
    fpr, tpr, threshold = roc_curve(y_test, roc_prediction[result]) 

# This is the AUC
    the_auc = auc(fpr, tpr)
# This is the ROC curve

    plot_label = "`" + names[result] + "` (area = "+str(round(the_auc, 4))+")."
    
    axes[0].plot(fpr,tpr, label=plot_label)

axes[0].plot([0, 1], [0, 1], 'k--') # diagonal    

axes[0].set_title('ROC Curve \n'+
                  'Best accuracy StopWord paramiter from each model. \n' +
                 'Quantity (Train: ' + str(X_train.count()) + '; Test:' + str(X_test.count())+') ')
axes[0].set_xlabel('False positive rate')
axes[0].set_ylabel('True positive rate')
axes[0].legend(loc='best')
axes[0].xaxis.grid(True)

spacing = 0.1 # This can be your user specified spacing. 
minorLocator = MultipleLocator(spacing)

axes[0].yaxis.set_minor_locator(minorLocator)
axes[0].xaxis.set_minor_locator(minorLocator)
axes[0].grid(which = 'minor')


# box plot

bplot2 = axes[1].boxplot(results,
                         vert=True,  # vertical box alignment
                         patch_artist=True,  # fill with color
                         labels=names
                         )  # will be used to label x-ticks
axes[1].set_title('Model Accuracy \n'+ 
                  'Performed with ' + str(cross_validation_iterations) +' cross validation iterations. \n' + 
                  'Quantity (Train: ' + str(X_train.count()) + '; Test:' + str(X_test.count())+') ')

axes[1].set_xlabel('Models used')
axes[1].set_ylabel('Accuracy Recorded (range(0 - 1) = 0 - 100%)')

# fill with colors
colors = ['#CBD9D6', '#8DA593', '#D9CBA3', '#F2A88C', '#D98282']
for patch, color in zip(bplot2['boxes'], colors):
    patch.set_facecolor(color)

# adding horizontal grid lines
for ax in axes:
    ax.yaxis.grid(True)

    
    
plt.xticks(rotation=90)
plt.show()