In [87]:
import numpy as np
import string, re
from time import time
from pprint import pprint

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\awant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
def preprocess(text):
    documents = []
    for sen in text:
        document = re.compile(r'<br />').sub(' ', str(sen))
        document = re.sub(r'\W', ' ', document)
        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)
        document = document.translate(str.maketrans('', '', string.digits))
        # Converting to Lowercase
        document = document.lower()
        document = document.split()
        #document = [word for word in document if word not in swords]
        document = ' '.join(document)
        documents.append(document)
    return documents

def classification(data, text_label, mode):
    x_train, x_test, y_train, y_test = train_test_split(data, text_label, test_size=0.2)
    stop_list = set(stopwords.words('english'))
    c_params = [0.5, 0.7, 0.9, 1]
    for c in c_params:
        print("-----------------------------------------------------------------------")       
        print("SVM for CountVectoriser")
        if mode == 'uni':
            clf = make_pipeline(CountVectorizer(stop_words=stop_list), SVC(kernel='linear', C=c))
        elif mode == 'bi':
            clf = make_pipeline(CountVectorizer(ngram_range=(2, 2),stop_words=stop_list), SVC(kernel='linear', C=c))
        t0 = time()      
        clf.fit(x_train, y_train)
        print("done in %0.3fs" % (time() - t0))
        y_pred = clf.predict(x_test)
        print(f"C={c}, mode = {mode}")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        print(accuracy_score(y_test, y_pred))

        print("-----------------------------------------------------------------------")
        print("SVM for TfIdfVectoriser")                  
        if mode == 'uni':
            clf = make_pipeline(TfidfVectorizer(stop_words=stop_list), SVC(kernel='linear', C=c))
        elif mode == 'bi':
            clf = make_pipeline(TfidfVectorizer(ngram_range=(2, 2),stop_words=stop_list), SVC(kernel='linear', C=c))
        t0 = time()                        
        clf.fit(x_train, y_train)
        print("done in %0.3fs" % (time() - t0))
        y_pred = clf.predict(x_test)
        print(f"C={c}, mode = {mode}")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        print(accuracy_score(y_test, y_pred))
        print("-----------------------------------------------------------------------")
    

In [82]:
movie_data = load_files(r"./movie_review_data/")

In [89]:
X, y = movie_data.data, movie_data.target

In [90]:
#try on 1/2 data
X = X[0:int(len(X)/2)]
y = y[0:int(len(y)/2)]

X = preprocess(X)

In [91]:
classification(X,y,'uni')
classification(X,y,'bi')

-----------------------------------------------------------------------
SVM for CountVectoriser
done in 94.846s
C=0.5, mode = uni
[[1050  206]
 [ 176 1068]]
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1256
           1       0.84      0.86      0.85      1244

    accuracy                           0.85      2500
   macro avg       0.85      0.85      0.85      2500
weighted avg       0.85      0.85      0.85      2500

0.8472
-----------------------------------------------------------------------
SVM for TfIdfVectoriser
done in 140.715s
C=0.5, mode = uni
[[1097  159]
 [ 109 1135]]
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1256
           1       0.88      0.91      0.89      1244

    accuracy                           0.89      2500
   macro avg       0.89      0.89      0.89      2500
weighted avg       0.89      0.89      0.89      2500

0.8928
-----------------