In [1]:
import numpy as np
import pandas as pd
import os
import csv
import re
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import logging
from numpy import random
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
import pkg_resources
import types
def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        # Some packages are weird and have different
        # imported names vs. system names
        if name == "PIL":
            name = "Pillow"
        elif name == "sklearn":
            name = "scikit-learn"

        yield name
imports = list(set(get_imports()))

requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))

for r in requirements:
    print("{}=={}".format(*r))

numpy==1.20.2
scikit-learn==0.23.2
pandas==1.1.3
nltk==3.5


In [2]:
neg_direct = '../Data/review_polarity/txt_sentoken/neg/'
pos_direct = '../Data/review_polarity/txt_sentoken/pos/'

In [3]:
#Reading through the direc
def get_filename(path):
    filenames = []
    files = [i.path for i in os.scandir(path) if i.is_file()]

    for filename in files:
        filename = os.path.basename(filename)
        filenames.append(filename)
    return filenames

neg_files = get_filename(neg_direct)
pos_files = get_filename(pos_direct)

In [4]:
with open('neg.csv', 'w',  encoding = 'utf8', newline = '') as csv_file:
    for _file in neg_files:

        file_name = _file
        with open(neg_direct +'/'+ _file,'r') as f:
            text = f.read()

            writer = csv.writer(csv_file)
            writer.writerow([file_name, text])
with open('pos.csv', 'w',  encoding = 'utf8', newline = '') as csv_file:
    for _file in pos_files:

        file_name = _file
        with open(pos_direct +'/'+ _file,'r') as f:
            text = f.read()

            writer = csv.writer(csv_file)
            writer.writerow([file_name, text])
df_neg = pd.read_csv('neg.csv', header = None)
df_pos = pd.read_csv('pos.csv', header=None)

In [5]:
df_neg['Label'] = ['Negative']*len(df_neg)
df_pos['Label'] = ['Positive']*len(df_pos)

In [6]:
df_pos = df_pos.rename(columns = {0:'ID', 1:'Text'})
df_neg = df_neg.rename(columns = {0:'ID', 1:'Text'})
df_pos.head()

Unnamed: 0,ID,Text,Label
0,cv839_21467.txt,assume nothing . \nthe phrase is perhaps one o...,Positive
1,cv034_29647.txt,plot : derek zoolander is a male model . \nhe ...,Positive
2,cv908_16009.txt,i actually am a fan of the original 1961 or so...,Positive
3,cv748_12786.txt,a movie that's been as highly built up as the ...,Positive
4,cv253_10077.txt,""" good will hunting "" is two movies in one : ...",Positive


In [7]:
sources = [df_neg, df_pos]
all_data = pd.concat(sources)
all_data.head()

Unnamed: 0,ID,Text,Label
0,cv676_22202.txt,bad . bad . \nbad . \nthat one word seems to p...,Negative
1,cv839_22807.txt,isn't it the ultimate sign of a movie's cinema...,Negative
2,cv155_7845.txt,""" gordy "" is not a movie , it is a 90-minute-...",Negative
3,cv465_23401.txt,disconnect the phone line . \ndon't accept the...,Negative
4,cv398_17047.txt,when robert forster found himself famous again...,Negative


In [8]:
all_data = all_data.sample(frac=1).reset_index(drop=True)
all_data.head()

Unnamed: 0,ID,Text,Label
0,cv762_15604.txt,if you're going to make a two-hour hollywood i...,Negative
1,cv817_3675.txt,we're back in blade runner territory with this...,Negative
2,cv445_26683.txt,"phaedra cinema , the distributor of such never...",Negative
3,cv299_16214.txt,expectation rating : a bit worse than expected...,Positive
4,cv558_29507.txt,""" the endurance : shackleton's legendary anta...",Positive


In [9]:
def text_cleaning(text):
    # Clean the text data

    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
    text = text.lower()  # set in lowercase 
    
    text = ''.join([c for c in text if c not in STOPWORDS])
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
        
    # Return a list of words
    return(text)
all_data['Clean_text'] = all_data['Text'].apply(text_cleaning)
all_data.head()

Unnamed: 0,ID,Text,Label,Clean_text
0,cv762_15604.txt,if you're going to make a two-hour hollywood i...,Negative,f u re gng ke w hur hllw n jke wh bher rel...
1,cv817_3675.txt,we're back in blade runner territory with this...,Negative,we re bck n ble runner errr wh h ne cncepul ...
2,cv445_26683.txt,"phaedra cinema , the distributor of such never...",Negative,pher cne he rbur f uch never her f clc f ...
3,cv299_16214.txt,expectation rating : a bit worse than expected...,Positive,expecn rng b wre hn expece nl becue fun ...
4,cv558_29507.txt,""" the endurance : shackleton's legendary anta...",Positive,he enurnce hcklen legenr nrcc expen n...


In [10]:
#Feature Extraction
cv = CountVectorizer()
tfidf_vect = TfidfVectorizer()
X = all_data['Clean_text']
y = all_data['Label']
cv.fit(X)
tfidf_vect.fit(X)
X_cv = cv.transform(X)
X_tfidf = tfidf_vect.transform(X)

In [11]:
X_train_cv, X_test_cv, y_train, y_test = train_test_split(X_cv, y, test_size=0.3, random_state=42)
X_train_tf, X_test_tf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

In [12]:
base_model = MultinomialNB()
base_model.fit(X_train_cv, y_train)
cv_predictions = base_model.predict(X_test_cv)
base_model_tf = MultinomialNB()
base_model_tf.fit(X_train_tf, y_train)
tfidf_predictions = base_model_tf.predict(X_test_tf)

In [13]:
cv_acc = accuracy_score(cv_predictions, y_test)
tfidf_acc = accuracy_score(tfidf_predictions, y_test)
print("Model Accuracy using Count Vectorizer is {}".format(cv_acc) + \
      ": " +"Model accuracy using TFIDF is {}".format(tfidf_acc) )

Model Accuracy using Count Vectorizer is 0.8283333333333334: Model accuracy using TFIDF is 0.675


In [14]:
print(classification_report(cv_predictions, y_test))

              precision    recall  f1-score   support

    Negative       0.86      0.79      0.83       308
    Positive       0.80      0.87      0.83       292

    accuracy                           0.83       600
   macro avg       0.83      0.83      0.83       600
weighted avg       0.83      0.83      0.83       600



In [15]:
base_model1 = SVC()
base_model1.fit(X_train_cv, y_train)
cv_predictions1 = base_model1.predict(X_test_cv)
base_model1_tf = SVC()
base_model1_tf.fit(X_train_tf, y_train)
tfidf_predictions1 = base_model1_tf.predict(X_test_tf)

In [16]:
cv_acc1 = accuracy_score(cv_predictions, y_test)
tfidf_acc1 = accuracy_score(tfidf_predictions, y_test)
print("Model Accuracy using Count Vectorizer is {}".format(cv_acc1) + \
      ": " +"Model accuracy using TFIDF is {}".format(tfidf_acc1) )

Model Accuracy using Count Vectorizer is 0.8283333333333334: Model accuracy using TFIDF is 0.675


In [17]:
print(classification_report(cv_predictions1, y_test))

              precision    recall  f1-score   support

    Negative       0.83      0.67      0.74       350
    Positive       0.63      0.81      0.71       250

    accuracy                           0.73       600
   macro avg       0.73      0.74      0.72       600
weighted avg       0.75      0.72      0.73       600



In [18]:
base_model1 = DecisionTreeClassifier()
base_model1.fit(X_train_cv, y_train)
cv_predictions1 = base_model1.predict(X_test_cv)
base_model1_tf = DecisionTreeClassifier()
base_model1_tf.fit(X_train_tf, y_train)
tfidf_predictions1 = base_model1_tf.predict(X_test_tf)

In [19]:
cv_acc1 = accuracy_score(cv_predictions, y_test)
tfidf_acc1 = accuracy_score(tfidf_predictions, y_test)
print("Model Accuracy using Count Vectorizer is {}".format(cv_acc1) + \
      ": " +"Model accuracy using TFIDF is {}".format(tfidf_acc1) )

Model Accuracy using Count Vectorizer is 0.8283333333333334: Model accuracy using TFIDF is 0.675


In [20]:
print(classification_report(cv_predictions1, y_test))

              precision    recall  f1-score   support

    Negative       0.61      0.56      0.59       303
    Positive       0.59      0.63      0.61       297

    accuracy                           0.60       600
   macro avg       0.60      0.60      0.60       600
weighted avg       0.60      0.60      0.60       600

