In [1]:
import numpy as np
import pandas as pd
import os
import csv
import re
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import logging
from numpy import random
import nltk
import pkg_resources
import types

In [2]:
neg_direct = '../Data/review_polarity/txt_sentoken/neg/'
pos_direct = '../Data/review_polarity/txt_sentoken/pos/'

In [3]:
#Reading through the direc
def get_filename(path):
    filenames = []
    files = [i.path for i in os.scandir(path) if i.is_file()]

    for filename in files:
        filename = os.path.basename(filename)
        filenames.append(filename)
    return filenames

neg_files = get_filename(neg_direct)
pos_files = get_filename(pos_direct)

In [4]:
with open('neg.csv', 'w',  encoding = 'utf8', newline = '') as csv_file:
    for _file in neg_files:

        file_name = _file
        with open(neg_direct +'/'+ _file,'r') as f:
            text = f.read()

            writer = csv.writer(csv_file)
            writer.writerow([file_name, text])
with open('pos.csv', 'w',  encoding = 'utf8', newline = '') as csv_file:
    for _file in pos_files:

        file_name = _file
        with open(pos_direct +'/'+ _file,'r') as f:
            text = f.read()

            writer = csv.writer(csv_file)
            writer.writerow([file_name, text])
df_neg = pd.read_csv('neg.csv', header = None)
df_pos = pd.read_csv('pos.csv', header=None)

In [5]:
df_neg['Label'] = ['Negative']*len(df_neg)
df_pos['Label'] = ['Positive']*len(df_pos)

In [6]:
df_pos = df_pos.rename(columns = {0:'ID', 1:'Text'})
df_neg = df_neg.rename(columns = {0:'ID', 1:'Text'})
df_pos.head()

Unnamed: 0,ID,Text,Label
0,cv839_21467.txt,assume nothing . \nthe phrase is perhaps one o...,Positive
1,cv034_29647.txt,plot : derek zoolander is a male model . \nhe ...,Positive
2,cv908_16009.txt,i actually am a fan of the original 1961 or so...,Positive
3,cv748_12786.txt,a movie that's been as highly built up as the ...,Positive
4,cv253_10077.txt,""" good will hunting "" is two movies in one : ...",Positive


In [7]:
sources = [df_neg, df_pos]
all_data = pd.concat(sources)
all_data.head()

Unnamed: 0,ID,Text,Label
0,cv676_22202.txt,bad . bad . \nbad . \nthat one word seems to p...,Negative
1,cv839_22807.txt,isn't it the ultimate sign of a movie's cinema...,Negative
2,cv155_7845.txt,""" gordy "" is not a movie , it is a 90-minute-...",Negative
3,cv465_23401.txt,disconnect the phone line . \ndon't accept the...,Negative
4,cv398_17047.txt,when robert forster found himself famous again...,Negative


In [8]:
all_data = all_data.sample(frac=1).reset_index(drop=True)
all_data.head()

Unnamed: 0,ID,Text,Label
0,cv842_5702.txt,"according to the publicity material , with thi...",Negative
1,cv793_15235.txt,this is the worst movie i've viewed so far in ...,Negative
2,cv501_12675.txt,"synopsis : easily-angered , chainsmoking archi...",Negative
3,cv249_12674.txt,synopsis : blond criminal psychologist sarah c...,Negative
4,cv804_10862.txt,satirical films usually fall into one of two c...,Positive


In [None]:
def text_cleaning(text):
    # Clean the text data

    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
    text = text.lower()  # set in lowercase 
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    text = nltk.pos_tag(text)

    # Return a list of words
    return(text)
all_data['Clean_text'] = all_data['Text'].apply(text_cleaning)
all_data.head()

In [None]:
#Feature Extraction
cv = CountVectorizer()
tfidf_vect = TfidfVectorizer()
X = all_data['Clean_text']
y = all_data['Label']
cv.fit(X)
tfidf_vect.fit(X)
X_cv = cv.transform(X)
X_tfidf = tfidf_vect.transform(X)

In [None]:
X_train_cv, X_test_cv, y_train, y_test = train_test_split(X_cv, y, test_size=0.3, random_state=42)
X_train_tf, X_test_tf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

In [None]:
base_model = MultinomialNB()
base_model.fit(X_train_cv, y_train)
cv_predictions = base_model.predict(X_test_cv)
base_model_tf = MultinomialNB()
base_model_tf.fit(X_train_tf, y_train)
tfidf_predictions = base_model_tf.predict(X_test_tf)

In [None]:
cv_acc = accuracy_score(cv_predictions, y_test)
tfidf_acc = accuracy_score(tfidf_predictions, y_test)
print("Model Accuracy using Count Vectorizer is {}".format(cv_acc) + \
      ": " +"Model accuracy using TFIDF is {}".format(tfidf_acc) )

In [None]:
print(classification_report(cv_predictions, y_test))

In [None]:
base_model1 = SVC()
base_model1.fit(X_train_cv, y_train)
cv_predictions1 = base_model1.predict(X_test_cv)
base_model1_tf = SVC()
base_model1_tf.fit(X_train_tf, y_train)
tfidf_predictions1 = base_model1_tf.predict(X_test_tf)

In [None]:
cv_acc1 = accuracy_score(cv_predictions, y_test)
tfidf_acc1 = accuracy_score(tfidf_predictions, y_test)
print("Model Accuracy using Count Vectorizer is {}".format(cv_acc1) + \
      ": " +"Model accuracy using TFIDF is {}".format(tfidf_acc1) )

In [None]:
print(classification_report(cv_predictions1, y_test))

In [None]:
base_model1 = DecisionTreeClassifier()
base_model1.fit(X_train_cv, y_train)
cv_predictions1 = base_model1.predict(X_test_cv)
base_model1_tf = DecisionTreeClassifier()
base_model1_tf.fit(X_train_tf, y_train)
tfidf_predictions1 = base_model1_tf.predict(X_test_tf)

In [None]:
cv_acc1 = accuracy_score(cv_predictions, y_test)
tfidf_acc1 = accuracy_score(tfidf_predictions, y_test)
print("Model Accuracy using Count Vectorizer is {}".format(cv_acc1) + \
      ": " +"Model accuracy using TFIDF is {}".format(tfidf_acc1) )

In [None]:
print(classification_report(cv_predictions1, y_test))