In [3]:
import os
import re
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
# Directories that hold one review per file, split by sentiment label.
# NOTE(review): these point at the 'test' split although later cells name the
# data train_* -- confirm this is the intended split.
pos_dst = './Data/test/pos/'
neg_dst = './Data/test/neg/'

# The sample count of each class is the number of entries in its directory
no_ps, no_ns = (len(os.listdir(folder)) for folder in (pos_dst, neg_dst))

print("No of positive samples:", no_ps)
print("No of negative samples:", no_ns)

No of positive samples: 12500
No of negative samples: 12500


In [5]:
# Raw review texts grouped by sentiment label
train_dict = {"pos": [], "neg": []}

# os.listdir returns entries in arbitrary order; sorting keeps the sample
# order (and therefore every downstream index and split) reproducible.
list_pos = sorted(os.listdir(pos_dst))
list_neg = sorted(os.listdir(neg_dst))

# Each class is read independently so that an unequal number of positive and
# negative files does not silently drop the surplus reviews (zip would
# truncate to the shorter listing).
for label, folder, filenames in (("pos", pos_dst, list_pos),
                                 ("neg", neg_dst, list_neg)):
    for filename in filenames:
        # The context manager closes the handle even if read() raises; the
        # explicit encoding avoids depending on the platform default.
        # NOTE(review): assumes the review files are UTF-8 -- confirm.
        with open(os.path.join(folder, filename), "r", encoding="utf-8") as review_file:
            train_dict[label].append(review_file.read())

## Function to Process Text

In [6]:
def process(txt):
    '''
    Clean one raw review and turn it into a list of stemmed tokens.
    Input:
        txt: raw review text (string)
    Output:
        clean_txt: list of lower-cased, stemmed words with hyperlinks,
                   line-break tags, digits, punctuation and English
                   stopwords removed
    '''
    stemmer = PorterStemmer()
    # A set gives constant-time stopword checks instead of scanning a list
    # once per token.
    stopwords_english = set(stopwords.words('english'))

    # Removing hyperlinks
    txt = re.sub(r'http\S+', '', txt)

    # Removing Line breaks
    txt = re.sub(r'<br />', ' ', txt)

    # Only removing the hash # sign from the word
    txt = re.sub(r'#', '', txt)

    # Splitting text.text types: every dot becomes a space. This yields the
    # same text as repeatedly searching and substituting one match at a time,
    # without rescanning the string for each dot.
    txt = txt.replace('.', ' ')

    # Removing special characters and numbers.
    # The range must be A-Z: the range A-z would also admit the characters
    # [ \ ] ^ _ and the backtick, which sit between 'Z' and 'a' in ASCII.
    txt = re.sub(r'[^a-zA-Z\s]', ' ', txt)

    clean_txt = []
    for word in nltk.word_tokenize(txt):
        # Only letters and whitespace are left at this point, so tokens are
        # purely alphabetic; the emptiness check is a defensive guard.
        word = word.lower().strip()
        if not word or word in stopwords_english:  # remove stopwords
            continue
        clean_txt.append(stemmer.stem(word))  # stemming word

    return clean_txt

In [9]:
# Show the first raw review of each class followed by its processed tokens
for heading, label in (("Positive sample:\n", "pos"),
                       ("Negative sample:\n", "neg")):
    first_review = train_dict[label][0]

    # Raw review text
    print(heading)
    print(first_review, "\n")

    # Same review after cleaning, stopword removal and stemming
    print("Text After processing: \n")
    print(process(first_review), "\n")

Positive sample:

I loved this movie. It's a lot of laughs. The acting is good and the writing is really sharp. I'd rather see a hundred movies like this than THREE LORD OF THE RINGS repeating and repeating themselves.<br /><br />It's a low budget affair and seems to be shot on DV but looks good and Jay Mohr and Julianne Nicholson are great together. Why do you have a ten line minimum? I'm not a critic, just a patron.<br /><br />I doubt very much that Quentin Tarantino could write a picture this funny without filling it with masturbatory gratuitous violence. This movie should be seen on more screens than just one. I laughed from beginning to end. > 

Text After processing: 

['love', 'movi', 'lot', 'laugh', 'act', 'good', 'write', 'realli', 'sharp', 'rather', 'see', 'hundr', 'movi', 'like', 'three', 'lord', 'ring', 'repeat', 'repeat', 'low', 'budget', 'affair', 'seem', 'shot', 'dv', 'look', 'good', 'jay', 'mohr', 'juliann', 'nicholson', 'great', 'togeth', 'ten', 'line', 'minimum', 'cri

##  Function to Build Frequencies

In [10]:
def build_freqs(tweets, ys):
    '''
    Count how often each processed word occurs under each label.
    Input:
        tweets: iterable of raw texts
        ys: labels aligned with tweets; a list or a numpy array of shape
            (m,) or (m, 1)
    Output:
        freqs: dictionary mapping (word, label) -> number of occurrences
    '''
    # Flatten the labels into a plain 1-D list. ravel() is used instead of
    # squeeze() because squeeze() turns a single-sample array into a 0-d
    # array whose tolist() is a bare scalar, which zip cannot iterate.
    yslist = np.asarray(ys).ravel().tolist()

    # Start with an empty dictionary and populate it by looping over all
    # texts and over all processed words in each text.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [11]:
# Samples: every positive review followed by every negative review
train_x = train_dict["pos"] + train_dict["neg"]

# Labels as a column vector in the same order: ones for the positive
# reviews, then zeros for the negative reviews
n_pos, n_neg = len(train_dict["pos"]), len(train_dict["neg"])
train_y = np.append(np.ones((n_pos, 1)), np.zeros((n_neg, 1)), axis=0)

# Report the sample count and the label array shape
print("No of Training Samples:", len(train_x))
print(f"train_y.shape = {train_y.shape}")

No of Training Samples: 25000
train_y.shape = (25000, 1)


In [12]:
# Count (word, label) occurrences over the whole labelled sample set
freqs = build_freqs(tweets=train_x, ys=train_y)

In [13]:
import pickle

# Saving the Frequency Dictionary
# The context manager closes the file even if pickling raises.
with open('FreqDict', 'wb') as file:
    pickle.dump(freqs, file)

In [15]:
# Loading the Saved Frequency Dictionary
# NOTE: pickle.load can execute arbitrary code, so only load a FreqDict file
# that was written by this notebook.
# The context manager makes sure the handle is closed after loading.
with open("FreqDict", "rb") as file_to_read:
    sfreqs = pickle.load(file_to_read)

In [16]:
# Show how often two example words occur under each label.
# Both words go through process() so the lookup key is the same stem that
# build_freqs stored, and .get(..., 0) reports zero for an unseen
# (word, label) pair instead of raising KeyError.
for display_word in ("Bad", "Amazing"):
    stems = process(display_word)
    # Fall back to the lower-cased word if processing removes it entirely
    stem = stems[0] if stems else display_word.lower()

    print("Word: \"" + display_word + "\"")
    print("No times used in the Positive sense:", sfreqs.get((stem, 1.0), 0))
    print("No times used in the Negative sense:", sfreqs.get((stem, 0.0), 0), "\n")

Word: "Bad"
No times used in the Positive sense: 1840
No times used in the Negative sense: 7353 

Word: "Amazing"
No times used in the Positive sense: 1077
No times used in the Negative sense: 366 



In [17]:
def extract_features(tweet, freqs):
    '''
    Build the feature row for a single text.
    Input:
        tweet: raw text of one sample; it is cleaned and tokenized here
               through process()
        freqs: a dictionary mapping each tuple (word, label) to its frequency
    Output:
        x: a feature vector of dimension (1,3) holding
           [bias, summed positive frequencies, summed negative frequencies]
    '''
    # Tokenize, stem and drop stopwords
    tokens = process(tweet)

    # Total frequency of the tokens under the positive label (1.0) and under
    # the negative label (0.0); pairs missing from freqs contribute 0
    positive_total = sum(freqs.get((token, 1.0), 0) for token in tokens)
    negative_total = sum(freqs.get((token, 0.0), 0) for token in tokens)

    # Assemble the 1 x 3 row; the first column is the bias term
    x = np.zeros((1, 3))
    x[0, 0] = 1
    x[0, 1] = positive_total
    x[0, 2] = negative_total

    assert x.shape == (1, 3)
    return x

In [18]:
# Build the (samples x 3) feature matrix, one row per sample
X = np.zeros((len(train_x), 3))
for row, sample in enumerate(train_x):
    X[row, :] = extract_features(sample, sfreqs)

In [4]:
# Cache of the extracted features and labels on disk
features_path = 'TFeatures.npy'
labels_path = 'Tlabels.npy'

# Write the cache only when it is missing, so a fresh top-to-bottom run works
# without pre-existing .npy files while later runs can still start from the
# saved arrays.
if not (os.path.exists(features_path) and os.path.exists(labels_path)):
    # Saving the Features
    np.save(features_path, X)
    # Saving the Labels
    np.save(labels_path, train_y)

# Loading the Saved Features; the labels are flattened to 1-D
Features = np.load(features_path)
labels = np.load(labels_path).ravel()

In [15]:
# Hold out a quarter of the samples for evaluation; the fixed random_state
# keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    Features,
    labels,
    test_size=0.25,
    random_state=0,
)

In [16]:
# Logistic-regression classifier; max_iter=500 caps the solver's iterations
LR = LogisticRegression(max_iter=500)

In [17]:
# Fit the classifier on the training split only
LR.fit(x_train, y_train)

LogisticRegression(max_iter=500)

In [18]:
# Score the fitted model on both splits.
# With average='micro' and one label per sample, the F1 score coincides with
# plain accuracy.
train_predictions = LR.predict(x_train)
test_predictions = LR.predict(x_test)

print("Training F1 Score:", f1_score(y_train, train_predictions, average='micro'))
print("Testing F1 Score:", f1_score(y_test, test_predictions, average='micro'))

Training F1 Score: 0.7032
Testing F1 Score: 0.69792
