In [1]:
import os
import re
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [2]:
pos_dst = './Data/train/pos/'
neg_dst = './Data/train/neg/'

# No of positive samples
no_ps = len(os.listdir(pos_dst))

# No of negative samples
no_ns = len(os.listdir(neg_dst))

print("No of positive samples:", no_ps)
print("No of negative samples:", no_ns)

No of positive samples: 12500
No of negative samples: 12500


In [3]:
train_dict = {"pos":[],"neg":[]}

list_pos = os.listdir(pos_dst)
list_neg = os.listdir(neg_dst)
i=0
for pos,neg in zip(list_pos,list_neg):
    
    pos_f = open(os.path.join(pos_dst,pos),"r")
    neg_f = open(os.path.join(neg_dst,neg),"r")
    
    # Read the text
    pos_txt = pos_f.read()
    neg_txt = neg_f.read()
    
    pos_f.close()
    neg_f.close()
    
    train_dict["pos"].append(pos_txt)
    train_dict["neg"].append(neg_txt)

## Function to Process Text

In [9]:
def process(txt):
    
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # Removing hyperlinks
    txt = re.sub(r'https?:\/\/.*[\r\n]*', '', txt)
    
    # Only removing the hash # sign from the word
    txt = re.sub(r'#','', txt)
    
    # Removing the square brackets
    txt = re.sub('\[[^]]*\]','', txt)
    
    # Removing special characters and numbers
    pattern = r'[^a-zA-z\s]'
    txt = re.sub(pattern, '', txt)
    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    txt_tokens = tokenizer.tokenize(txt)
    
    clean_txt = []
    
    for word in txt_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            
            stem_word = stemmer.stem(word)  # stemming word
            clean_txt.append(stem_word)
    
    
    
    return clean_txt

In [53]:
# Sample Positive Review
print("Positive sample:\n")
print(train[0],"\n")

# Processed positive sample 
print("Text After processing: \n")
print(process(train[0]),"\n")

# Sample Negative Review
print("Negative sample:\n")
print(train[12550],"\n")

# Processed Negative sample 
print("Text After processing: \n")
print(process(train[0]),"\n")

Positive sample:

You know the story - a group of plucky no-hopers enter a competition they seemingly have no chance of winning - it's a tale that has been done to death by Hollywood (Bring It On, The Karate Kid, Escape to Victory, Best of the Best etc). Now Korea gives it a go with a Taekwondo team struggling for glory  and guess what  the result is predictable but ultimately satisfying.<br /><br />The fact that this movie doesn't fall flat on its face is down to the talented young cast who really make you care about the characters, and this in turn keeps you watching to the end.<br /><br />Fans of your typical martial arts movie may be disappointed  Taekwondo does not deliver the usual flurry of moves and acrobatics seen in most Kung Fu films; the action is limited to (albeit impressive) kicking and the occasional punch. This doesn't matter though, since it is the interaction of the characters and their fight to make something of themselves which makes this movie a success. 

Text

##  Function to Build Frequencies

In [34]:
def build_freqs(tweets, ys):
    
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

In [56]:
# Training Set 
train_x = train_dict["pos"]+train_dict["neg"]

# Training Labels
train_y = np.append(np.ones((len(train_dict["pos"]),1)),
                    np.zeros((len(train_dict["neg"]), 1)),
                    axis=0)



# Print the no of samples
print("No of Training Samples:",len(train_x))

# Print the shape of train_y
print("train_y.shape = " + str(train_y.shape))

No of Training Samples: 25000
train_y.shape = (25000, 1)


In [35]:
# Create Frequency Dictionary
freqs = build_freqs(train_set, train_y)

In [72]:
print("Word: \"Bad\"")
print("No times used in the Positive sense:",freqs[('bad',1.0)])
print("No times used in the Negative sense:",freqs[('bad',0.0)],"\n")

print("Word: \"Amazing\"")
print("No times used in the Positive sense:",freqs[(process("Amazing")[0],1.0)])
print("No times used in the Negative sense:",freqs[(process("Amazing")[0],0.0)],"\n")

Word: "Bad"
No times used in the Positive sense: 1836
No times used in the Negative sense: 7120 

Word: "Amazing"
No times used in the Positive sense: 1153
No times used in the Negative sense: 364 

