In [3]:
import nltk
import string

from nltk import word_tokenize
from nltk import pos_tag

from nltk.corpus import stopwords
# Make sure the stopword corpus is available locally before it is read
nltk.download('stopwords')

# English stopwords as a plain list (used to filter tokens in preprocess_text)
nltk_stopwords = [word for word in stopwords.words('english')]

# Translation table that deletes every character in string.punctuation
punctuation_translator = str.maketrans('', '', string.punctuation)


def remove_punctuation(s):
    """Return a copy of ``s`` with all ``string.punctuation`` characters deleted.

    The deletion is done with the module-level ``punctuation_translator`` table.
    """
    stripped = s.translate(punctuation_translator)
    return stripped



def preprocess_text(s, tokenizer=None, remove_stopwords=True, remove_punctuation=True,
                    stemmer=None, lemmatizer=None, lowercase=True, return_type='str',
                    custom_stopwords=None):
    """Tokenize and normalize a piece of text.

    The steps always run in this order: tokenize, stem *or* lemmatize,
    lowercase, remove stopwords, remove punctuation.

    Args:
        s: The text to process.
        tokenizer: Object exposing ``tokenize(text)``. When ``None``,
            ``word_tokenize`` is used.
        remove_stopwords: Drop tokens that are stopwords. Matching is
            case-insensitive, so it also works with ``lowercase=False``.
        remove_punctuation: Strip every ``string.punctuation`` character from
            each token (e.g. "Mr." becomes "Mr") and drop tokens that end up
            empty. Note that this parameter shadows the module-level function
            of the same name inside this function.
        stemmer: Optional object passed to ``stem_token_list``.
        lemmatizer: Optional object passed to ``lemmatize_token_list``.
        lowercase: Convert all tokens to lowercase.
        return_type: ``'list'`` returns the token list, ``'set'`` returns a
            set of tokens, anything else returns the tokens joined by spaces.
        custom_stopwords: Optional iterable of stopwords to use instead of
            the module-level ``nltk_stopwords``.

    Returns:
        A list, set or space-joined string of tokens, depending on ``return_type``.

    Raises:
        ValueError: If both ``stemmer`` and ``lemmatizer`` are given.
    """
    # Stemming and lemmatizing are mutually exclusive
    if stemmer is not None and lemmatizer is not None:
        raise ValueError("Stemmer and Lemmatizer cannot both be not None!")

    # Tokenization either with default tokenizer or user-specified tokenizer.
    # Copy into a fresh list: the stem/lemmatize helpers modify the list in
    # place and a custom tokenizer may return a non-list or shared sequence.
    if tokenizer is None:
        token_list = list(word_tokenize(s))
    else:
        token_list = list(tokenizer.tokenize(s))

    # Stem or lemmatize if needed
    if lemmatizer is not None:
        token_list = lemmatize_token_list(lemmatizer, token_list)
    elif stemmer is not None:
        token_list = stem_token_list(stemmer, token_list)

    # Convert all tokens to lowercase if needed
    if lowercase:
        token_list = [token.lower() for token in token_list]

    # Remove all stopwords if needed. A set gives constant-time lookups, and
    # comparing the lowercased token keeps the filter effective when
    # lowercase=False (e.g. a sentence-initial "The").
    if remove_stopwords:
        stopword_source = nltk_stopwords if custom_stopwords is None else custom_stopwords
        stopword_set = {word.lower() for word in stopword_source}
        token_list = [token for token in token_list if token.lower() not in stopword_set]

    # Remove all punctuation marks if needed (note: also converts, e.g., "Mr." to "Mr")
    if remove_punctuation:
        punctuation_chars = set(string.punctuation)
        stripped_tokens = (
            ''.join(ch for ch in token if ch not in punctuation_chars)
            for token in token_list
        )
        # Tokens that consisted only of punctuation are now empty: drop them
        token_list = [token for token in stripped_tokens if len(token) > 0]

    if return_type == 'list':
        return token_list
    elif return_type == 'set':
        return set(token_list)
    else:
        return ' '.join(token_list)



def stem_token_list(stemmer, token_list):
    """Replace every token in ``token_list`` by ``stemmer.stem(token)``.

    The list is modified in place and the same list object is returned.
    """
    for position in range(len(token_list)):
        token_list[position] = stemmer.stem(token_list[position])
    return token_list


def lemmatize_token_list(lemmatizer, token_list):
    """Lemmatize every token in ``token_list`` using its part-of-speech tag.

    The tokens are tagged with ``pos_tag`` and each tag is reduced to the
    single-letter word type passed as ``pos`` to ``lemmatizer.lemmatize``:
    noun tags -> 'n', verb tags -> 'v', adjective tags ('J...') -> 'a' and
    adverb tags ('RB...') -> 'r'. Every other tag falls back to 'n'.

    The list is modified in place and the same list object is returned.

    NOTE(review): assumes ``pos_tag`` yields Penn-Treebank-style tags and that
    the lemmatizer accepts 'r' (as NLTK's WordNetLemmatizer does) -- confirm
    if a different tagger or lemmatizer is plugged in.
    """
    # First letter of the tag (lowercased) -> word type, e.g. "VBD" -> "v"
    word_type_by_initial = {'n': 'n', 'v': 'v', 'j': 'a'}

    for idx, (token, tag) in enumerate(pos_tag(token_list)):
        if tag.upper().startswith('RB'):
            # Adverbs get their own word type instead of being treated as
            # nouns, which would wrongly strip a trailing "s" (e.g. "as")
            word_type = 'r'
        else:
            # tag[:1] (rather than tag[0]) tolerates an empty tag
            word_type = word_type_by_initial.get(tag[:1].lower(), 'n')
        token_list[idx] = lemmatizer.lemmatize(token, pos=word_type)
    return token_list



#
# Everything below is executed only when the file is run directly,
# not when it is imported. This is useful for testing the functions.
# NOTE(review): no such test code actually follows in this cell.
#

   

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HARSHITA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import numpy as np
import pandas as pd
import random

from nltk import bigrams
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [6]:
# Read both polarity files with pandas (tab-separated, no header row)
df_sent_pos = pd.read_csv('rt-polarity.pos', sep='\t', header=None)
df_sent_neg = pd.read_csv('rt-polarity.neg', sep='\t', header=None)

# Collect the raw sentences of both files: negatives first, then positives
sentences = df_sent_neg[0].tolist() + df_sent_pos[0].tolist()

# Preprocess each sentence (with the defaults: lowercase all letters and
# remove stopwords and punctuation)
sentences_preprocessed = [preprocess_text(sent) for sent in sentences]

# Build the labels in the same order as the sentences: 0 = negative, 1 = positive
polarities = [0] * len(df_sent_neg) + [1] * len(df_sent_pos)

# Switch from Python lists to numpy arrays
sentences = np.array(sentences_preprocessed)
polarities = np.array(polarities)

# Shuffle sentences and labels together so that each pair stays aligned
combined = list(zip(sentences, polarities))
random.seed(1)  # fixed seed (optional) so the shuffle is reproducible
random.shuffle(combined)
# Unzip the shuffled pairs and write them back into the two arrays
sentences[:], polarities[:] = zip(*combined)

# Use an 80%/20% split -- you can change the value and see its effects
train_test_ratio = 0.8

# Size of the training data (the size of the test data follows implicitly)
train_set_size = int(train_test_ratio * len(sentences))

# The first `train_set_size` items are the training data, the rest the test data
X_train, y_train = sentences[:train_set_size], polarities[:train_set_size]
X_test, y_test = sentences[train_set_size:], polarities[train_set_size:]

print("Size of training set: {}".format(len(X_train)))
print("Size of test: {}".format(len(X_test)))

Size of training set: 8529
Size of test: 2133


In [15]:
# Model state shared by Fit() and predict()
vocabulary = set()        # every token seen during training
log_class_priors = {}     # class name ('pos'/'neg') -> log prior probability
token_counts = {polarity_class: {} for polarity_class in ('pos', 'neg')}  # class name -> {token: count}


In [16]:
def get_token_counts(token_list):
    """Count how often each token occurs in ``token_list``.

    Returns a dict mapping each distinct token to its number of occurrences
    as a float, with keys in order of first occurrence.
    """
    frequencies = {}
    for tok in token_list:
        if tok in frequencies:
            frequencies[tok] += 1
        else:
            # Counts are kept as floats
            frequencies[tok] = 1.0
    return frequencies
    

In [17]:
# Sanity check: count the tokens of the last training sentence
token_list = X_train[-1].split()
print(get_token_counts(token_list))

{'lee': 1.0, 'marvelously': 1.0, 'compelling': 1.0, 'present': 1.0, 'brown': 2.0, 'catalyst': 1.0, 'struggle': 1.0, 'black': 1.0, 'manhood': 1.0, 'restrictive': 1.0, 'chaotic': 1.0, 'america': 1.0, 'sketchy': 1.0, 'nevertheless': 1.0, 'gripping': 1.0, 'portrait': 1.0, 'jim': 1.0, 'celebrated': 1.0, 'wonder': 1.0, 'spotlight': 1.0}


In [19]:
def Fit(X,y):
    """Train the Naive Bayes model on documents ``X`` with labels ``y``.

    Fills the module-level model state:
      * ``log_class_priors`` -- log of the fraction of documents per class,
      * ``token_counts``     -- per class, how often each token occurred,
      * ``vocabulary``       -- every token seen in any training document.

    Any state left over from an earlier call is discarded first, so calling
    ``Fit`` again (e.g. re-running the cell) does not double-count tokens.

    Args:
        X: Sequence of documents; each document is a whitespace-separated
            string of tokens.
        y: Sequence of labels, same length as ``X``. Label 1 is counted as
            class 'pos'; for the token counts every other label is 'neg',
            while the 'neg' prior only counts labels equal to 0.

    Raises:
        ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
    """
    num_data_items = len(X)
    if num_data_items == 0:
        raise ValueError("Cannot fit on an empty training set")
    if len(y) != num_data_items:
        raise ValueError("X and y must have the same length")

    # Start from a clean slate; the containers are reset in place so that
    # existing references to them stay valid
    vocabulary.clear()
    log_class_priors.clear()
    for polarity_class in ('pos', 'neg'):
        token_counts[polarity_class] = {}

    # Class priors: log of the relative frequency of each class
    log_class_priors['pos']=np.log(sum(1 for label in y if label==1)/num_data_items)
    log_class_priors['neg']=np.log(sum(1 for label in y if label==0)/num_data_items)

    # Accumulate the per-class token counts and the vocabulary
    for doc,label in zip(X,y):
        class_counts = token_counts['pos' if label==1 else 'neg']
        for token,count in get_token_counts(doc.split()).items():
            vocabulary.add(token)
            class_counts[token] = class_counts.get(token, 0) + count
                
        

In [20]:
# Train the model: fills vocabulary, token_counts and log_class_priors
Fit(X_train,y_train)

In [21]:
# Show the learned class priors (log probabilities)
print(log_class_priors)

{'pos': -0.6930299403933299, 'neg': -0.693264434473429}


In [28]:
# Show the priors as log probabilities and, via exp(), as plain probabilities
print("Priors(log Probabilities): {}".format(log_class_priors))
print("Priors(probabilities): {}".format( {k:np.exp(v) for k,v in log_class_priors.items()}))

Priors(log Probabilities): {'pos': -0.6930299403933299, 'neg': -0.693264434473429}
Priors(probabilities): {'pos': 0.5000586235197562, 'neg': 0.4999413764802439}


In [29]:
# Two example tokens whose per-class counts are inspected in the next cells
token_1 = 'good'
token_2 = 'bad'

In [30]:
# .get() avoids a KeyError if the token never occurred in this class
print("Number of occurences of {} in class POSITIVE: {}".format(token_1,token_counts['pos'].get(token_1, 0.0)))

Number of occurences of good in class POSITIVE: 156.0


In [31]:
# .get() avoids a KeyError if the token never occurred in this class
print("Number of occurences of {} in class POSITIVE: {}".format(token_2,token_counts['pos'].get(token_2, 0.0)))

Number of occurences of bad in class POSITIVE: 22.0


In [32]:
# .get() avoids a KeyError if the token never occurred in this class
print("Number of occurences of {} in class NEGATIVE: {}".format(token_1,token_counts['neg'].get(token_1, 0.0)))

Number of occurences of good in class NEGATIVE: 137.0


In [34]:
# .get() avoids a KeyError if the token never occurred in this class
print("Number of occurences of {} in class NEGATIVE: {}".format(token_2,token_counts['neg'].get(token_2, 0.0)))

Number of occurences of bad in class NEGATIVE: 165.0


In [35]:
def predict(X):
    """Predict a class label for every document in ``X``.

    Uses the module-level model state (``vocabulary``, ``token_counts``,
    ``log_class_priors``) that ``Fit`` fills in.

    Args:
        X: Iterable of documents; each document is a whitespace-separated
            string of tokens.

    Returns:
        A list with one label per document: 1 if the 'pos' score is strictly
        greater than the 'neg' score, otherwise 0.
    """
    # Denominators of the Laplace-smoothed likelihoods. They do not depend on
    # the document or the token, so compute them once instead of re-summing
    # all counts for every single token.
    vocabulary_size = len(vocabulary)
    pos_denominator = sum(token_counts['pos'].values()) + vocabulary_size
    neg_denominator = sum(token_counts['neg'].values()) + vocabulary_size

    y_pred = []

    # Loop over all test samples and predict the class label for each sample
    for doc in X:
        # Initialize class scores (i.e., log probabilities)
        pos_score, neg_score = 0, 0

        # Each distinct token contributes once, however often it occurs in
        # the document; dict.fromkeys keeps the tokens in first-occurrence order
        for token in dict.fromkeys(doc.split()):
            # Ignore unknown tokens
            if token not in vocabulary:
                continue

            # Laplace (add-one) smoothed log likelihoods
            log_w_given_pos = np.log((token_counts['pos'].get(token, 0.0) + 1) / pos_denominator)
            log_w_given_neg = np.log((token_counts['neg'].get(token, 0.0) + 1) / neg_denominator)

            # Log probabilities are added (not multiplied)
            pos_score += log_w_given_pos
            neg_score += log_w_given_neg

        # Include priors in class scores (again by adding log probabilities)
        pos_score += log_class_priors['pos']
        neg_score += log_class_priors['neg']

        # Ties go to the negative class
        y_pred.append(1 if pos_score > neg_score else 0)

    # Return list of predicted class labels
    return y_pred

In [39]:
# Raw text must go through the same preprocessing as the training data;
# otherwise tokens such as "Nice" or "Not" (capitalized) are not found in the
# lowercased vocabulary and are silently ignored by predict()
predict([preprocess_text(text) for text in ['Nice Movie','Not a good watch']])

[1, 1]

In [40]:
# Predict labels for the held-out test sentences
y_pred = predict(X_test)

In [41]:
# Per-class precision, recall and F1 on the test set
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1067
           1       0.77      0.78      0.78      1066

    accuracy                           0.77      2133
   macro avg       0.77      0.77      0.77      2133
weighted avg       0.77      0.77      0.77      2133



In [42]:
# Overall accuracy on the test set, rounded to three decimals
print("Accuracy: {:.3f}".format(accuracy_score(y_test,y_pred)))

Accuracy: 0.774
