## Importing Libraries

In [243]:
import numpy as np
import pandas as pd
import glob
import os
import math
import random

## Making list of all the files

In [244]:
dir1 = "./Dataset/train/neg/"
dir2 = "./Dataset/train/pos/"
dir3 = "./Dataset/test/neg/"
dir4 = "./Dataset/test/pos/"
train_neg = os.listdir(dir1)
train_pos = os.listdir(dir2)
test_neg = os.listdir(dir3)
test_pos = os.listdir(dir4)

## Making list of Positive and negative words

In [245]:
dir5 = "./Dataset/negative-words.txt"
dir6 = "./Dataset/positive-words.txt"
f_5= open(dir5, "r", encoding = "latin-1")
f_6 = open(dir6, "r")
neg_words  = f_5.read()
pos_words = f_6.read()
neg_words = neg_words.split()
pos_words = pos_words.split()

## Pre-processing function

In [246]:
def fileread(dir_, filenames, pos_words, neg_words) :
    ret = []
    for i,val in enumerate(filenames) :
        #print(i)
        f = open(dir_+filenames[i], "r")
        data = f.read()
        data = data.lower()
        words = data.split()
        count = len(words)
        pos_count = 0
        neg_count = 0
        x_5 = 0
        x_6 = 0
        for j in words :
            if j in pos_words :
                pos_count+=1
            if j in neg_words :
                neg_count+=1
            if j == 'no':
                x_5 = 1
            if j == '!' :
                x_6 = 1
        string = filenames[i].split('_')
        rating = int(string[1].split('.')[0])
        temp = [pos_count, neg_count, rating, math.log(count), x_5, x_6]
        ret.append(temp)
        
    return ret
        

## Pre-processing all files 
### Reading 10000 training and 2000 test files as otherwise they were taking too much time

In [105]:
train_neg_pd = fileread(dir1, train_neg[0:5000], pos_words, neg_words)

In [106]:
train_neg_labels = [0]*5000

In [107]:
train_pos_pd = fileread(dir2, train_pos[0:5000], pos_words, neg_words)

In [108]:
train_pos_labels = [1]*5000

In [109]:
test_neg_pd = fileread(dir3, test_neg[0:1000], pos_words, neg_words)

In [110]:
test_neg_labels = [0]*1000

In [112]:
test_pos_pd = fileread(dir4, test_pos[0:1000], pos_words, neg_words)

In [113]:
test_pos_labels = [1]*1000

## Sigmoid 

In [114]:
def sigmoid(inp) :
    return (1/(1+np.exp(inp)))

## Cross Entropy Loss

In [189]:
def CEL(y, y_hat) :
    loss = 0
    for i, val in enumerate(y):
        try :
            temp = -((y[i]*math.log(y_hat[0][i]))+((1-y[i])*math.log(1-y_hat[0][i])))
            loss+= 0.0001*temp
        except :
            continue
            #print(y_hat[0][i])
    return loss

## Shuffling train_data

In [152]:
train_data = train_neg_pd + train_pos_pd
train_labels = train_neg_labels + train_pos_labels
c = list(zip(train_data, train_labels))
random.shuffle(c)
x, y = zip(*c)

## Prediction function

In [185]:
def predict(train_data, weights):
    length = len(train_data)
    train_data = np.array(train_data).reshape(length, 6)
    z = np.dot(train_data, np.transpose(weights))
    return sigmoid(-z)

## Weight update Batch

In [206]:
def weight_update(x,y, y_hat, lr = 0.0001 ) :
    for i,val in enumerate(x) :
        global weights
        delta_w = np.multiply([y_hat[0][i]-y[i]],x[i])
        weights -= lr*delta_w 

## Weight updtae stochastic

In [234]:
def weight_update_stochastic(x,y, lr = 0.0001 ) :
    length = len(x)
    for i in range(length):
        global weights_stochastic
        z = np.dot(x[i], np.transpose(weights_stochastic))
        y_hat =  sigmoid(-z)
        y_hat
        temp2 = y_hat-y[i]
        temp = np.multiply(temp2,x[i])
        weights_stochastic -= lr*temp

## Training Batch
### For 100 epochs


In [247]:
weights = np.random.rand(1,6)
epochs = 10
for i in range(100) :
    y_hat = predict(x, weights)
    y_hat = y_hat.reshape(1,len(x))
    accuracy = 0
    for j,val in enumerate(y) :
        if y_hat[0][j] <=0.5 and val == 0 :
            accuracy +=1
        if y_hat[0][j]>0.5 and val == 1 :
            accuracy +=1
    weight_update(x,y, y_hat)


## Training Stochastic
### For 100 epochs

In [250]:
weights_stochastic = np.random.rand(1,6)
epochs = 10
for i in range(100) :
    weight_update_stochastic(x,y)
    y_hat = predict(x, weights_stochastic)
    y_hat = y_hat.reshape(1,len(x))
    accuracy = 0
    for j,val in enumerate(y) :
        if y_hat[0][j] <=0.5 and val == 0 :
            accuracy +=1
        if y_hat[0][j]>0.5 and val == 1 :
            accuracy +=1       

## Shuffle Test data

In [248]:
test_data = test_neg_pd + test_pos_pd
test_labels = test_neg_labels + test_pos_labels
c_test = list(zip(test_data, test_labels))
random.shuffle(c_test)
x_test, y_test = zip(*c_test)

## Prediction Batch

In [263]:
y_ = predict(x_test, weights)
y_ = y_.reshape(1,len(x_test))
TP = 0
TN = 0
FP = 0
FN = 0
for j,val in enumerate(y_test) :
    if y_[0][j] <=0.5 and val == 0 :
        TN +=1
    if y_[0][j]>0.5 and val == 1 :
        TP +=1
    if y_[0][j] <=0.5 and val == 1 :
        FN+=1
    if y_[0][j] > 0.5 and val == 0 :
        FP+=1
print(TP)
print(TN)
print(FP)
print(FN)

998
997
3
2


## Prediction Stochastic

In [262]:
y_ = predict(x_test, weights_stochastic)
y_ = y_.reshape(1,len(x_test))
TP = 0
FP = 0
FN = 0
TN = 0
for j,val in enumerate(y_test) :
    if y_[0][j] <=0.5 and val == 0 :
        TN +=1
    if y_[0][j]>0.5 and val == 1 :
        TP +=1
    if y_[0][j] <=0.5 and val == 1 :
        FN+=1
    if y_[0][j] > 0.5 and val == 0 :
        FP+=1
print(TP)
print(TN)
print(FP)
print(FN)

1000
999
1
0


# Evaluation (for 2000 dataset examples):

| Batch Gradient Descent| Pos | Neg |
| --- | --- | --- |
| Pos | 998 | 3   |
|Neg  | 2   | 997 |

| Stochastic Gradient Descent| Pos | Neg |
| --- | --- | --- |
| Pos | 1000 | 1   |
|Neg  | 0   | 999 |

## Batch Gradient Descent  
### Precision = 99.7%
### Recall = 99.8%
### Accuracy = 99.75%
### F1-score = 0.499

## Stochastic Gradient Descent  
### Precision = 99.9%
### Recall = 100%
### Accuracy = 99.95%
### F1-score = 0.5