In [5]:
#libraries
import numpy as np
import matplotlib.pyplot as plt 
import re
import math
import os
import random

### Logistic Regression from Scratch for Sentiment Analysis


### Implementation of Cost Function, Gradient descent and Stochastic gradient for multivariate linear regression

In [6]:
# load data
dataset_dir = 'Dataset'

In [7]:
#preprocessing
def preprocessing(directory, dataset):
    print('Loading for', dataset)
    print('Loading Lexicon...')
    pos_words_lex = np.loadtxt("Dataset/positive-words.txt", dtype='str') #numpy function to read text and seperate them by ,
    neg_words_lex = np.loadtxt("Dataset/negative-words.txt", dtype='str')
    
    print('Loading files...')
    #os.walk is used to traverse files and get the abosulte path of the file
    #the absolute path are later used to read the data and build dataset
    for root, dirs, files in os.walk(os.path.abspath((os.path.join(dataset_dir, dataset,'pos')))): #for pos review
        files_pos = [os.path.join(root, file) for file in files]

    for root, dirs, files in os.walk(os.path.abspath((os.path.join(dataset_dir, dataset,'neg')))): #for neg review
        files_neg = [os.path.join(root, file) for file in files]

    #generate y based on neg and pos reiew files
    y = np.concatenate([np.ones((len(files_pos),1)), np.zeros((len(files_neg),1))]) 
    print('Loading data from files...')

    data_pos       = [np.loadtxt(i, dtype='str', encoding='utf-8', comments="!@#!@$5464") for i in files_pos]
    data_neg       = [np.loadtxt(i, dtype='str', encoding='utf-8', comments="!@#!@$5464") for i in files_neg]

    #concatenate the pos and neg reviews to 1 array
    print('Concatenating the negative and positive reviews...')
    data    = np.concatenate([data_pos, data_neg])

    print('Cleaning data...')
    for rev in data:
        for idx, i in enumerate(rev):
            rev[idx] =  i.replace(")","").replace("(", "").replace("/>", "").replace(".","")

    print('Following are the words in both negative and positive lex')
    set(neg_words_lex).intersection(set(pos_words_lex))

    print('\nCounting negative and positive words...')  
    pos_count = np.zeros(len(data))
    neg_count = np.zeros(len(data))
    
    #enumerate means iterate data in array with index , idx
    for idx, review in enumerate(data):
        for i in review:
            if i in pos_words_lex:
                pos_count[idx] += 1
            elif i in neg_words_lex:
                neg_count[idx] += 1    

    print('Building dataset from features...')
    x1 = pos_count
    x2 = neg_count
    x3 = [int(os.path.basename(i).split('_')[1][0]) for i in files_pos]+[int(os.path.basename(i).split('_')[1][0]) for i in files_neg]
    x4 = [math.log(len(x)) for x in data] 

    x5 = [1 if 'no'in i else 0 for i in data]

    x6 = np.zeros(len(data))
    for idx, i in enumerate(data):
        for j in i:
            if "!" in j:
                x6[idx] = 1
            break
    #make dataset in matrix for 
    #25000 x 6 matrix, each row represent each datapoint/review and every column is a feature
    dataset = np.vstack([x1,x2,x3,x4,x5,x6]).T
    
    means_X = np.mean(dataset,axis=0)
    sd_X = np.std(dataset,axis = 0)
    for i in range (0,len(dataset)):
        for j in range(0,6):
            dataset[i][j]= (dataset[i][j]-means_X[j])/sd_X[j]

    return dataset, y


In [31]:
#calling preprocessing for train and test data
train_data, y_train = preprocessing(dataset_dir, 'train')
test_data, y_test  = preprocessing(dataset_dir, 'test')

Loading for train
Loading Lexicon...
Loading files...
Loading data from files...
Concatenating the negative and positive reviews...
Cleaning data...




Following are the words in both negative and positive lex

Counting negative and positive words...
Building dataset from features...
Loading for test
Loading Lexicon...
Loading files...
Loading data from files...
Concatenating the negative and positive reviews...
Cleaning data...
Following are the words in both negative and positive lex

Counting negative and positive words...
Building dataset from features...


In [32]:
# adding x0 so we can easily compute THETA0
x0=np.ones((len(y_train),1) , dtype = float)
x1=np.ones((len(y_test),1) , dtype = float)
X=np.column_stack((x0,train_data))
Xtest=np.column_stack((x1,test_data))
print("no of columns in data",len(X[0]))
print("no of rows in data",len(X))


no of columns in data 7
no of rows in data 25000


In [33]:
#sigmoid function
def sigmoid(x):
    z = 1/(1 + np.exp(-x))
    return z

In [11]:
#predict function
def predict(X,W):
    hx=np.dot(X,W)
    return hx

The sigmoid

$$ \hat{y} = \frac{e^{-(w.x+b)}}{1+ e^{-(w.x+b)}} $$

In [12]:
#converting values of predict into sigmoid
def aftersig(X,W):
    prob=sigmoid(predict(X,W))
    return prob

Cross Entropy(Loss Function for logistic Regression)

$$L_{CE}(w,b) = -[ylog\sigma(w.x+b) + (1-y) log(1-\sigma(w.x+b))]$$


In [13]:
#cross entropy
def cross_entropy(W,X,Y):
    
    m = np.size(Y) 
    mul=aftersig(X,W)

    loss = [-((Y[i]*np.log(mul[i]))+((1-Y[i])*np.log(1-mul[i]))) for i in range(m)]
   
    final= sum(loss)/m
    return final

Paramenters need to be learned by Model are the $\theta_j$ values. These are
the values that will be adjusted to minimize cost $J(\theta)$.
One way to do this is to use the gradient descent algorithm. 

$$ \theta_j = \theta_j - \alpha \frac{1}{m} (\sigma(w.x+b)-y)x_j  $$

In [14]:
#gradient descent batch
def gradientbatch(X, Y, alpha, n_epoch):

    features= np.size(X,1) # no of total features
    m= np.size(Y)  # number of training examples
    

    cost = list()  # list to store cost for every epoch
    
    thetas = np.zeros((7,1) , dtype = float)  #intiliazes thetas
    
    for epoch in range (n_epoch):
        
        sig = aftersig(X, thetas) #predict y
        loss=sig-Y                #Calculate Loss
        
        for i in range(features):
            
            multiply = np.multiply(loss.T,X[:,i])
            summation =  np.sum(multiply)
            thetas[i] = thetas[i] - ((alpha/m)*summation)
            
        cost.append(cross_entropy(thetas,X, Y))
       
    return thetas, cost
    

In [15]:
#calling of batch
n_epoch= 1500
alpha=0.01
thetas, cost = gradientbatch(X, y_train, alpha, n_epoch)
print("weights from batch",thetas)
print("cost from batch", cost[-1])


weights from batch [[ 0.05640649]
 [ 0.75009815]
 [-0.74175489]
 [ 1.09008397]
 [-0.05813787]
 [-0.22691874]
 [-0.01876693]]
cost from batch [0.46402699]


In [36]:
#gradient stochastic
def gradientStochastic (X, Y, alpha, n_epoch):
    
    
    features= np.size(X,1)
    m= np.size(Y)  # number of training examples]
    
    cost = list()  # list to store cost for every epoch
    
    thetas = np.zeros((X.shape[1],1), dtype = float)
    
    for i in range (n_epoch):
        
        irand =  random.randint(0, (m-1)) # random no to pick data row
        
        sig = aftersig(X[irand], thetas) #predict Y
        loss=sig-Y[irand]                # loss
        
        for i in range(features):
            
            multiply = np.multiply(loss,X[irand,i])
            thetas[i] = thetas[i] - (alpha*multiply)
        
        cost.append(cross_entropy(thetas,X, Y))
    
    return thetas, cost
  

In [37]:
#calling of stochastic
no_iteration= 1500
alpha=0.01
thetasScro, costScro = gradientStochastic(X, y_train, alpha, no_iteration)
print("weights from stochastic",thetasScro)
print("cost from stochastic", costScro[-1])

weights from stochastic [[ 0.11378163]
 [ 0.77413188]
 [-0.91446385]
 [ 1.03327624]
 [-0.10667004]
 [-0.19081466]
 [-0.06897519]]
cost from stochastic [0.46317527]


In [38]:
#prediction for testdata for batch
hx=aftersig(Xtest,thetas)
predictionB = [1 if x >0.5 else 0 for x in hx]
#print(prediction)

In [39]:
#prediction for testdata for stochastic
hx=aftersig(Xtest,thetasScro)
predictionS = [1 if x >0.5 else 0 for x in hx]

In [40]:
y_test = y_test.astype(np.float)

In [41]:
# function for evalution
def perf_measure(y_actual, y_pred):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_pred)): 
        if y_actual[i]==y_pred[i]==1:
            TP += 1
        if y_pred[i]==1 and y_actual[i]==0:
            FP += 1
        if y_actual[i]==y_pred[i]==0:
            TN += 1
        if y_pred[i]==0 and y_actual[i]==1:
            FN += 1
    return(TP, FP, TN, FN)

In [42]:
#Batch Gradeint Results
TP,FP,TN,FN=perf_measure(y_test,predictionB)
accuracy= (TN + TN)/(TP+TN+FP+FN)
precision=TP/(TP+FP)
recall=TP/(TP+FN)
f1=2*((precision*recall)/(precision+recall))
confusionMatrix = np.array([[TP, FP], 
    [FN, TN]])

print("Batch Gradeint Results")
print("Confusion Matrix:",confusionMatrix)
print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1 Score: ",f1)

Batch Gradeint Results
Confusion Matrix: [[ 8264  2036]
 [ 4236 10464]]
Accuracy:  0.83712
Precision:  0.8023300970873787
Recall:  0.66112
F1 Score:  0.7249122807017544


In [43]:
#Stocastic Gradient Results
TP,FP,TN,FN=perf_measure(y_test,predictionS)
accuracy= (TN + TN)/(TP+TN+FP+FN)
precision=TP/(TP+FP)
recall=TP/(TP+FN)
f1=2*((precision*recall)/(precision+recall))
confusionMatrix = np.array([[TP, FP], 
    [FN, TN]])
print("Stocastic Gradient Results")

print("Confusion Matrix:",confusionMatrix)
print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1 Score: ",f1)

Stocastic Gradient Results
Confusion Matrix: [[8747 2526]
 [3753 9974]]
Accuracy:  0.79792
Precision:  0.7759247760134835
Recall:  0.69976
F1 Score:  0.7358768350649898
