In [2]:
!pip install vaderSentiment



In [3]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
import math
import csv
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
nltk.download("punkt")
pd.options.mode.chained_assignment = None  # default='warn'
sentiment_analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#Logistic Regression

Logistic regression learns from a training set, a vector of weights (one for each feature) and a bias.

To make a decission on a test (after training the model) the classifier multiplies each weight (w_i) by its feature (x_i), sums up the weighted features and adds the bias (also called intercept)

    z = x*w + b

note that **x** and **w** are vectors.

To create a probability we must pass *z* through the sigmoid function

    sigmoid(z) = 1/(1 + e^-z)
    
e^x is equal to the exponential of *-z*. The result is a number between 1 and 0, but we need to make sure that *sigmoid(z) + (1-sigmoid(z))* equals 0, meaning that the sum of all the probabilities is 1. A propierty of the sigmoid function is that 1-sigmoid(x) = sigmoid(-x).

#Logistic regression Model

##Classes and functions

In [208]:
class LogRegModel(nn.Module):
  def __init__(self, nfeatures, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)
    self.weights = nn.Parameter(torch.randn(nfeatures, requires_grad=True, dtype=torch.float64))
    self.bias = nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float64))

  def forward(self, x: torch.Tensor)->torch.Tensor:
    return sigmoid(torch.sum(self.weights * x +self.bias, 1))

def sigmoid(x):
  return 1/(1+ math.e**-x)

def computeFeatures(x : np.ndarray):
  out = torch.zeros(len(x), 5)
  for i in range(len(x)):
    out[i][0] = sentiment_analyzer.polarity_scores(x[i])["pos"]
    out[i][1] = sentiment_analyzer.polarity_scores(x[i])["neg"]
    out[i][2] = 1 if " no " in x[i] else 0
    out[i][3] = len(re.findall("[\s.,](me|i|mine|you|your|my)[\s.,]", x[i]))+len(re.findall("^(me|i|mine|you|your)", x[i])) + len(re.findall("(me|i|mine|you|your)$", x[i]))
    out[i][4] = 1 if "!" in x[i] else 0
  return out

def computeSent(y):
  out = torch.zeros(len(y))
  for i in range(len(y)):
    if y[i] == "positive":
      out[i] = 1
  return out

def comparePredict(prediction, testdata):
    """
    Compares the predicted system output data to the 'gold standard' data.
    Returns a dictionary with the recall, precision, and accuracy inside.
    Dtypes should match
    """
    tp = 1
    tn = 1
    fp = 1
    fn = 1

    for i in range(len(prediction)):
      if prediction[i] == testdata[i]:
        if prediction[i] == 1.0:
          tp+=1
        else:
          tn+=1
      else:
        if prediction[i] == 1.0:
          fp+=1
        else:
          fn+=1
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    acc = (tp+tn)/(tp+tn+fp+fn)
    print(f"""
          True positives = {tp-1}
          True negatives = {tn-1}
          False positives = {fp-1}
          False negatives = {fn-1}
          Precision = {precision}
          Recall = {recall}
          Accuracy = {(acc)*100}%
    """)
    return {"precision" : precision, "recall" : recall, "accuracy" : acc}

##Data preparation

In [77]:
data = pd.read_csv("/content/IMDB.csv", index_col=False, header=0, encoding="UTF-8", on_bad_lines="skip", engine="python")

for i in range(len(data)):
  data["review"][i] = re.sub("<br />", " ", data["review"][i])
  data["review"][i] = re.sub("\"", "'", data["review"][i])


In [170]:
reviews = np.array([i.lower() for i in data["review"].to_numpy()])
sent = np.array([float(0) if i == "negative" else float(1) for i in data["sentiment"].to_numpy()])

In [219]:
x = computeFeatures(reviews[:20000])
y = torch.from_numpy(sent[:20000])

In [220]:
split_point = int(0.8 * len(x))
x_train, y_train = x[0:split_point], y[0:split_point]
x_test, y_test = x[split_point:], y[split_point:]
len(x_train), len(y_train), len(y_test), len(x_test)

(16000, 16000, 4000, 4000)

##Model

In [221]:
torch.manual_seed(43)
modelLogReg = LogRegModel(5)
modelLogReg.state_dict()
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(params=modelLogReg.parameters(), lr=0.1)

##Training

In [222]:
epochs = 10000 #iterations over data

for i in range(epochs):
  modelLogReg.train()
  #1. Forward
  y_prediction = modelLogReg(x_train)
  #2. cost function
  loss = loss_fn(y_prediction, y_train)
  #3. Optimizer 0grad
  optimizer.zero_grad()
  #4. Backpropagation
  loss.backward()
  #5. Optimization (SGD)
  optimizer.step()

modelLogReg.state_dict()

OrderedDict([('weights',
              tensor([ 7.0218, -8.0628, -0.3910, -0.0108, -0.0546], dtype=torch.float64)),
             ('bias', tensor([-0.0106], dtype=torch.float64))])

##Testing

In [224]:
modelLogReg.eval()
test_pred = modelLogReg(x_test)
comparePredict(torch.round(test_pred), y_test)


          True positives = 1412
          True negatives = 1482
          False positives = 511
          False negatives = 595
          Precision = 0.734025974025974
          Recall = 0.7033349925335988
          Accuracy = 72.32767232767233%
    


{'precision': 0.734025974025974,
 'recall': 0.7033349925335988,
 'accuracy': 0.7232767232767233}