In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
import math
import csv
nltk.download("punkt")
pd.options.mode.chained_assignment = None  # default='warn'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#Naive Bayes

El teorema de bayes es:

    P(x dado y) = (P(y and x)*P(x))/P(y)

Tenemos que asumir que podemos representar x como un conjunto de caracteristicas, por lo que:

    P(y|x) = (P(f1,f2,f3... fn | y)*P(x))/P(y)
    P(f1,f2,f3... fn | y) = P(f1 | y) * P(f2 | y) * P(f3 | y)...P(fn | y)

Estas caracteristicas pueden ser las palabras de la que consta un documento que queremos clasificar, por ejemplo, y asumimos que estas caracteristicas o features son independientes entre si para poder computarlo. Muchas veces se usa el sumatorio de los logaritmos porque si se usa el producto salen numeros muy pequeños.

P(x) es el numero de elementos de cada clase partido por el numero de elementos en general o el numero de muestras.

P(fx | clase) es el numero de casos que se da esa caractyeristica dentro de la clase partido del sumatorio de todas las caracteristicas de cada clase, en nuestro caso es el numero de veces que ocurre una palabra dentro de una clase partido del numero de palabras de la clase (al menos de las relevantes para las features)

Ya en la parte de testeo, para averiguar a que clase pertenece un elemento se hace el producto de todas las caracteristicas del elemento para cada clase y se multiplica por la probabilidad de cada clase y el que sea mayor es el que tiene mayor probabilidad.

Nota: En caso de hacerlo en un espacio logaritmico habria que hacer el sumatorio en lugar del producto.

##Binary naive bayes
Es una alternativa que, al parecer, mejora en sentiment analysis. Se basa en la idea de que para este analisis es mas relevante el hecho de que una palabra ocurra frente a cuantas veces ocurre la palabra, por lo que limita las veces que se cuenta esa palabra en una misma muestra a 1, es decir, si tenemos una muestra que es "La pelicula tiene una gran direccion y una gran fotografia" la palabra "gran" solo se cuenta una vez en esta muestra.

Tambien tiene en cuenta la negación y lo que se niega, de forma que si, por ejemplo, se niega algo malo pasa a ser algo bueno, es decir, "not bad" no se toma como dos tokens ("not" y "bad") si no como uno solo "NOT_bad".

Cabe mencionar que para la clasificacion de textos se pueden usar lexicones anotados, que tienen listas de palabras relacionadas con su percepción positiva o negativa.

##Evaluación del modelo

Para evaluar el modelo se usan la precision, recall y F-measure.

1. La precision se refiere al ratio de positivos verdaderos frente a todos los positivos dados por el modelo, es decir, que fraccion de los positivos son verdaderos.

2. El recall se refiere al ratio entre positivos verdaderos entre todos los positivos de los datos anotados

#Classes

In [None]:
class NaiveBayesModel():
  def __init__(self, data):
    self.data = self.computeData(data)
    self.p_c = self.classProbability(data)

  def classProbability(self, data):
    """
    Returns the prior probability, the probability of a random element belonging to each class, as a dictionary
    """
    out = {}
    for i in data:
      if i[1] not in out.keys():
        out.update({i[1]:1})
      else:
        out[i[1]]+=1
    n = sum(out.values())
    for k in out.keys():
      out[k] /= n
    return out

  def computeProb(self, testdata, log = True):
    """
    Default method when the object is called
    Returns the probability of the input belonging to each class
    """
    probabilities = {}
    if log:
      for c in self.data.columns:
        probabilities.update({c : 1})
        for w in re.sub("[^\w\s]", "", testdata).lower().split():
          if w in self.data.index:
            probabilities[c] += np.log(self.data[c][w])
        probabilities[c] += np.log(self.p_c[c])
    else:
      for c in self.data.columns:
        probabilities.update({c : 1})
        for w in re.sub("[^\w\s]", "", testdata).lower().split():
          if w in self.data.index:
            probabilities[c] *= self.data[c][w]
        probabilities[c] *= self.p_c[c]
    return probabilities

  def computeData(self, data):
    """
    Data must be in a list of tuples with shape (n, 2).
    The index one of the tuple must be the review and the second the sentiment
    Measures the probability of each word given the class and returns a dataframe.
    The calculation follows the basic naive bayes algorithm and uses Laplace smoothing
    """
    out = pd.DataFrame()
    for i in data:
      if i[1] not in out.columns:
        out.insert(len(out.columns), i[1], 0)

      for w in re.sub("[^\w\s]", "", i[0]).lower().split():
        if w not in out.index:
          dic = {w: [0 for i in out.columns]}
          out.loc[w] = float(0)
        out[i[1]][w] += 1

    out += 1

    for c in out.columns:
      out[c] = out[c]/sum(out[c])

    return out #out.apply(np.log) si queremos hacer el logaritmo de un DataFrame

  def debug(self): #Simple debug
    print(self.data)
    print()
    print(self.p_c)

  __call__=computeProb

class BinaryNaiveBayes(NaiveBayesModel):
  def __init__(self, data):
    super().__init__(data)

  def computeData(self, data):
    """
    Data must be in a list of tuples with shape (n, 2).
    The index one of the tuple must be the review and the second the sentiment
    Measures the probability of each word given the class and returns a dataframe.
    The calculation follows the binary naive bayes algorithm and uses Laplace smoothing
    """
    out = pd.DataFrame()
    for i in data:
      if i[1] not in out.columns:
        out.insert(len(out.columns), i[1], 0)

      for w in list(dict.fromkeys(re.sub("[^\w\s]", "", i[0]).lower().split())):
        if w not in out.index:
          dic = {w: [0 for i in out.columns]}
          out.loc[w] = float(0)
        out[i[1]][w] += 1

    out += 1

    for c in out.columns:
      out[c] = out[c]/sum(out[c])

    return out #out.apply(np.log) si queremos hacer el logaritmo de un DataFrame

    def computeProb(self, testdata, log = True):
      probabilities = {}
      words = re.sub("[^\w\s]", "", testdata).lower().split()
      words = list(dict.fromkeys(words))
      if log:
        for c in self.data.columns:
          probabilities.update({c : 1})
          for w in words:
            if w in self.data.index:
              probabilities[c] += np.log(self.data[c][w])
          probabilities[c] += np.log(self.p_c[c])
      else:
        for c in self.data.columns:
          probabilities.update({c : 1})
          for w in words:
            if w in self.data.index:
              probabilities[c] *= self.data[c][w]
          probabilities[c] *= self.p_c[c]
      return probabilities

    out += 1

    for c in out.columns:
      out[c] = out[c]/sum(out[c])

    return out #out.apply(np.log) si queremos hacer el logaritmo de un DataFrame

class MovieReviewModel(BinaryNaiveBayes):
  """
  This is the main model for the movie review sentiment prediction.
  It inherits from the BinaryReviewModel which inherits from NaiveBayesModel
  The model requires a list of tuples as an input which should be of shape(n, 2)
  The test data must be on the same format.
  When the object is called the function "computeProb" is executed, which requires a single review as input and predicts its sentiment.
  """
  def __init__(self, data):
    super().__init__(data)
  def comparePredict(self, testdata):
    """
    Makes a prediction given a test data
    Compares the predicted system output data to the 'gold standard' data.
    Returns a dictionary with the recall, precision, and accuracy inside.
    """
    prediction = []
    for i in testdata:
      temp = self.computeProb(i[0])
      prediction.append((i[0], max(zip(temp.values(), temp.keys()))[1]))

    tp = 1
    tn = 1
    fp = 1
    fn = 1

    for i in range(len(prediction)):
      if prediction[i][1] == testdata[i][1]:
        if prediction[i][1] == "positive":
          tp+=1
        else:
          tn+=1
      else:
        if prediction[i][1] == "positive":
          fp+=1
        else:
          fn+=1
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    acc = (tp+tn)/(tp+tn+fp+fn)
    print(f"""
          True positives = {tp-1}
          True negatives = {tn-1}
          False positives = {fp-1}
          False negatives = {fn-1}
          Precision = {precision}
          Recall = {recall}
          Accuracy = {(acc)*100}%
    """)
    return {"precision" : precision, "recall" : recall, "accuracy" : acc}

#Exercises

In [None]:
#Exercise 4.2
data = [("fun, couple, love, love" , "comedy"),
        ("fun, furious, shoot", "action"),
        ("couple, fly, fast, fun, fun", "comedy"),
        ("furious, shoot, shoot, fun", "action"),
        ("fly, fast, shoot, love", "action")]

inp = "fast, couple, shoot, fly"

modelNormal = NaiveBayesModel(data)
modelNormal.debug()
print(modelNormal(inp))

         comedy    action
fun      0.2500  0.166667
couple   0.1875  0.055556
love     0.1875  0.111111
furious  0.0625  0.166667
shoot    0.0625  0.277778
fly      0.1250  0.111111
fast     0.1250  0.111111

{'comedy': 0.4, 'action': 0.6}
{'comedy': -8.521738971045279, 'action': -8.076580381796658}


In [None]:
model_binary = BinaryNaiveBayes(data)
model_binary.debug()
print(model_binary(inp))

           comedy    action
fun      0.214286  0.176471
couple   0.214286  0.058824
love     0.142857  0.117647
furious  0.071429  0.176471
shoot    0.071429  0.235294
fly      0.142857  0.117647
fast     0.142857  0.117647

{'comedy': 0.4, 'action': 0.6}
{'comedy': -7.98761340054719, 'action': -8.071090277751074}


In [None]:
#Exercise 4.3
data = [
    ("Good, good, good, great, great, great", "positive"),
    ("poor, great, great", "positive"),
    ("good, pOor, poor, poor", "negative"),
    ("good, poor, poor, poor, poor, poor, great, great", "negative"),
    ("poor, poor", "negative")
]

inp = "a good, good plot and great characters, but poor acting"

print("###### Naive bayes model ######")
modelNormal = NaiveBayesModel(data)
modelNormal.debug()
result = modelNormal(inp)
print(result)
print(f"There's a higher chance of in being {max(zip(result.values(), result.keys()))[1]}")
print("\n###### Binary naive bayes model ######")
model_binary = BinaryNaiveBayes(data)
model_binary.debug()
result = model_binary(inp)
print(result)
print(f"There's a higher chance of ii being {max(zip(result.values(), result.keys()))[1]}")

###### Naive bayes model ######
       positive  negative
good   0.333333  0.176471
great  0.500000  0.176471
poor   0.166667  0.647059

{'positive': 0.4, 'negative': 0.6}
{'positive': -4.598421958998375, 'negative': -5.149946861188155}
There's a higher chance of in being positive

###### Binary naive bayes model ######
       positive  negative
good   0.285714  0.333333
great  0.428571  0.222222
poor   0.285714  0.444444

{'positive': 0.4, 'negative': 0.6}
{'positive': -4.521877497747463, 'negative': -4.0230578140948134}
There's a higher chance of in being negative


#Sentiment model

Now I should test this model on a real dataset to see how it works with more data.

In [None]:
data = pd.read_csv("/content/IMDB.csv", index_col=False, header=0, encoding="UTF-8", on_bad_lines="skip", engine="python")

for i in range(len(data)):
  data["review"][i] = re.sub("<br />", " ", data["review"][i])
  data["review"][i] = re.sub("\"", "'", data["review"][i])

data = list(data.itertuples(index=False, name=None))

In [None]:
split_point = int(0.8 * len(data))
train_data, test_data = data[:split_point], data[split_point:]
model = MovieReviewModel(train_data[:3000])
model.debug()
model.comparePredict(test_data[:100])

               positive  negative
one            0.003280  0.003360
of             0.005825  0.005850
the            0.006103  0.006093
other          0.001654  0.001550
reviewers      0.000065  0.000058
...                 ...       ...
fiftyfive      0.000004  0.000008
enquanto       0.000004  0.000008
ela            0.000004  0.000008
fora           0.000004  0.000008
unrecommended  0.000004  0.000008

[36424 rows x 2 columns]

{'positive': 0.5026666666666667, 'negative': 0.49733333333333335}

          True positives = 38
          True negatives = 46
          False positives = 9
          False negatives = 7
          Precision = 0.7959183673469388
          Recall = 0.8297872340425532
          Accuracy = 82.6923076923077%
    
