# **HW2 - Text Classification**

## 1. IMDB Dataset

In [1]:
from keras.datasets import imdb
import keras
(x_train, y_train), (x_test, y_test) = imdb.load_data()
word_index = keras.datasets.imdb.get_word_index()
# Reverse the word index to obtain a dict mapping indices to words
inverted_word_index = dict((i, word) for (word, i) in word_index.items())

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


## 2. Pre-Processing

### 2.1. Any data cleaning

In [2]:
indexed_tokens_count = len(inverted_word_index.keys())
## We will use "indexed_tokens_count + 1" as an index for unknown words
inverted_word_index[indexed_tokens_count + 1] = '<UNK>'

In [3]:
## with adding <UNK> we should clean data and substitute unknown tokens with index <UNK>
def clean_unknowns(samples, idx_word_dict):
  known_idx = idx_word_dict.keys()
  for i, sample in enumerate(samples):
    for j, tok in enumerate(sample):
      if not tok in known_idx:
        samples[i][j] = indexed_tokens_count + 1

In [4]:
## Clean train data
clean_unknowns(x_train, inverted_word_index)
## Clean Test data
clean_unknowns(x_test, inverted_word_index)

### 2.2. Tokenization

In [5]:
def text_tokenization(samples, inverted_word_index):
  tokenized_samples = [None] * len(samples)
  for i, sample in enumerate(samples):
    tokenized_samples[i] = []
    for tok in sample:
      tokenized_samples[i].append(inverted_word_index[tok])
  
  return tokenized_samples

In [6]:
tokenized_train = text_tokenization(x_train, inverted_word_index)
tokenized_test = text_tokenization(x_test, inverted_word_index)

### 2.3. Stemming

In [7]:
from nltk import SnowballStemmer

def text_stemmer(samples):
  stemmer = SnowballStemmer("english")
  stemmed_samples = [None] * len(samples)
  for i, sample in enumerate(samples):
    stemmed_samples[i] = []
    for tok in sample:
      stemmed_samples[i].append(stemmer.stem(tok))
  
  return stemmed_samples

In [8]:
stemmed_train = text_stemmer(tokenized_train)
stemmed_test = text_stemmer(tokenized_test)

## 3. Build Models

### 3.1. Uni-Gram

In [16]:
import numpy as np
from collections import Counter
import nltk
from nltk.util import ngrams
import math

class Ngram_Model:

  def __init__(self, x, y, class_count, n_gram):
    self.n_gram = n_gram
    ## we give each class an index
    self.classes = [i for i in range(class_count)]
    ## split samples based on the class they belong to it
    self.class_samples_idx = [np.where(y == c)[0] for c in self.classes]
    ## find n_grams
    x_n_grams = self.find_n_grams(x)
    ## find n_grams that belong to each class
    self.x_grams = [[x_n_grams[idx] for idx in list(c)] for c in self.class_samples_idx]
    ## find probablity of each class
    self.class_prob = [len(c) / len(x_n_grams) for c in self.class_samples_idx]
    self.class_text = [[l for idx in list(c) for l in x_n_grams[idx]]for c in self.class_samples_idx]
    ## find vocabs of classes and freq
    self.vocab = [set(t) for t in self.class_text]
    self.vocab = {t for v in self.vocab for t in v}
    self.vocab_count = len(self.vocab)
    self.class_all_words_count = [len(c) for c in self.class_text]
    self.class_word_count = []
    for c in self.classes:
      self.class_word_count.append(dict())
      counter = Counter(self.class_text[c])
      for w in self.vocab:
        self.class_word_count[c][w] = counter[w]
  
  def find_n_grams(self, data):

    ## Unigram
    if self.n_gram == 1:
      return data

    ## Bigram
    elif self.n_gram == 2:
      new_data = []
      for l in data:
        new_data.append(list(nltk.bigrams(["<s>"] + l + ["</s>"])))
      return new_data

    ## Trigram
    elif self.n_gram == 3:
      new_data = []
      for l in data:
        new_data.append(list(ngrams(["<s>"] + l + ["</s>"], 3)))
      return new_data

  #add-1 smmothing / unknown word
  def compute_word_prob(self, word, c): 
    word_count = 0 
    if word in self.class_word_count[c]:
      word_count = self.class_word_count[c][word]
    prob = (word_count + 1) / (self.class_all_words_count[c] + self.vocab_count + 1)
    return prob

  def find_probs(self, test_data):
    probs = []
    for c in self.classes:
      prob_log = math.log(self.class_prob[c])
      for word in test_data:
        word_prob = self.compute_word_prob(word, c)
        prob_log += math.log(word_prob)
      probs.append(prob_log)
    return probs

  
  def predict(self, test_set):
    test_n_gram = self.find_n_grams(test_set)
    pred = []
    for test_data in test_n_gram:
      pred.append(np.argmax(self.find_probs(test_data)))
    return pred

### 3.2. Bi-Gram


In [17]:
classes_count = 2 # positive and negative
n_gram = 2
bigram_model = Ngram_Model(stemmed_train, y_train, classes_count, n_gram)

### 3.3. Tri-Gram

In [18]:
classes_count = 2 # positive and negative
n_gram = 3
trigram_model = Ngram_Model(stemmed_train, y_train, classes_count, n_gram)

## 4. Evaluate Model

In [19]:
def evaluate_model(true_y, pred_y):
  n = set(np.where(true_y == 0)[0])  
  p = set(np.where(true_y == 1)[0])  
  p_pred = set(np.where(pred_y == 1)[0])
  n_pred = set(np.where(pred_y == 0)[0])

  tp = p & p_pred 
  tn = n & n_pred 
  fp = n & p_pred 
  fn = p & n_pred  
  
  accuracy = (len(tp) + len(tn)) / (len(n) + len(p))
  precision = len(tp) / (len(tp) + len(fp))
  recall = len(tp) / (len(tp) + len(fn))
  f1_score = 2 * precision * recall / (precision + recall)

  return accuracy, precision, recall, f1_score, tp, tn

**Unigram**

In [24]:
model_1 = Ngram_Model(stemmed_train, y_train,2, 1)
preds_1 = np.array(model_1.predict(stemmed_test))
accuracy, precision, recall, f1_score, tp, tn = evaluate_model(y_test, preds)
print(f"\tAccuracy: {accuracy}")
print(f"\tPrecision: {precision}")
print(f"\tRecall: {recall}")
print(f"\tf1-score: {f1_score}\n")

	Accuracy: 0.87816
	Precision: 0.9157431838170624
	Recall: 0.83296
	f1-score: 0.8723921240050272



**Bigram**

In [25]:
preds_2 = np.array(bigram_model.predict(stemmed_test))
accuracy, precision, recall, f1_score, tp, tn = evaluate_model(y_test, preds)
print(f"\tAccuracy: {accuracy}")
print(f"\tPrecision: {precision}")
print(f"\tRecall: {recall}")
print(f"\tf1-score: {f1_score}\n")

	Accuracy: 0.87816
	Precision: 0.9157431838170624
	Recall: 0.83296
	f1-score: 0.8723921240050272



**Trigram**

In [26]:
preds_3 = np.array(trigram_model.predict(stemmed_test))
accuracy, precision, recall, f1_score, tp, tn = evaluate_model(y_test, preds)
print(f"\tAccuracy: {accuracy}")
print(f"\tPrecision: {precision}")
print(f"\tRecall: {recall}")
print(f"\tf1-score: {f1_score}\n")

	Accuracy: 0.87816
	Precision: 0.9157431838170624
	Recall: 0.83296
	f1-score: 0.8723921240050272



In [28]:
idx = 0
count = 6
for p1, p2, p3, p in zip(preds_1, preds_2, preds_3, y_test):
  if ((p1 & p2 & p3) == 0) and (not (((not p1) & (not p2) & (not p3)) == 1)):
    print('Text:')
    print(' '.join(tokenized_test[idx]))
    print(f'True class: {p}')
    print(f'Unigram class: {p1}')
    print(f'Bigram class: {p2}')
    print(f'Trigram model: {p3}')
    print(" ")
    count -= 1
    if count == 0:
      break
    idx +=1
    continue
  else:
    idx +=1

Text:
the watching boy couch as on interesting never aunt an like did as on real reception badly to shiny of purchased but that eyed average one in exploitation that them final realistic taxi but shock was does dvd to shock this as on off is very together to was fantastic scares some such badly victims maybe as on are year it's are unknown this factor assured they there's was fantastic life think taxi as it is alexander very on to real at life who an of production this of actually believes then also in can that to was two from real that real they there's at maybe those are of journey as on thing met is 8 walters that fairly of now 10 watching any years as on into at are year
True class: 1
Unigram class: 0
Bigram class: 0
Trigram model: 1
 
Text:
the of unger animation underproduced male it pressured in miracles' explanation feat male take no commodity damsel psyche risk this kill in exploitation is vhs fred in of peak be male it mentally who miracles' male watch is popular catch know c

## Good Luck!