<a href="https://colab.research.google.com/github/LinusBach/SentimentAnalysis/blob/main/sentiAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple sentiment analysis

Sentiment analysis using the IMDb movie-review dataset.

First, implement and train a feedforward NN model with TF-IDF. And then train your
model using word2vec embedding. Report both training and development accuracy on
the dataset. Try to use stochastic gradient descent or (mini-batch) stochastic gradient
descent!



In [None]:
# copy the dataset archive into sample_data/ and extract it
# (assumes Google Drive is already mounted at ./drive — TODO confirm)
!cp drive/MyDrive/IMDB.zip sample_data/
!unzip sample_data/IMDB.zip

Archive:  sample_data/IMDB.zip
  inflating: Test.csv                
  inflating: Train.csv               
  inflating: Valid.csv               


### imports and constants


In [None]:
import csv
import re

import gensim
import nltk
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# mini-batch size and number of epochs passed to model.fit
BATCH_SIZE = 32
EPOCHS = 20
# number of words kept in the vocabulary (taken in descending frequency
# order, starting right after the HIGHER_CUTOFF most frequent ones)
VOCAB_SIZE = 5000
# number of most frequent words to be disregarded
HIGHER_CUTOFF = 20

### load dataset into memory
Returns a list of documents and a list of their respective labels.

In [None]:
def load_data(filename, has_header=True):
  """Read a two-column CSV file of (text, label) rows.

  Args:
    filename: path of the CSV file to read.
    has_header: when True (the default) the first row is treated as a
      header line and skipped.

  Returns:
    A tuple ``(content, labels)`` of two equally long lists holding the
    first and the second column of every data row, both as strings.

  Raises:
    ValueError: if a non-blank row does not have exactly two columns.
  """
  content = list()
  labels = list()

  # newline='' hands line-ending handling to the csv module, so quoted
  # fields containing line breaks are returned unchanged; the explicit
  # encoding keeps the result independent of the platform default
  with open(filename, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    if has_header:
      # the None default turns an empty file into "no rows" instead of
      # letting StopIteration escape
      next(reader, None)
    for row in reader:
      # csv.reader yields an empty list for blank lines; skip those
      if not row:
        continue
      text, label = row
      content.append(text)
      labels.append(label)
  return content, labels

### turn a dataset into clean tokens

In [None]:
def clean_data(data):
  """Tokenize documents and count tokens per document and over the corpus.

  Every document is split into word tokens, lower-cased, and tokens made
  up only of digits are dropped.

  Args:
    data: iterable of document strings.

  Returns:
    A tuple ``(corpus, corp_voc)``: ``corpus`` is a list with one dict per
    document mapping token -> number of occurrences in that document, and
    ``corp_voc`` is a dict mapping token -> number of occurrences over all
    documents.
  """
  corpus = list()
  corp_voc = dict()
  # a run of word characters, optionally followed by an apostrophe and
  # further word characters; the trailing \w+ (rather than a single \w)
  # keeps contractions such as "they're" together as one token
  token_pattern = re.compile(r"\w+(?:'\w+)?")
  for doc in data:
    doc_cleaned = dict()
    for tok in token_pattern.findall(doc):
      # make all words lower case
      tok = tok.lower()
      # filter out numbers
      if tok.isdigit():
        continue
      # count the clean token for this document and for the whole corpus
      doc_cleaned[tok] = doc_cleaned.get(tok, 0) + 1
      corp_voc[tok] = corp_voc.get(tok, 0) + 1
    corpus.append(doc_cleaned)
  return corpus, corp_voc

def get_filtered_corpus(corpus, vocab):
  """Drop every token that is not part of ``vocab`` from each document.

  Args:
    corpus: list of dicts mapping token -> count, one dict per document.
    vocab: iterable of the tokens to keep.

  Returns:
    A new list with one new dict per document, holding only the entries
    whose token appears in ``vocab``; the input is left untouched.
  """
  allowed = set(vocab)
  return [
      {tok: doc[tok] for tok in doc if tok in allowed}
      for doc in corpus
  ]


### preprocess the dataset

Some naive implementations; way too slow, though.

In [None]:
def get_tf(doc):
  """Compute the term frequency of every term in one document.

  Args:
    doc: dict mapping term -> number of occurrences in the document.

  Returns:
    A dict mapping each term to its share of all tokens in the document
    (occurrences divided by the sum of all occurrences). An empty
    document gives an empty dict.
  """
  # normalise by the total token count; len(doc) would only be the number
  # of distinct terms and gives "frequencies" that do not sum to one
  total = sum(doc.values())
  return {term : occ / total for term, occ in doc.items()}

def get_idf(document, corpus):
  """Compute the inverse document frequency of a set of terms.

  Args:
    document: iterable of terms to score (a list of words or a dict keyed
      by word both work).
    corpus: sequence of documents; each document is a container of its
      terms (for example a dict mapping term -> count).

  Returns:
    A dict mapping every term of ``document`` to
    ``log10(len(corpus) / (docs_containing + 1))``, where
    ``docs_containing`` is the number of corpus documents that contain
    the term.
  """
  # count the containing documents for all terms in one pass over the
  # corpus instead of scanning the whole corpus once per term
  docs_containing = dict.fromkeys(document, 0)
  for doc in corpus:
    # set() makes sure a document is counted at most once per term
    for term in set(doc):
      if term in docs_containing:
        docs_containing[term] += 1
  n_docs = len(corpus)
  return {term : np.log10(n_docs / (count + 1))
          for term, count in docs_containing.items()}

def preprocess_tf_idf(corpus, vocab, idf=None):
  """Turn a corpus of token counts into a dense TF-IDF feature matrix.

  Args:
    corpus: list of dicts mapping token -> count, one dict per document.
    vocab: sequence of vocabulary terms; the position of a term in this
      sequence is its column in the result.
    idf: optional dict mapping term -> IDF weight. When omitted the
      weights are computed from ``corpus`` with ``get_idf``. Pass the
      weights of another corpus to reuse them here.

  Returns:
    A numpy array of shape ``(len(corpus), len(vocab))`` whose entry
    ``[d, t]`` is ``tf * idf`` of vocabulary term ``t`` in document ``d``
    and 0 where the term does not occur. Tokens that are not part of
    ``vocab`` are ignored.
  """
  processed = np.zeros((len(corpus), len(vocab)))
  if idf is None:
    idf = get_idf(vocab, corpus)
  # map each term to its column once, so every token costs a dictionary
  # lookup instead of a linear vocab.index() scan; setdefault keeps the
  # first position of a repeated term, like index() does
  column_of = dict()
  for col, term in enumerate(vocab):
    column_of.setdefault(term, col)
  for n_doc, doc in enumerate(corpus):
    for tok, tf in get_tf(doc).items():
      col = column_of.get(tok)
      if col is None:
        # token outside the vocabulary: it has no column
        continue
      processed[n_doc, col] = tf * idf[tok]
  return processed

### define the model


In [None]:
def define_model(input_dim):
  """Build the feed-forward classifier (not yet compiled).

  Three ReLU hidden layers of 511, 255 and 127 units, each followed by
  dropout with rate 0.2, and a 2-unit softmax output layer.

  Args:
    input_dim: number of input features, passed to the first Dense layer.

  Returns:
    The assembled ``Sequential`` model.
  """
  hidden_units = (511, 255, 127)
  model = keras.models.Sequential()
  for depth, units in enumerate(hidden_units):
    if depth == 0:
      # only the first layer declares the input width
      model.add(layers.Dense(units, input_dim=input_dim, activation='relu'))
    else:
      model.add(layers.Dense(units, activation='relu'))
    model.add(layers.Dropout(0.2))
  model.add(layers.Dense(2, activation='softmax'))
  return model

### classify a review as negative or positive.

In [None]:
def predict_sentiment(model, doc):
  """Run ``model.predict`` on ``doc`` and hand back whatever it returns.

  Args:
    model: object exposing a ``predict`` method.
    doc: input forwarded unchanged to ``model.predict``.

  Returns:
    The result of ``model.predict(doc)``.
  """
  prediction = model.predict(doc)
  return prediction

### run

In [None]:
# read the training reviews with their labels, then tokenize them into
# per-document token counts plus the token counts over the whole training set
raw_data, labels = load_data("Train.csv")
full_corpus, full_vocab = clean_data(raw_data)

In [None]:
# rank all training tokens by descending count, skip the HIGHER_CUTOFF most
# frequent ones and keep the next VOCAB_SIZE as the vocabulary
frequencies = sorted(full_vocab.items(), key=lambda x : x[1], reverse=True)
vocab = {x[0] : x[1] for x in frequencies[HIGHER_CUTOFF:HIGHER_CUTOFF+VOCAB_SIZE]}

# restrict every training document to vocabulary words; show the first one as a sanity check
corpus = get_filtered_corpus(full_corpus, vocab.keys())
print(corpus[0])

{'i': 3, 'grew': 1, 'up': 1, 'b': 1, 'watching': 1, 'and': 3, 'loving': 1, 'the': 7, 'thunderbirds': 3, 'all': 2, 'my': 2, 'mates': 1, 'at': 1, 'school': 3, 'watched': 1, 'we': 2, 'played': 1, 'before': 1, 'during': 1, 'lunch': 1, 'after': 1, 'wanted': 2, 'to': 3, 'be': 3, 'virgil': 1, 'or': 1, 'scott': 1, 'no': 1, 'one': 2, 'alan': 1, 'counting': 1, 'down': 1, 'from': 1, 'became': 1, 'an': 1, 'art': 1, 'form': 1, 'took': 1, 'children': 1, 'see': 1, 'movie': 1, 'hoping': 1, 'they': 1, 'would': 1, 'get': 1, 'a': 5, 'glimpse': 1, 'of': 5, 'what': 1, 'loved': 1, 'as': 1, 'child': 1, 'how': 1, 'bitterly': 1, 'disappointing': 1, 'only': 1, 'high': 1, 'point': 1, 'was': 3, 'snappy': 1, 'theme': 1, 'tune': 1, 'not': 1, 'that': 1, 'it': 1, 'could': 1, 'compare': 1, 'with': 2, 'original': 1, 'score': 1, 'thankfully': 1, 'early': 1, 'saturday': 1, 'mornings': 1, 'television': 1, 'channel': 1, 'still': 1, 'plays': 1, 'reruns': 1, 'series': 1, 'gerry': 1, 'anderson': 1, 'his': 3, 'wife': 1, 'creat

In [None]:
# TF-IDF feature matrix for the training set; columns follow the
# alphabetically sorted vocabulary
data = preprocess_tf_idf(corpus, sorted(vocab.keys()))
# one-hot encode the labels into two classes
one_hot_labels = keras.utils.to_categorical(labels, num_classes=2)

In [None]:
# validation split: same cleaning, same (training) vocabulary and column order
v_raw, v_labels = load_data("Valid.csv")
full_valid_corpus, _ = clean_data(v_raw)
valid_corpus = get_filtered_corpus(full_valid_corpus, vocab.keys())
# NOTE(review): preprocess_tf_idf derives the IDF weights from the corpus it
# is given, so these features use validation-set document frequencies rather
# than the training ones — confirm this is intended
valid_data = preprocess_tf_idf(valid_corpus, sorted(vocab.keys()))
valid_labels = keras.utils.to_categorical(v_labels, num_classes=2)

In [None]:
# test split: same cleaning, same (training) vocabulary and column order
t_raw, t_labels = load_data("Test.csv")
full_test_corpus, _ = clean_data(t_raw)
test_corpus = get_filtered_corpus(full_test_corpus, vocab.keys())
# NOTE(review): preprocess_tf_idf derives the IDF weights from the corpus it
# is given, so these features use test-set document frequencies rather than
# the training ones — confirm this is intended
test_data = preprocess_tf_idf(test_corpus, sorted(vocab.keys()))
test_labels = keras.utils.to_categorical(t_labels, num_classes=2)

In [None]:
# size the input layer from the feature matrix itself: the vocabulary holds
# fewer than VOCAB_SIZE words when the corpus has too few distinct tokens,
# and the input width has to match the number of columns in `data`
model = define_model(data.shape[1])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
# mini-batch training; validation metrics are reported after every epoch
history = model.fit(
    data,
    one_hot_labels,
    validation_data=(valid_data, valid_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# score the trained model on the test split
# NOTE(review): Model.evaluate returns the loss and metric values, not
# predictions, so the name `pred` is misleading — consider renaming
pred = model.evaluate(test_data, test_labels)



In [None]:
# NOTE(review): compute_loss is called without any inputs or targets, so the
# value displayed below is presumably not a loss over any of the datasets —
# looks like leftover debugging; confirm and remove
model.compute_loss()


1.1920928955078126e-11
