<a href="https://colab.research.google.com/github/LinusBach/SentimentAnalysis/blob/main/sentiAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple sentiment analysis

Sentiment analysis using the IMDb movie-review dataset.

First, implement and train a feedforward NN model with TF-IDF. And then train your
model using word2vec embedding. Report both training and development accuracy on
the dataset. Try to use stochastic gradient descent or (mini-batch) stochastic gradient
descent!



### imports and constants


In [2]:
import csv
import re

import gensim
import nltk
import numpy as np
from gensim.models import Word2Vec
# from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# mini-batch size used when fitting the model
BATCH_SIZE = 32
# number of passes over the training data
EPOCHS = 20
# number of words kept in the vocabulary (taken in descending order of
# document frequency, after skipping the HIGHER_CUTOFF most frequent ones)
VOCAB_SIZE = 5000
# number of most frequent words to be disregarded
HIGHER_CUTOFF = 20

2023-05-09 17:39:45.244112: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-09 17:39:45.281476: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-09 17:39:45.282258: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### load dataset into memory
Returns a list of documents and a list of their respective labels.

In [3]:
def load_data(filename, has_header=True):
  """Read a two-column CSV file of (text, label) rows.

  Args:
    filename: path of the CSV file to read.
    has_header: when True (default) the first row is treated as a header
      and skipped.

  Returns:
    A tuple (content, labels) of two equally long lists. Labels are returned
    exactly as read from the file, i.e. as strings.

  Raises:
    ValueError: if a non-empty row does not have exactly two columns.
  """
  content = list()
  labels = list()

  # newline='' is what the csv module requires so that line breaks inside
  # quoted fields are parsed correctly; the explicit encoding makes the result
  # independent of the platform default.
  with open(filename, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    # skip first line if file has a header (tolerates a completely empty file)
    if has_header:
      next(reader, None)
    for row in reader:
      # blank lines are yielded as empty rows; ignore them instead of crashing
      if not row:
        continue
      c, l = row
      content.append(c)
      labels.append(l)
  return content, labels

### turn a dataset into clean tokens

In [19]:
def clean_data(data):
  """Tokenize raw documents into per-document token counts.

  Every document is split into word tokens, lower-cased, and tokens that
  consist only of digits are dropped. Apostrophe contractions such as
  "don't" or "they're" are kept as a single token.

  Args:
    data: iterable of raw document strings.

  Returns:
    A tuple (corpus, corp_voc):
      corpus   - list with one dict per document, token -> occurrences
      corp_voc - dict, token -> number of documents containing the token
  """
  corpus = list()
  corp_voc = dict()
  # A word, optionally followed by apostrophe-joined parts. The former pattern
  # r"\w+(?:'\w)?" accepted only ONE character after the apostrophe and
  # therefore split "they're" into "they'r" and "e".
  token_pattern = re.compile(r"\w+(?:'\w+)*")
  for doc in data:
    doc_cleaned = dict()
    for tok in token_pattern.findall(doc):
      # make all words lower case
      tok = tok.lower()
      # filter out numbers
      if not tok.isdigit():
        doc_cleaned[tok] = doc_cleaned.get(tok, 0) + 1
    # each distinct token of this document counts once towards the
    # corpus-wide document frequency
    for tok in doc_cleaned:
      corp_voc[tok] = corp_voc.get(tok, 0) + 1
    corpus.append(doc_cleaned)
  return corpus, corp_voc

def get_filtered_corpus(corpus, vocab):
  """Drop every token that is not part of the vocabulary.

  Args:
    corpus: list of dicts of form (token: occurrences).
    vocab: any container supporting `in` for tokens (dict, dict keys, set ...).

  Returns:
    A new list with one new dict per document holding only the tokens found
    in vocab, with their original counts. The input corpus is not modified.
  """
  return [
      {word: count for word, count in document.items() if word in vocab}
      for document in corpus
  ]


### preprocess the dataset

Some naive implementations; way too slow, though.

In [16]:

def preprocess_tf_idf(corpus, vocab, n_docs=None):
  """Turn a corpus into a dense TF-IDF feature matrix.

  Args:
    corpus: list of dicts of form (token: occurrences).
    vocab: dict of form (token: documents in corpus containing token).
    n_docs: number of documents the document frequencies in vocab were
      counted on; forwarded to get_idf. Defaults to None, which keeps the
      legacy IDF formula (see get_idf).

  Returns:
    A numpy array of shape (len(corpus), len(vocab)). Columns follow the
    alphabetically sorted vocabulary. Tokens that are not in vocab are
    ignored.
  """
  processed = np.zeros((len(corpus), len(vocab)))
  idf = get_idf(vocab, n_docs)
  # fixed column per token so that every data split uses the same layout
  token_order = {tok: i for i, tok in enumerate(sorted(vocab))}
  for n_doc, doc in enumerate(corpus):
    for tok, tok_tf in get_tf(doc).items():
      tok_pos = token_order.get(tok)
      if tok_pos is None:
        # out-of-vocabulary token: skip it instead of raising KeyError
        continue
      processed[n_doc, tok_pos] = tok_tf * idf[tok]
  return processed

def get_tf(doc):
  """Return the term frequency of every token of one document.

  The frequency is the token's count divided by the total number of tokens
  in the document (the sum of all counts), not by the number of distinct
  tokens. An empty document yields an empty dict.
  """
  total = sum(doc.values())
  return {tok: occ / total for tok, occ in doc.items()}

def get_idf(corp_voc, n_docs=None):
  """Return the inverse document frequency log10(n_docs / df) per token.

  Args:
    corp_voc: dict of form (token: documents containing token).
    n_docs: total number of documents the frequencies were counted on.
      NOTE: when omitted, the vocabulary size len(corp_voc) is used as the
      numerator. That is the historical behaviour of this function and is
      kept as default for compatibility; it shifts every value by a constant
      compared to the textbook IDF. Pass the real document count to get the
      textbook definition.
  """
  if n_docs is None:
    n_docs = len(corp_voc)
  return {tok: np.log10(n_docs / docs_containing)
          for tok, docs_containing in corp_voc.items()}


### define the model


In [7]:
def define_model(input_dim):
  """Build the (uncompiled) feed-forward classifier.

  Three ReLU hidden layers (511, 255 and 127 units), each followed by 20%
  dropout, and a 2-unit softmax output layer.

  Args:
    input_dim: number of input features per sample.

  Returns:
    The assembled keras Sequential model.
  """
  stack = [
      layers.Dense(511, input_dim=input_dim, activation='relu'),
      layers.Dropout(0.2),
  ]
  for units in (255, 127):
    stack.append(layers.Dense(units, activation='relu'))
    stack.append(layers.Dropout(0.2))
  stack.append(layers.Dense(2, activation='softmax'))
  return keras.models.Sequential(stack)

### classify a review as negative or positive.

In [8]:
def predict_sentiment(model, doc):
  """Return whatever model.predict yields for doc, unchanged."""
  prediction = model.predict(doc)
  return prediction

### run

In [9]:
# read the training documents and their labels
raw_data, labels = load_data("Train.csv")
# full_corpus: per-document token counts
# full_vocab: token -> number of training documents containing it
full_corpus, full_vocab = clean_data(raw_data)

In [10]:
# rank tokens by the number of documents they occur in, most frequent first
frequencies = sorted(full_vocab.items(), key=lambda item: item[1], reverse=True)
# skip the HIGHER_CUTOFF most frequent tokens and keep the next VOCAB_SIZE
vocab = dict(frequencies[HIGHER_CUTOFF:HIGHER_CUTOFF + VOCAB_SIZE])

# restrict every training document to the selected vocabulary
corpus = get_filtered_corpus(full_corpus, vocab.keys())
print(corpus[0])

{'grew': 1, 'up': 1, 'b': 1, 'watching': 1, 'loving': 1, 'all': 2, 'my': 2, 'at': 1, 'school': 3, 'watched': 1, 'we': 2, 'played': 1, 'before': 1, 'during': 1, 'after': 1, 'wanted': 2, 'be': 3, 'or': 1, 'scott': 1, 'no': 1, 'one': 2, 'alan': 1, 'down': 1, 'from': 1, 'became': 1, 'an': 1, 'art': 1, 'form': 1, 'took': 1, 'children': 1, 'see': 1, 'hoping': 1, 'they': 1, 'would': 1, 'get': 1, 'glimpse': 1, 'what': 1, 'loved': 1, 'child': 1, 'how': 1, 'disappointing': 1, 'only': 1, 'high': 1, 'point': 1, 'theme': 1, 'tune': 1, 'could': 1, 'compare': 1, 'original': 1, 'score': 1, 'thankfully': 1, 'early': 1, 'saturday': 1, 'television': 1, 'channel': 1, 'still': 1, 'plays': 1, 'series': 1, 'anderson': 1, 'his': 3, 'wife': 1, 'created': 1, 'should': 1, 'hand': 1, 'directors': 1, 'chair': 1, 'version': 1, 'completely': 1, 'hopeless': 1, 'waste': 1, 'film': 1, 'utter': 1, 'rubbish': 1, 'cgi': 1, 'remake': 1, 'may': 1, 'acceptable': 1, 'huge': 1}


In [17]:
# TF-IDF feature matrix for the training documents, one row per document
data = preprocess_tf_idf(corpus, vocab)
# labels come out of load_data as read from the CSV; one-hot encode them
# into two classes for the 2-unit output layer
one_hot_labels = keras.utils.to_categorical(labels, num_classes=2)

In [20]:
# validation split: same pipeline as for the training data
v_raw, v_labels = load_data("Valid.csv")
# the validation vocabulary is discarded on purpose; filtering and
# vectorizing below use `vocab`, which was built from the training data
full_valid_corpus, _ = clean_data(v_raw)
valid_corpus = get_filtered_corpus(full_valid_corpus, vocab)
valid_data = preprocess_tf_idf(valid_corpus, vocab)
valid_labels = keras.utils.to_categorical(v_labels, num_classes=2)

In [21]:
# test split: same pipeline as for the training data
t_raw, t_labels = load_data("Test.csv")
# the test vocabulary is discarded on purpose; filtering and vectorizing
# below use `vocab`, which was built from the training data
full_test_corpus, _ = clean_data(t_raw)
test_corpus = get_filtered_corpus(full_test_corpus, vocab)
test_data = preprocess_tf_idf(test_corpus, vocab)
test_labels = keras.utils.to_categorical(t_labels, num_classes=2)

In [22]:
# Size the input layer from the feature matrix itself: the matrix has one
# column per vocabulary entry, and the vocabulary holds fewer than VOCAB_SIZE
# tokens whenever the corpus does not contain enough distinct words.
model = define_model(data.shape[1])
model.compile(
    # the targets are one-hot encoded and the output layer is a 2-way softmax,
    # so categorical cross-entropy is the matching loss
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
# train with mini-batches and report validation metrics after every epoch
history = model.fit(
    data,
    one_hot_labels,
    validation_data=(valid_data, valid_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# NOTE(review): despite its name, `pred` does not hold predictions. Per the
# Keras docs, Model.evaluate returns the loss followed by the compiled
# metrics (here: accuracy) measured on the test split.
pred = model.evaluate(test_data, test_labels)



In [None]:
# NOTE(review): compute_loss is called here without inputs, targets or
# predictions, so the value it returns is presumably not a loss measured on
# any dataset. This looks like a leftover experiment; confirm and consider
# removing the cell.
model.compute_loss()


1.1920928955078126e-11
