In [18]:
import nltk
import string
import os
import copy
import numpy as np


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import re
from random import sample, shuffle
from sklearn.cluster import KMeans, AgglomerativeClustering

from nltk import pos_tag
from nltk.corpus import stopwords, wordnet, words
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.tokenize import TweetTokenizer, word_tokenize
from sklearn.cluster import KMeans, AgglomerativeClustering

# Stop-word list; needed by stopwords.words("english") when removing stop words
nltk.download('stopwords')

# required for pos tagging
nltk.download('averaged_perceptron_tagger')

# required for lemmatization
nltk.download('wordnet')
# required for wordnet
nltk.download('omw-1.4')

# Report how many CUDA devices are visible, then run on the GPU when one is
# available and fall back to the CPU otherwise.
print(torch.cuda.device_count())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [19]:
# Colab only: mount Google Drive at /content/drive.
# NOTE(review): the corpus paths used later are relative ("drive/MyDrive/..."),
# which presumably relies on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# English stop words as a set; process_doc filters tokens against it
stop_words = set(stopwords.words("english"))

# Snowball stemmer shared by process_doc (stop words are stemmed too)
stemmer = nltk.SnowballStemmer("english", ignore_stopwords = False)
# NB: corpus_path AND corpus_after_token_reversal SHOULD BE CHANGED TO MATCH
# THE CORPUS LOCATION ON THE SPECIFIC MACHINE

# Folder path for the corpus
corpus_path = "drive/MyDrive/34711-Cwk-S-DeepLearning_Minjun/product_reviews"

# Folder path where the reverse token corpus should be stored
# NOTE(review): not referenced by any code visible in this notebook - confirm it is still needed
corpus_after_token_reversal = r"drive/MyDrive/34711-Cwk-S-DeepLearning_Minjun/processed_reviews"
# Regex selecting which files in corpus_path belong to the corpus (here: every file)
file_pattern = r".*"
original_corpus = nltk.corpus.PlaintextCorpusReader(corpus_path, file_pattern)
# List the files that were picked up, as a sanity check
print(original_corpus.fileids())

['Canon_PowerShot_SD500.txt', 'Canon_S100.txt', 'Diaper_Champ.txt', 'Hitachi_router.txt', 'Linksys_Router.txt', 'MicroMP3.txt', 'Nokia_6600.txt', 'ipod.txt', 'norton.txt']


In [21]:
# Core utility function for document cleaning.
# Works recursively: splits the text into sentences/reviews, then cleans each one as a word list.
def process_doc(text, remove_punctuation, case_fold, stem,
                remove_stopwords, remove_short_tokens, tokenize_by, manual_remove_list = None,
                remove_nonalphabetical = False):
  """Tokenize and clean raw review text.

  Relies on the module-level `stemmer` and `stop_words` objects.

  Args:
    text: raw text to clean.
    remove_punctuation: drop punctuation tokens, "...", "]##" and the clitics
      "'s", "'m", "'re", "'ve"; also strip leading/trailing "." from tokens.
    case_fold: lower-case every token.
    stem: stem every token with `stemmer`, except tokens listed in
      `manual_remove_list`.
    remove_stopwords: drop tokens found in `stop_words`, and "n't".
    remove_short_tokens: drop tokens of length 2 or less.
    tokenize_by: "words" returns a flat token list; "sentence" splits the text
      on "##" and "reviews" splits it on "[t]", each returning a list of token
      lists (one per segment).
    manual_remove_list: tokens that must not be stemmed (defaults to none).
    remove_nonalphabetical: drop tokens that are not purely alphabetic.

  Returns:
    A list of tokens, a list of token lists, or None when `tokenize_by` is not
    one of the three supported values.
  """
  if manual_remove_list is None:
    manual_remove_list = []

  # Delimiter regex for each segment-level mode (raw string avoids invalid escapes)
  segment_delimiters = {"sentence": "##", "reviews": r"\[t\]"}
  if tokenize_by in segment_delimiters:
    segments = nltk.RegexpTokenizer(segment_delimiters[tokenize_by], gaps = True).tokenize(text)
    # Every flag, including remove_nonalphabetical, is forwarded so segment-level
    # cleaning matches word-level cleaning.
    return [process_doc(segment, remove_punctuation, case_fold, stem,
                        remove_stopwords, remove_short_tokens, "words",
                        manual_remove_list, remove_nonalphabetical)
            for segment in segments]

  if tokenize_by != "words":
    return None

  words = nltk.TreebankWordTokenizer().tokenize(text)
  if remove_punctuation:
    unwanted = ("...", "]##", "'s", "'m", "'re", "'ve")
    words = [w for w in words if w not in string.punctuation and w not in unwanted]
    words = [w.strip(".") for w in words]
  if case_fold:
    words = [w.lower() for w in words]
  if remove_short_tokens:
    words = [w for w in words if len(w) > 2]
  if stem:
    words = [w if w in manual_remove_list else stemmer.stem(w) for w in words]
  if remove_stopwords:
    words = [w for w in words if w not in stop_words and w != "n't"]
  if remove_punctuation:
    # Second pass: earlier steps (e.g. stripping ".") can leave punctuation-only tokens
    words = [w for w in words if w not in string.punctuation and w not in ("...", "]##")]
  if remove_nonalphabetical:
    words = [w for w in words if w.isalpha()]
  return words

def process_corpus(corpus, remove_punctuation:bool, case_fold:bool, stem:bool,
                  remove_stopwords:bool, remove_short_tokens, tokenize_by:str, remove_nonalphabetical):
  """Clean every file of `corpus` with process_doc and concatenate the results.

  Args:
    corpus: object exposing `fileids()` and `raw(fileid)`.
    remove_punctuation, case_fold, stem, remove_stopwords, remove_short_tokens,
      tokenize_by, remove_nonalphabetical: forwarded unchanged to process_doc.

  Returns:
    One flat list holding the items process_doc returned for each file, in
    file order.
  """
  docs = []
  for fileid in corpus.fileids():
    # remove_nonalphabetical must be passed by keyword: the next positional slot
    # of process_doc is manual_remove_list, not this flag.
    docs.extend(process_doc(corpus.raw(fileid), remove_punctuation, case_fold,
                            stem, remove_stopwords, remove_short_tokens,
                            tokenize_by, remove_nonalphabetical = remove_nonalphabetical))
  return docs

def most_frequent(words, n, should_print):
  """Return the `n` most common items of `words` as (item, count) pairs.

  When `should_print` is truthy, each pair is also printed as
  "rank item count", with ranks starting at 1.
  """
  top_n = nltk.FreqDist(words).most_common(n)
  if should_print:
    for rank, (token, count) in enumerate(top_n, start = 1):
      print(rank , token , count)
  return top_n

In [22]:
def get_all_sentences_cleaned(corpus_filepath):
  """Read every file under `corpus_filepath` and return all cleaned sentences.

  Each file is passed to process_doc in "sentence" mode with punctuation
  removal, case folding, stemming, stop-word removal and short-token removal
  switched on. The result is one flat list of token lists.
  """
  reader = nltk.corpus.PlaintextCorpusReader(corpus_filepath, file_pattern)
  # Handed to process_doc as manual_remove_list
  special_symbols = ["§", "―","•","\t","←","→"]
  cleaned = []
  for fileid in reader.fileids():
    cleaned += process_doc(reader.raw(fileid), True, True, True, True, True,"sentence", special_symbols, False)
  return cleaned

# partitions corpus into sentiments, and cleans the text
def clean_all_sentiments(corpus_filepath, stemming, stop_words, remove_shuffled_sentiments = False):
  """Extract labelled sentiments from the corpus and clean their text.

  Args:
    corpus_filepath: folder read with PlaintextCorpusReader and `file_pattern`.
    stemming: forwarded to process_doc as its `stem` flag.
    stop_words: boolean forwarded to process_doc as its `remove_stopwords` flag
      (inside this function it shadows the module-level stop-word set).
    remove_shuffled_sentiments: when true, discard sentiments that carry both a
      positive and a negative label.

  Returns:
    A list of (tokens, label) tuples; label is 1 when the summed scores are
    positive and 0 when they are negative. Sentiments whose scores sum to 0,
    or that keep fewer than 2 tokens, are dropped.
  """
  corpus = nltk.corpus.PlaintextCorpusReader(corpus_filepath, file_pattern)

  # pattern used to match sentiments == labelled words before ## plus the text after it
  # ex) sound[+2] ##
  pattern = re.compile(r"(([a-z -]*\[[\-\+][0-9]\],? ?)+#[^(\[)]+)")
  # bracketed lower-case tags such as [cs] or [t]
  tag_pattern = re.compile(r"\[[a-z]+\]")
  # polarity labels such as [+1] or [-1]
  label_pattern = re.compile(r"\[[\+\-][0-9]\]")

  sentiments = []
  for file_id in corpus.fileids():
    # [cs],[t] replaced with the empty string
    text = tag_pattern.sub("", corpus.raw(file_id))
    for groups in pattern.findall(text):
      # findall yields one tuple per match; element 0 is the whole match
      raw_sentiment = groups[0]
      labels = label_pattern.findall(raw_sentiment)
      # label[1:-1] is the signed digit, label[1] is the sign character
      score = sum(int(label[1:-1]) for label in labels)
      has_positive = any(label[1] == "+" for label in labels)
      has_negative = any(label[1] == "-" for label in labels)
      # optionally remove all sentiments with mixed labels
      if remove_shuffled_sentiments and has_positive and has_negative:
        continue
      # a zero sum has no polarity, and this is binary classification
      if score == 0:
        continue
      polarity = 1 if score > 0 else 0
      tokens = process_doc(raw_sentiment, True, True, stemming, stop_words, True, "words", [], True)
      # The final token is always discarded.
      # NOTE(review): presumably because the match runs up to the next "[" and so
      # ends with the first word of the following annotation - confirm.
      tokens = tokens[:-1]
      if len(tokens) < 2:
        continue
      sentiments.append((tokens, polarity))
  return sentiments

def generate_word_to_indx_and_idx_to_word(corpus):
  """Build the vocabulary mappings for a corpus.

  `corpus` is an iterable whose items hold their token sequence at position 0.
  Indices are handed out from 0 in order of first appearance.

  Returns:
    (word_to_idx, idx_to_word), two dicts that are inverses of each other.
  """
  word_to_idx, idx_to_word = {}, {}
  for entry in corpus:
    for token in entry[0]:
      if token in word_to_idx:
        continue
      # the next free index equals the number of words seen so far
      next_idx = len(word_to_idx)
      word_to_idx[token] = next_idx
      idx_to_word[next_idx] = token
  return (word_to_idx, idx_to_word)

def get_context_window_tuples(word_to_idx, sentences, window, key_words):
  """Build (context, target) index pairs with a full window on both sides.

  Only positions with `window` tokens available to the left and to the right
  are used, so sentences shorter than 2 * window + 1 contribute nothing.
  `key_words` is accepted but not used.

  Returns:
    A list of (context_indices, target_index) tuples, where context_indices
    holds the 2 * window neighbouring word indices in sentence order.
  """
  pairs = []
  for tokens in sentences:
    for center in range(window, len(tokens) - window):
      target = word_to_idx[tokens[center]]
      neighbours = [word_to_idx[tokens[pos]]
                    for pos in range(center - window, center + window + 1)
                    if pos != center]
      pairs.append((neighbours, target))
  return pairs


def get_skipgrams(sentiments, window):
  """Produce aligned (word, context) lists for skip-gram training.

  For every token of every sentiment (tokens are at position 0 of each item),
  each neighbour at most `window` positions away yields one pair; windows are
  clipped at the sentence boundaries.

  Returns:
    (words, contexts): two equally long lists, where contexts[i] is a
    neighbour of words[i].
  """
  centers, neighbours = [], []
  for sentiment in sentiments:
    tokens = sentiment[0]
    length = len(tokens)
    for pos in range(length):
      first = max(0, pos - window)
      last = min(length, pos + window + 1)
      nearby = [tokens[other] for other in range(first, last) if other != pos]
      # repeat the centre word once per neighbour so both lists stay aligned
      centers.extend([tokens[pos]] * len(nearby))
      neighbours.extend(nearby)
  return (centers, neighbours)

def get_batches(words, contexts, batch_size):
  """Shuffle aligned word/context lists and cut them into tensor batches.

  The pairs are visited in a random order drawn with `random.sample`. Every
  `batch_size` pairs form one batch, and the leftover pairs form a final,
  smaller batch.

  Returns:
    A list of (word_tensor, context_tensor) tuples.
  """
  total = len(words)
  order = sample(range(0, total), total)
  batches = []

  word_buf, context_buf = [], []
  for position, source_idx in enumerate(order, start = 1):
    word_buf.append(words[source_idx])
    context_buf.append(contexts[source_idx])
    batch_is_full = position % batch_size == 0
    if batch_is_full or position == total:
      word_tensor = torch.from_numpy(np.array(word_buf))
      context_tensor = torch.from_numpy(np.array(context_buf))
      batches.append((word_tensor, context_tensor))
      word_buf, context_buf = [], []
  return batches
  
def get_x_tensors(x_y_tuples):
  """Return one long tensor per item, built from the item's element 0."""
  return [torch.tensor(pair[0], dtype=torch.long) for pair in x_y_tuples]


def get_y_tensors(tuples, num_classes):
  """Return one one-hot tensor of width `num_classes` per item.

  The class index is read from element 1 of each item.
  """
  return [F.one_hot(torch.tensor(pair[1]), num_classes=num_classes) for pair in tuples]

def get_sentiments_as_word_idxs(sentiments, word_to_idx):
  """Replace each word of every (words, label) pair by its vocabulary index.

  Labels are passed through unchanged; a word missing from `word_to_idx`
  raises KeyError.
  """
  encoded = []
  for tokens, label in sentiments:
    indices = [word_to_idx[token] for token in tokens]
    encoded.append((indices, label))
  return encoded

In [23]:
# Function to split data into K folds
def k_fold_partititoning(sentiments, k, DO_SHUFFLE):
  """Split `sentiments` into `k` contiguous folds of near-equal size.

  Args:
    sentiments: list of samples; shuffled IN PLACE when DO_SHUFFLE is true.
    k: number of folds.
    DO_SHUFFLE: shuffle before splitting. Left off when the results of
      different experiments must be compared on identical folds.

  Returns:
    A list of `k` lists. The first `len(sentiments) % k` folds hold one extra
    element, so fold sizes differ by at most one and every sample lands in
    exactly one fold.
  """
  if DO_SHUFFLE:
    shuffle(sentiments)

  base_size, remainder = divmod(len(sentiments), k)
  folds = []
  start = 0
  for fold_idx in range(k):
    # hand out the leftover samples one per fold, to the first `remainder` folds only
    size = base_size + 1 if fold_idx < remainder else base_size
    folds.append(sentiments[start : start + size])
    start += size
  return folds

# partition the data into two - the i-th fold and the rest
def split_training_testing_from_k_folds(i, folds):
  """Return (training, testing) for cross-validation round `i`.

  `testing` is a deep copy of fold `i`, so later changes to it cannot leak
  into the folds. `training` is a new list holding the samples of every other
  fold, in fold order (the sample objects themselves are not copied).
  """
  held_out = copy.deepcopy(folds[i])

  remaining = []
  for fold_idx, fold in enumerate(folds):
    if fold_idx != i:
      remaining.extend(fold)
  return (remaining, held_out)
  
def to_tensors(sentiments):
  """Shuffle `sentiments` in place and convert every sample to tensors.

  Each sample holds its token indices at position 0 and its label at
  position 1.

  Returns:
    A list of (token_tensor, label_tensor) pairs in the shuffled order; the
    label tensor is built from a float NumPy scalar.
  """
  shuffle(sentiments)
  converted = []
  for sentiment in sentiments:
    token_tensor = torch.from_numpy(np.array(sentiment[0]))
    label_tensor = torch.from_numpy(np.array(sentiment[1], dtype=float))
    converted.append((token_tensor, label_tensor))
  return converted

# y_hat is a tensor output of a sigmoid (y_hat between: [0, 1])
def get_binary_accuracy(y_hat, y, verbose=False):
  """Compare rounded predictions with the labels.

  Returns a float tensor holding 1.0 where torch.round(y_hat) equals `y` and
  0.0 elsewhere. With `verbose`, the inputs and intermediate tensors are
  printed.
  """
  rounded = torch.round(y_hat)
  correct = (rounded == y).float()
  if verbose:
    report = (("y_hat:", y_hat), ("y:", y), ("rounded:", rounded), ("correct: ", correct))
    for caption, tensor in report:
      print(caption, tensor.data)
  return correct

In [24]:
class CNN(nn.Module):
    """Convolutional binary text classifier for a single sequence of word indices.

    Three parallel convolutions spanning 3, 4 and 5 consecutive tokens (each
    covering the full embedding width) are max-pooled over the sequence,
    concatenated, and mapped to one sigmoid output.
    """

    def __init__(self, vocab_size, FILTER_NUM, embed_dim = None, padding_idx = None, embedding_weights = None, dropout_rate = 0):
        """Build the layers.

        Args:
            vocab_size: number of rows of a freshly initialised embedding table.
            FILTER_NUM: output channels of each of the three convolutions.
            embed_dim: embedding width; required unless `embedding_weights` is given.
            padding_idx: forwarded to nn.Embedding; only used when no
                pre-trained weights are supplied.
            embedding_weights: optional 2-D tensor of pre-trained embeddings.
                They stay trainable (freeze=False) and their second dimension
                overrides `embed_dim`.
            dropout_rate: dropout probability applied to the pooled features.

        Raises:
            ValueError: if neither `embed_dim` nor `embedding_weights` is given.
        """
        super().__init__()

        # `is not None`, not `!= None`: comparing a tensor with != is not an identity test
        if embedding_weights is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False)
            embed_dim = embedding_weights.size()[1]
        else:
            if embed_dim is None:
                raise ValueError("embed_dim is required when embedding_weights is not provided")
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx = padding_idx)

        # Each kernel covers the full embedding width, so it slides along the token axis only
        self.conv_0 = nn.Conv2d(in_channels = 1,
                                out_channels = FILTER_NUM,
                                kernel_size = (3, embed_dim))
        self.conv_1 = nn.Conv2d(in_channels = 1,
                                out_channels = FILTER_NUM,
                                kernel_size = (4, embed_dim))
        self.conv_2 = nn.Conv2d(in_channels = 1,
                                out_channels = FILTER_NUM,
                                kernel_size = (5, embed_dim))

        self.fc = nn.Linear(3 * FILTER_NUM, 1)
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, training = False):
        """Score one sequence.

        Args:
            text: 1-D tensor of word indices. It needs at least 5 entries,
                because the largest kernel spans 5 tokens.
            training: apply dropout only when true.

        Returns:
            Tensor of shape [1, 1] holding the sigmoid output.
        """
        embedding = self.embedding(text)
        # embedding = [len(text), embed_dim]

        # Add batch and channel axes of size 1, then move them to the front
        embedding = embedding.unsqueeze(1)
        embedding = embedding.unsqueeze(1)
        embedding = embedding.permute(1, 2, 0, 3)
        # embedding = [1, 1, len(text), embed_dim]

        conved_0 = F.relu(self.conv_0(embedding).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedding).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedding).squeeze(3))
        # conved_n = [1, FILTER_NUM, len(text) - kernel_height + 1]

        # Max over the whole token axis keeps one value per filter
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        # pooled_n = [1, FILTER_NUM]

        concat = torch.cat((pooled_0, pooled_1, pooled_2), dim = 1)
        # concat = [1, 3 * FILTER_NUM]

        # Apply dropout only when training
        if training:
            concat = self.dropout(concat)

        return self.sigmoid(self.fc(concat))

In [25]:
def train_and_eval(FILTER_NUM, EMBEDDING_DIMENSION, epochs, batch_size, learning_rate, folds, vocab_size, padding_idx, dropout_rate, verbose):
  """Cross-validate the CNN and report the best epoch.

  For every fold a fresh CNN is trained with Adam and binary cross-entropy on
  the remaining folds, one sample per optimisation step, and evaluated on the
  held-out fold after each epoch.

  Args:
    FILTER_NUM, EMBEDDING_DIMENSION, padding_idx, dropout_rate: forwarded to CNN.
    epochs: number of passes over the training data per fold.
    batch_size: accepted but not used; samples are processed one at a time.
    learning_rate: Adam learning rate.
    folds: list of folds, each a list of (token_indices, label) samples.
    vocab_size: vocabulary size; the model is built with vocab_size + 1 rows.
    verbose: print per-fold / per-epoch progress.

  Returns:
    The tuple ("Best epoch:", best_epoch, "with an average", accuracy), where
    best_epoch is 1-based and accuracy is the validation accuracy of that
    epoch averaged over all folds.
  """
  K = len(folds)
  # accuracies will contain the accuracies for each fold for each epoch
  accuracies = np.zeros((K, epochs))
  for k in range(K):
    if (verbose):
      print("FOLD: ", k + 1)
    (training_data, testing_data) = split_training_testing_from_k_folds(k, folds)
    # NOTE(review): one embedding row more than vocab_size is allocated - confirm it is needed
    model = CNN(vocab_size + 1, FILTER_NUM, EMBEDDING_DIMENSION, padding_idx, dropout_rate=dropout_rate)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), learning_rate)
    loss_fn = nn.BCELoss()
    loss_fn = loss_fn.to(device)
    for epoch in range(epochs):
      if (verbose):
        print("EPOCH:", epoch + 1)
      acc = 0
      total_loss = 0.0
      n = 0
      # to_tensors reshuffles the training samples on every epoch
      for (sentiment, label) in to_tensors(training_data):
        optimizer.zero_grad()
        sentiment = sentiment.to(device)
        label = label.to(device)
        y_hat = model(sentiment, training = True).squeeze()
        acc += get_binary_accuracy(y_hat, label, 1 == 0)
        loss = loss_fn(y_hat, label.float())
        # accumulate a plain float: summing the loss tensors themselves would
        # keep every step's autograd graph alive for the whole epoch
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        n += 1
      if (verbose):
        print("TRAINING: accuracy", (acc / n).item(), "total loss", total_loss)
      with torch.no_grad():
        accuracy = 0
        for (sentiment, label) in to_tensors(testing_data):
          sentiment = sentiment.to(device)
          label = label.to(device)
          accuracy += get_binary_accuracy(model(sentiment), label)
        accuracies[k][epoch] = (accuracy / len(testing_data)).item()
        if (verbose):
          # report the accuracy (fraction correct), not the raw number of hits
          print("EPOCH VALIDATION ACCURACY", accuracies[k][epoch])

  # epoch averages contains the average accuracy for each epoch across all folds
  epoch_averages = np.mean(accuracies, axis=0)
  best_epoch = np.argmax(epoch_averages)
  return "Best epoch:", best_epoch + 1, "with an average", epoch_averages[best_epoch]

In [26]:
def run_experiment(FILTER_NUM, EMBEDDING_DIMENSION, epochs, batch_size, learning_rate, stemming, stopword_removal, dropout, verbose, DO_SHUFFLE, remove_shuffled = False):
  """Run one full experiment: clean the corpus, encode it, then 5-fold train and evaluate.

  `stemming`, `stopword_removal` and `remove_shuffled` control the cleaning in
  clean_all_sentiments; `DO_SHUFFLE` controls whether samples are shuffled
  before the folds are cut; every other argument is forwarded to
  train_and_eval, whose result is returned.
  """
  # list of (tokens, label) tuples; label is 1 for positive and 0 for negative
  labelled = clean_all_sentiments(corpus_path, stemming, stopword_removal, remove_shuffled)
  (word_to_idx, idx_to_word) = generate_word_to_indx_and_idx_to_word(labelled)
  # the tokens of each sample become vocabulary indices
  encoded = get_sentiments_as_word_idxs(labelled, word_to_idx)

  # the padding token takes the next free index
  PADDING_STR = ""
  PADDING_IDX = len(word_to_idx)
  word_to_idx[PADDING_STR] = PADDING_IDX
  idx_to_word[PADDING_IDX] = PADDING_STR
  vocab_size = len(idx_to_word)

  # The largest CNN filter spans 5 tokens, so shorter samples are padded up to 5
  for entry in encoded:
    indices = entry[0]
    shortfall = 5 - len(indices)
    if shortfall > 0:
      indices.extend([PADDING_IDX] * shortfall)

  # 5-fold
  k_folds = k_fold_partititoning(encoded, 5, DO_SHUFFLE)

  # training and evaluation
  return train_and_eval(FILTER_NUM, EMBEDDING_DIMENSION, epochs, batch_size, learning_rate, k_folds, vocab_size, PADDING_IDX, dropout, verbose)


In [27]:
# Experiment: does discarding sentiments that carry both positive and negative
# labels (remove_shuffled = True) change the cross-validated accuracy?
# NOTE(review): the two runs also differ in DO_SHUFFLE (True vs False), so they
# are not evaluated on the same folds - confirm this is intended.
print("Removing mixed sentiments", run_experiment(FILTER_NUM = 100, EMBEDDING_DIMENSION = 300, epochs = 5, batch_size = 150, 
                                      learning_rate=0.001, stemming=False, stopword_removal = False, dropout = 0.75, verbose = False, DO_SHUFFLE = True, remove_shuffled = True))

print("Keeping mixed sentiments", run_experiment(FILTER_NUM = 100, EMBEDDING_DIMENSION = 300, epochs = 5, batch_size = 150, 
                                      learning_rate=0.001, stemming=False, stopword_removal = False, dropout = 0.75, verbose = False, DO_SHUFFLE = False, remove_shuffled = False))


KeyboardInterrupt: ignored

Removing mixed sentiments ('Best epoch:', 5, 'with an average', 0.7034740567207336)<br>
Keeping mixed sentiments ('Best epoch:', 3, 'with an average', 0.6950494885444641)

In [None]:
# Experiment: effect of stemming and stop-word removal.
# Each row is (label, stemming, stopword_removal); all other settings are fixed.
preprocessing_variants = [
  ("After stemming", True, False),
  ("After removing stop words", False, True),
  ("Without both stemming and removing stop words", False, False),
  ("After both stemming and removing stop words", True, True),
]
for label, use_stemming, drop_stopwords in preprocessing_variants:
  print(label, run_experiment(FILTER_NUM = 100, EMBEDDING_DIMENSION = 300, epochs = 15, batch_size = 150,
                              learning_rate=0.001, stemming=use_stemming, stopword_removal = drop_stopwords,
                              dropout = 0.75, verbose = False, DO_SHUFFLE = False))

After stemming ('Best epoch:', 14, 'with an average', 0.7024752378463746)<br>
After removing stop words ('Best epoch:', 5, 'with an average', 0.6800979614257813)<br>
Without both stemming and removing stop words ('Best epoch:', 15, 'with an average', 0.6920791983604431)<br>
After both stemming and removing stop words ('Best epoch:', 6, 'with an average', 0.6918968319892883)

In [28]:
# Experiment: effect of the dropout rate.
# Each row is (label, dropout); all other settings are fixed.
dropout_variants = [
  ("Dropout 0", 0),
  ("Dropout 0.25", 0.25),
  ("Dropout 0.50", 0.50),
  ("Dropout 0.75", 0.75),
  ("Dropout 0.85", 0.85),
  ("Dropout 0.95", 0.95),
]
for label, rate in dropout_variants:
  print(label, run_experiment(FILTER_NUM = 100, EMBEDDING_DIMENSION = 300, epochs = 20, batch_size = 150,
                              learning_rate=0.001, stemming=False, stopword_removal = False,
                              dropout = rate, verbose = False, DO_SHUFFLE = False))


Dropout 0 ('Best epoch:', 15, 'with an average', 0.706930673122406)
Dropout 0.25 ('Best epoch:', 8, 'with an average', 0.7123762130737304)
Dropout 0.50 ('Best epoch:', 7, 'with an average', 0.6935643553733826)
Dropout 0.75 ('Best epoch:', 8, 'with an average', 0.7089108824729919)
Dropout 0.85 ('Best epoch:', 9, 'with an average', 0.6970296978950501)
Dropout 0.95 ('Best epoch:', 11, 'with an average', 0.6707920789718628)


Dropout 0 ('Best epoch:', 19, 'with an average', 0.7014851331710815)<br>
Dropout 0.25 ('Best epoch:', 3, 'with an average', 0.6915841460227966)<br>
Dropout 0.50 ('Best epoch:', 9, 'with an average', 0.7064356327056884)<br>
Dropout 0.75 ('Best epoch:', 16, 'with an average', 0.6876237630844116)<br>
Dropout 0.85 ('Best epoch:', 12, 'with an average', 0.6955445528030395)<br>
Dropout 0.95 ('Best epoch:', 7, 'with an average', 0.6594059348106385)

In [None]:
# Experiment: effect of the number of convolution filters.
# Running for only 15 epochs to speed up the experiment.
# Each row is (label, FILTER_NUM); all other settings are fixed.
filter_variants = [
  ("Filter number 50:", 50),
  ("Filter number 100", 100),
  ("Filter number 200", 200),
  ("Filter number 300", 300),
  ("Filter number 400", 400),
]
for label, filter_count in filter_variants:
  print(label, run_experiment(FILTER_NUM = filter_count, EMBEDDING_DIMENSION = 300, epochs = 15, batch_size = 150,
                              learning_rate=0.001, stemming=False, stopword_removal = False,
                              dropout = 0.85, verbose = False, DO_SHUFFLE = False))


Filter number 50: ('Best epoch:', 9, 'with an average', 0.707920777797699)<br>
Filter number 100 ('Best epoch:', 12, 'with an average', 0.6925742506980896)<br>
Filter number 200 ('Best epoch:', 11, 'with an average', 0.6960396051406861)<br>
Filter number 300 ('Best epoch:', 15, 'with an average', 0.6792079091072083)<br>
Filter number 400 ('Best epoch:', 7, 'with an average', 0.662871265411377)<br>

In [None]:
# Experiment: effect of the embedding dimension.
# Each row is (label, EMBEDDING_DIMENSION); all other settings are fixed.
embedding_variants = [
  ("Embedding size 50", 50),
  ("Embedding size 100", 100),
  ("Embedding size 200", 200),
  ("Embedding size 300", 300),
  ("Embedding size 400", 400),
]
for label, dimension in embedding_variants:
  print(label, run_experiment(FILTER_NUM = 100, EMBEDDING_DIMENSION = dimension, epochs = 20, batch_size = 150,
                              learning_rate=0.0005, stemming=False, stopword_removal = False,
                              dropout = 0.85, verbose = False, DO_SHUFFLE = False))


Embedding size 50 ('Best epoch:', 15, 'with an average', 0.701980185508728)<br>
Embedding size 100 ('Best epoch:', 10, 'with an average', 0.7064356327056884)<br>
Embedding size 200 ('Best epoch:', 14, 'with an average', 0.7059405684471131)<br>
Embedding size 300 ('Best epoch:', 10, 'with an average', 0.6995049476623535)<br>
Embedding size 400 ('Best epoch:', 19, 'with an average', 0.7089108824729919)<br>

In [None]:
# Final run with the chosen hyper-parameters on raw tokens (no stemming, no
# stop-word removal), shuffled folds, and mixed-label sentiments discarded.
print("Accuracy with best parameters:", run_experiment(FILTER_NUM = 100, EMBEDDING_DIMENSION = 300, epochs = 30, batch_size = 150, 
                                      learning_rate=0.0005, stemming=False, stopword_removal = False, dropout = 0.85, verbose = False, DO_SHUFFLE = True, remove_shuffled = True))


Accuracy with best parameters: ('Best epoch:', 8, 'with an average', 0.7395743727684021)

In [None]:
# Same final configuration, but with stemming and stop-word removal enabled.
# The label names the pre-processing so this result cannot be confused with
# the run without it.
print("Accuracy with best parameters and pre-processing:", run_experiment(FILTER_NUM = 100, EMBEDDING_DIMENSION = 300, epochs = 30, batch_size = 150,
                                      learning_rate=0.0005, stemming=True, stopword_removal = True, dropout = 0.85, verbose = False, DO_SHUFFLE = True, remove_shuffled = True))

Accuracy with best parameters and pre-proccessing: ('Best epoch:', 9, 'with an average', 0.7256854057312012)