In [None]:
import tqdm
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.datasets import TextClassificationDataset
from torchtext.data import Iterator
from torchtext.vocab import Vocab
from collections import Counter
from itertools import chain

In [None]:
import sys
import os
import json
# regex library
import re


class TweetProcessor(object):
    """
    Pre-process and clean tweets (works for training and test data).

    Source: https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

    The cleaning rules are read from ``../data/data.json``; the code expects the
    keys ``POS_EMOJI``, ``NEG_EMOJI``, ``WORD_CLEANING`` and ``TWEET_CLEANING``
    (lists of ``{"regex": ..., "replacement": ...}`` entries) and
    ``WORD_VALIDITY`` (a list of ``{"regex": ...}`` entries).
    """

    def __init__(self, source, dest, source2=None):
        """
        Load the cleaning resources and open the input/output files.

        case 1 (train): source = train_pos, source2 = train_neg
        case 2 (test):  source = test_data

        :param source: path of the tweets to clean (positive tweets in case 1)
        :param dest: path the cleaned tweets are written to (truncated on open)
        :param source2: optional path of the negative tweets; when given,
            labels are written next to every tweet
        """
        # Context managers close the resource files as soon as they are loaded.
        with open("../data/stopwords.txt") as stopwords_file:
            # A set keeps the per-word stopword test in clean() constant-time.
            self.stopwords = {str(line).replace("\n", "") for line in stopwords_file}
        with open("../data/data.json") as dictionary_file:
            self.dictionary = json.load(dictionary_file)

        self.source = open(source, "r")
        self.neg = None
        self.dest = None
        try:
            if source2 is not None:
                self.neg = open(source2, "r")
            self.dest = open(dest, "w+")
        except Exception:
            # Do not leak the handles that were opened before the failure.
            self.close()
            raise

    def close(self):
        """Close every file handle owned by the processor (safe to call twice)."""
        for handle in (getattr(self, "source", None),
                       getattr(self, "neg", None),
                       getattr(self, "dest", None)):
            if handle is not None:
                handle.close()

    def handle_emojis(self, tweet):
        """Apply the positive, then the negative emoji substitutions to ``tweet``."""
        for key in ("POS_EMOJI", "NEG_EMOJI"):
            for entry in self.dictionary[key]:
                tweet = re.sub(entry["regex"], entry["replacement"], tweet)
        return tweet

    def preprocess_word(self, word):
        """Strip surrounding punctuation, then apply the WORD_CLEANING substitutions."""
        # remove punctuation (only at the edges of the word)
        word = word.strip('\'"?!,.():;')
        for entry in self.dictionary["WORD_CLEANING"]:
            word = re.sub(entry["regex"], entry["replacement"], word)
        return word

    def preprocess_tweet(self, tweet):
        """Apply the TWEET_CLEANING substitutions to the whole tweet."""
        for entry in self.dictionary["TWEET_CLEANING"]:
            tweet = re.sub(entry["regex"], entry["replacement"], tweet)
        return tweet

    def is_valid_word(self, word):
        """Return True when at least one WORD_VALIDITY regex matches ``word``."""
        return any(
            re.search(validity["regex"], word) is not None
            for validity in self.dictionary["WORD_VALIDITY"]
        )

    def clean(self, tweet):
        """
        Clean a single tweet and return it as a space-separated string.

        Words that fail ``is_valid_word`` or that are stopwords are dropped.
        """
        # preprocess tweet as a whole
        tweet = self.preprocess_tweet(tweet)
        # strip spaces and quotes around the tweet
        tweet = tweet.strip(' "\'')
        # replace emojis using the POS_EMOJI / NEG_EMOJI rules
        tweet = self.handle_emojis(tweet)

        processed_tweet = []
        for word in tweet.split():
            word = self.preprocess_word(word)
            if self.is_valid_word(word) and word not in self.stopwords:
                processed_tweet.append(word)
        # return tweet as string
        return ' '.join(processed_tweet)

    def preprocess(self):
        """
        Clean every input tweet and write one line per tweet to ``dest``.

        Train mode (``source2`` given) writes ``<id>,<tweet>,<label>`` with
        label 1 for ``source`` and 0 for ``source2``; test mode writes
        ``<id>,<tweet>``. Ids start at 1 and keep increasing across both
        files. All file handles are closed when the method returns, even if
        cleaning raises.
        """
        labelled = self.neg is not None
        # (file, label) pairs in the order they are written: positives first.
        inputs = [(self.source, 1)]
        if labelled:
            inputs.append((self.neg, 0))

        try:
            tweet_id = 1
            for file, label in inputs:
                for tweet in file:
                    fields = [str(tweet_id), self.clean(tweet).replace('\n', '')]
                    if labelled:
                        fields.append(str(label))
                    # NOTE(review): the cleaned text is not escaped, so a comma
                    # that survives cleaning ends up inside the text field.
                    self.dest.write(','.join(fields) + '\n')
                    tweet_id += 1
                file.close()
            print('\nSaved processed tweets to: %s' % str(self.dest.name))
        finally:
            self.close()


In [None]:
# Raw training corpora and the destination of the merged, cleaned output.
train_pos_full = "../data/train_pos.txt"
train_neg_full = "../data/train_neg.txt"
train_processed_file_name = "../data/train_preprocessed.txt"

# Positive tweets go in as `source`, negative tweets as `source2`, so the
# processor writes label 1 for the former and label 0 for the latter.
train_cleaned = TweetProcessor(
    source=train_pos_full,
    dest=train_processed_file_name,
    source2=train_neg_full,
)
train_cleaned.preprocess()

In [None]:
# Load the pre-processed training tweets; every line is "<id>,<text>,<label>".
with open("../data/train_preprocessed.txt") as f:
    data_list = f.readlines()

# Drop the id and keep [text, label]. The id is cut at the FIRST comma and the
# label at the LAST one, so a comma inside the text cannot shift the label.
# Blank lines are skipped instead of raising an IndexError.
data_list = [
    x.strip().split(',', 1)[1].rsplit(',', 1)
    for x in data_list
    if x.strip()
]

# Label '1' marks positive tweets and '0' negative ones.
data_pos = [x[0] for x in data_list if x[1] == '1']
data_neg = [x[0] for x in data_list if x[1] == '0']

# Whitespace-tokenised tweets, used to build the vocabularies.
data_vocab = [x[0].split() for x in data_list]

In [None]:

# Unigram vocabulary over every token of every cleaned tweet.
vocab = Vocab(Counter(chain.from_iterable(data_vocab)))

vocab_size = len(vocab)


def _sentence_bigrams(words):
    """Return the adjacent-word bigrams ("w1 w2") of one tokenised tweet."""
    return [first + ' ' + second for first, second in zip(words[:-1], words[1:])]


# Bigrams are collected per tweet only: pairing the last word of one tweet with
# the first word of the next would add vocabulary entries that the encoding
# loop below can never produce. This also works when the first tweet is empty.
bigram_list = [bigram for sent in data_vocab for bigram in _sentence_bigrams(sent)]

bigram_vocab = Vocab(Counter(bigram_list))
bigram_vocab_size = len(bigram_vocab)

# Encode every tweet as (label, [bigram ids]). Tweets with fewer than two
# words yield an empty id list.
bigram_data = []
for inst in data_list:
    label = int(inst[1])
    tokens = [bigram_vocab.stoi[x] for x in _sentence_bigrams(inst[0].split())]
    bigram_data.append((label, tokens))


In [None]:
# Sanity check: show the first ten (label, bigram-id list) pairs.
print(bigram_data[:10])

In [None]:
# Encode every tweet as (label, [unigram vocabulary ids]).
dataset_data = [
    (int(row[1]), [vocab.stoi[token] for token in row[0].split()])
    for row in data_list
]

In [None]:
# Sanity check: show the first ten (label, unigram-id list) pairs.
print(dataset_data[:10])

In [None]:
labels = {1, 0}
size = len(dataset_data)
split = int(size * 0.90)
print(size, split)

# Shuffle BEFORE splitting. The preprocessed file lists every positive tweet
# ahead of the negative ones, so slicing the unshuffled data would leave the
# validation set with a single class. A fixed seed keeps the split the same
# across runs; the copy leaves dataset_data itself in file order.
shuffled_data = list(dataset_data)
random.Random(42).shuffle(shuffled_data)

train_data = shuffled_data[:split]
valid_data = shuffled_data[split:]
print(len(train_data), len(valid_data))

train_dataset = TextClassificationDataset(vocab, train_data, labels)
valid_dataset = TextClassificationDataset(vocab, valid_data, labels)

In [None]:
labels = {1, 0}
size = len(bigram_data)
split = int(size * 0.90)
print(size, split)

# Shuffle BEFORE splitting. The preprocessed file lists every positive tweet
# ahead of the negative ones, so slicing the unshuffled data would leave the
# validation set with a single class. A fixed seed keeps the split the same
# across runs; the copy leaves bigram_data itself in file order.
shuffled_bigram_data = list(bigram_data)
random.Random(42).shuffle(shuffled_bigram_data)

bigram_train_data = shuffled_bigram_data[:split]
bigram_valid_data = shuffled_bigram_data[split:]
print(len(bigram_train_data), len(bigram_valid_data))

bigram_train = TextClassificationDataset(bigram_vocab, bigram_train_data, labels)
bigram_valid = TextClassificationDataset(bigram_vocab, bigram_valid_data, labels)

In [None]:
def get_pred(x):
    """Threshold a score: return 1 when it is strictly above 0.5, otherwise 0."""
    return 1 if x > .5 else 0

def get_pred_arr(arr):
    """Apply get_pred to every element of ``arr`` and return the results as a list."""
    return list(map(get_pred, arr))

In [None]:
# Model hyper-parameters. Only vocab_size and dim1 are passed to the
# classifiers built below.
vocab_size = len(vocab)
dim1, dim2, dim3 = 80, 40, 20
dropout = 0.3

class Classifier(nn.Module):
    """
    Text classifier: EmbeddingBag pooling -> linear layer -> sigmoid.

    The output has shape (number of bags, num_class) with values in (0, 1).
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.sig = nn.Sigmoid()
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        for weight in (self.embedding.weight, self.fc.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets = None):
        """
        Score a batch of token-id bags.

        :param text: token ids, passed straight to the EmbeddingBag
        :param offsets: start index of every bag inside ``text``
        :return: sigmoid of the linear layer applied to the pooled embeddings
        """
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return self.sig(scores)

# Training configuration.
num_epochs = 10
batch_size = 10
num_batches = len(train_dataset) // batch_size

# Unigram model with a single sigmoid output, trained with binary
# cross-entropy and plain SGD.
model = Classifier(vocab_size=vocab_size, embed_dim=dim1, num_class=1)
loss_func = nn.BCELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)


def generate_batch(batch):
    """
    Collate (label, token ids) entries into the inputs of an EmbeddingBag model.

    Entries whose token list is empty are dropped.

    :param batch: sequence of (label, token-id list) pairs
    :return: (text, offsets, label) where ``text`` is all token ids
        concatenated, ``offsets`` holds the start index of every entry inside
        ``text`` and ``label`` is a float tensor; (None, None, None) when no
        entry has any tokens
    """
    kept = [entry for entry in batch if len(entry[1]) != 0]
    if not kept:
        return None, None, None

    label = torch.tensor([entry[0] for entry in kept], dtype=torch.float)
    pieces = [torch.tensor(entry[1]) for entry in kept]

    # Every entry starts where the previous one ended; the first starts at 0.
    lengths = [len(piece) for piece in pieces]
    offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)

    return torch.cat(pieces), offsets, label

def train_model(model, loss_func, optimizer, train_dataset, valid_dataset, batch_size = 10, epochs = 10):
    """
    Train ``model`` with mini-batches and report validation accuracy per epoch.

    :param model: module called as ``model(text, offsets)``; expected to emit
        one score per example (a trailing dimension of size 1 is squeezed)
    :param loss_func: loss called as ``loss_func(outputs, labels)``
    :param optimizer: optimizer that must have been built from ``model``'s
        own parameters, otherwise ``step()`` does not update ``model``
    :param train_dataset: sliceable collection of (label, token ids) entries
    :param valid_dataset: iterable of (label, token ids) entries, evaluated as
        one batch at the end of every epoch
    :param batch_size: number of entries per training batch
    :param epochs: number of passes over ``train_dataset``
    :return: the trained ``model``
    """
    # Ceiling division so the final, possibly smaller, batch is trained on too.
    num_batches = (len(train_dataset) + batch_size - 1) // batch_size
    for epoch in range(epochs):
        model.train()
        loss_sum = 0.0
        batches_seen = 0

        for i in range(num_batches):
            text, offsets, labels = generate_batch(train_dataset[i*batch_size:(i+1)*batch_size])
            if text is None:
                # Every entry of this batch had an empty token list.
                continue
            optimizer.zero_grad()
            # (N, 1) -> (N,) so the prediction shape matches the label shape.
            outputs = model(text, offsets).squeeze(-1)

            loss = loss_func(outputs, labels)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            batches_seen += 1
            if i % 1000 == 0:
                print("Epoch: ", epoch, " batch: ", i, " num_batches: ", num_batches, " loss: ", loss.item())

        # Mean training loss of the epoch rather than the loss of its last batch.
        epoch_loss = loss_sum / batches_seen if batches_seen else float('nan')

        text, offsets, labels = generate_batch(valid_dataset)
        if text is None:
            # Nothing to evaluate on.
            accuracy = float('nan')
        else:
            model.eval()
            # No gradients are needed for evaluation.
            with torch.no_grad():
                outputs = model(text, offsets).squeeze(-1)
            preds = get_pred_arr(outputs.tolist())
            # Fraction of correct predictions, i.e. accuracy.
            accuracy = sum(p == y for p, y in zip(preds, labels.tolist())) / len(preds)

        print("Epoch: %d, loss = %.3f, accuracy = %.3f" % (epoch, epoch_loss, accuracy))

    return model

# Train the unigram model.
train_model(model, loss_func, optimizer, train_dataset, valid_dataset)

# Bigram model: sized by the bigram vocabulary.
bigram_vocab_size = len(bigram_vocab)
model2 = Classifier(bigram_vocab_size, dim1, 1)
# model2 needs its own optimizer: `optimizer` was built from `model`'s
# parameters, so stepping it would never update model2.
optimizer2 = torch.optim.SGD(model2.parameters(), lr=0.001)

train_model(model2, loss_func, optimizer2, bigram_train, bigram_valid)