## IMPORTS AND LOADINGS

In [1]:
import os
import re
import torch
import random
import fasttext
import numpy as np
from glob import glob
from torchtext import data, datasets

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
fasttext_file_path = "../../Dataset/wiki.en/wiki.en.bin"

dataset_folder = '../../Dataset/crosslingual_transfer/EI-oc/'
train_set = glob(dataset_folder + 'training/*.txt') 
development_set = glob(dataset_folder + 'development/*.txt') 
test_set = glob(dataset_folder + 'test/*.txt') 

In [4]:
class_labels = {'anger':0, 'fear':1, 'joy':2, 'sadness':3}

In [5]:
def get_data(files): #list of files
    x_data, y_data = [], []
    for file in files:
        with open(file) as file:
            for line in file:
                line = line.strip().split('\t')
                tweet, label = line[1], line[2]
                x_data.append(tweet)
                try:
                    y_data.append(class_labels[label])
                except KeyError:
                    continue
    return x_data[1:], y_data[1:]

In [6]:
# all are lists 
x_train, y_train = get_data(train_set)
x_dev, y_dev = get_data(development_set)
x_test, y_test = get_data(test_set)
trg_samples = len(x_train)
dev_samples = len(x_dev)
test_samples = len(x_test)
print("samples in trg set : ", trg_samples)
print("samples in dev set : ", dev_samples)
print("samples in test set : ", test_samples)

samples in trg set :  7105
samples in dev set :  1467
samples in test set :  4071


## DATA PRE-PROCESSING

In [8]:
ft = fasttext.load_model(fasttext_file_path)




In [22]:
embedding_dims = ft.get_dimension()
embedding_dims

300

In [72]:
def twitter_tokenizer(textline):
    textLine = re.sub(r'http\S+', 'URL', textline)
    textline = re.sub('@[\w_]+', '', textline)
    textline = re.sub('\|LBR\|', '', textline)
    textline = re.sub('#', '', textline)
    textline = re.sub('\.\.\.+', '...', textline)
    textline = re.sub('!!+', '!!', textline)
    textline = re.sub('\?\?+', '??', textline)
    words = re.compile('[\U00010000-\U0010ffff]|[\w-]+|[^ \w\U00010000-\U0010ffff]+', re.UNICODE).findall(textline.strip())
    words = [w.strip() for w in words if w.strip() != '']
    return(words)

In [75]:
print(x_train[8])
twitter_tokenizer(x_train[8])

@City_tv  is like the big bully in class ruining everyone's lunch but instead of taking our lunch money they took away family feud #bully


['is',
 'like',
 'the',
 'big',
 'bully',
 'in',
 'class',
 'ruining',
 'everyone',
 "'",
 's',
 'lunch',
 'but',
 'instead',
 'of',
 'taking',
 'our',
 'lunch',
 'money',
 'they',
 'took',
 'away',
 'family',
 'feud',
 'bully']

## MODEL TRAINING