In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Suppress Warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import collections
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from matplotlib import pyplot as plt
import pandas as pd
from collections import Counter
import math

In [4]:
device = torch.device("cuda")
torch.manual_seed(0)

<torch._C.Generator at 0x7f5bb8c7fba0>

In [5]:
lan_codes = ['en', 'es', 'pt', 'gl', 'eu', 'ca', 'fr', 'it', 'de']
Languages = ['English', 'Spanish', 'Portuguese', 'Galician', 'Basque', 'Catalan', 'French', 'Italian', 'German']

## Load the Data, Calculate Vocabulary and Perplexity

In [6]:
def load_data(filename):
    data = pd.read_csv(filename, header=None, sep='\t', quoting=3)
    data.columns = ['lan','tweet']
    return data

In [7]:
def get_freq(data, vocab):
    freq = np.zeros(len(vocab))
    for tweet in data.tweet:
        for char in tweet:
            if char in vocab:
                freq[vocab.index(char)] += 1
            else:
                freq[vocab.index('<N>')] += 1
        freq[vocab.index('</S>')] += 1
    return freq

In [10]:
class Data():
    def __init__(self):
        self.train = load_data('/content/drive/MyDrive/Data/train.tsv')
        self.val = load_data('/content/drive/MyDrive/Data/val.tsv')
        self.test = load_data('/content/drive/MyDrive/Data/test.tsv')
        self.get_vocab()
        self.train_freq = get_freq(self.train, self.vocab)
        self.val_freq = get_freq(self.val, self.vocab)

    def get_vocab(self):
        chars = [i for ele in self.train.tweet.to_list() for i in ele]
        most_common = Counter(chars).most_common()
        char_limit = 10
        for k in range(len(most_common)):
            if most_common[k][1] < char_limit:
                break

        vocab = [i[0] for i in most_common[:k]]
        vocab.insert(0,'<S>')   # start token
        vocab.insert(0,'</S>')  # end token
        vocab.insert(0,'<N>')   # out-of-vocabulary token
        
        self.vocab = vocab
    
    def get_perplexity(self):
        train_freq = self.train_freq / self.train_freq.sum()
        val_freq = self.val_freq / self.val_freq.sum()
        train_freq[self.vocab.index('<S>')] = 1
        return np.exp(-(val_freq * np.log(train_freq)).sum())

# Load Data
data = Data()
print('Size of the vocabulary: %d characters' % len(data.vocab))

Size of the vocabulary: 509 characters


In [11]:
preplexity = data.get_perplexity()
print('Perplexity measurement is %.2f' % preplexity)

Perplexity measurement is 34.11


In [12]:
print('Percent of Invalid Characters - Train: %.5f%%' 
    % ((data.train_freq[data.vocab.index('<N>')] / data.train_freq.sum()) * 100.0))
print('Percent of Invalid Characters - Val: %.5f%%' 
    % ((data.val_freq[data.vocab.index('<N>')] / data.val_freq.sum()) * 100.0))

Percent of Invalid Characters - Train: 0.04622%
Percent of Invalid Characters - Val: 0.05987%


## Process the data for the model

In [13]:
def get_data_loader(tweets, lans, batch_size, shuffle=False):
    data_tensor = torch.tensor(tweets, dtype=torch.long, device=device)
    label_tensor = torch.tensor(lans, dtype=torch.long, device=device)
    train_dataset = TensorDataset(data_tensor, label_tensor)
    return DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=shuffle) 

def data_encoding(data, vocab, languages):
    tweets = tweet_enconding(data.tweet, vocab)
    langs = lang_encoding(data.lan, languages)
    return tweets, langs

def tweet_enconding(tweets, vocab, tweet_length=282):
    encoded = np.zeros((len(tweets), tweet_length))
    for t, tweet in enumerate(tweets):
        encoded[t][0] = vocab.index('<S>')
        for char in range(1, tweet_length-1):
            if char < len(tweet) and tweet[char] in vocab:
                encoded[t][char] = vocab.index(tweet[char])
            elif char < len(tweet):
                encoded[t][char] = vocab.index('<N>')
            else:
                encoded[t][char] = vocab.index('</S>')
        encoded[t][tweet_length-1] = vocab.index('</S>')
    return encoded

def lang_encoding(labels, languages, tweet_length=282):
    encoded = np.zeros((len(labels), tweet_length))
    for l, lang in enumerate(labels):
        idx = languages.index(lang)
        for char in range(0, tweet_length):
            encoded[l][char] = idx
    return encoded

In [14]:
# this cell takes a few minutes to run
train_tweets, train_lans = data_encoding(data.train, data.vocab, lan_codes)
val_tweets, val_lans = data_encoding(data.val, data.vocab, lan_codes)
test_tweets, test_lans = data_encoding(data.test, data.vocab, lan_codes)

## Model

In [15]:
def train(model, train_loader, optimizer, epochs, verbose=False):
    for epoch in range(epochs):
        model.train()
        for data, label in train_loader:
            data, label = data.to(device), label.to(device)
            optimizer.zero_grad()
            output, hidden = model(data, label)
            loss = loss_function(output, label)
            loss.backward()
            optimizer.step()
        if verbose:
          print('Epoch: %d \tLoss: %.6f' % (epoch, loss.item()))

def test(model, test_loader, pad):
    model.eval()
    loss = 0
    perp = 0
    with torch.no_grad():
        for data, label in test_loader:
            data, label = data.to(device), label.to(device)
            output, hidden = model(data, label)
            loss += loss_function(output, label).item()
            perp += math.exp(F.cross_entropy(output.view(-1, 509), label.view(-1), ignore_index=pad))
    return loss, perp

In [124]:
class RNN(nn.Module):
    def __init__(self, initial_weights):
        super(RNN, self).__init__()
        self.char_encoder = nn.Embedding(509, 14).cuda()
        self.lang_encoder = nn.Embedding(9, 4).cuda()
        self.gru = nn.GRU(18, 50).cuda()
        self.fc1 = nn.Linear(50, 14).cuda()
        self.fc2 = nn.Linear(14, 509).cuda()
        self.softmax = nn.LogSoftmax().cuda()
        self.criterion = nn.NLLLoss(initial_weights, size_average=False).cuda()
        self.initialize_fc2

    def initialize_fc2():
        self.fc2.weight = self.char_encoder.weight
        self.fc2.bias.data.zero_()

    def forward(self, tweets, lang, hidden=None):
        x = torch.cat((self.lang_encoder(lang), self.char_encoder(tweets)), -1)
        x, h = self.gru(x, hidden)
        x = F.tanh(self.fc1(x))
        return self.fc2(x), h

initial = torch.ones(509)
initial[data.vocab.index('</S>')] = 0
model = RNN(initial)

In [125]:
def loss_function(guess, label):
  return model.criterion(model.softmax(guess.view(-1, 509)), label.view(-1))

In [126]:
BATCH_SIZE = 128
EPOCHS = 20
LEARNING_RATE = 0.001
DECAY = 0.0005

In [127]:
train_loader = get_data_loader(train_tweets, train_lans, BATCH_SIZE, shuffle=True)
val_loader = get_data_loader(val_tweets, val_lans, BATCH_SIZE)

In [128]:
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=DECAY)

In [129]:
train(model, train_loader, optimizer, EPOCHS, verbose=True)

Epoch: 0 	Loss: 591.691345
Epoch: 1 	Loss: 67.239189
Epoch: 2 	Loss: 24.346916
Epoch: 3 	Loss: 13.931030
Epoch: 4 	Loss: 6.333050
Epoch: 5 	Loss: 4.023886
Epoch: 6 	Loss: 1.196495
Epoch: 7 	Loss: 1.045545
Epoch: 8 	Loss: 0.601413
Epoch: 9 	Loss: 0.419781
Epoch: 10 	Loss: 0.209214
Epoch: 11 	Loss: 0.085225
Epoch: 12 	Loss: 0.026110
Epoch: 13 	Loss: 0.038447
Epoch: 14 	Loss: 0.022037
Epoch: 15 	Loss: 0.012356
Epoch: 16 	Loss: 0.009294
Epoch: 17 	Loss: 0.026798
Epoch: 18 	Loss: 0.016216
Epoch: 19 	Loss: 0.005877


In [130]:
test(model, val_loader, data.vocab.index('</S>'))

(1.8001482766121626, 184.00014343843245)

In [131]:
tweets_test = torch.tensor(test_tweets, dtype=torch.long, device=device)
language_val = torch.tensor(test_lans, dtype=torch.long, device=device)

In [132]:
def predict(model, device, data):
    '''
    lan - language id (0-8)
    '''
    model.eval()
    with torch.no_grad():
        for lan in range(9):
            label = torch.ones(data.size(), dtype=torch.long)*lan
            data, label = data.to(device), label.to(device)
            output, hidden = model(data, label)
            #convert to numpy
            data_np = data.cpu().numpy()
            output_np = output.cpu().numpy()

            # calculate log prob for each letter of sequence (using output matrix)     
            prob = np.zeros(data_np.shape)
            for batch in range(output_np.shape[0]):
                for char in range(output_np.shape[1]):
                    prob[batch, char] = output_np[batch, char, data_np[batch, char]]

            if lan == 0:
                total_prob = np.sum(prob, axis=1)
            else:
                total_prob = np.vstack((np.sum(prob, axis=1),total_prob))
        
        # Choose language with highest character probability
        output = np.argmax(total_prob,axis=0)
        return output

pred = predict(model, device, tweets_test[:5000,:])
print(f'Percent Correct: {np.sum(pred == test_lans[:5000,0])/pred.shape[0]*100}')

Percent Correct: 19.88


In [19]:
def Metrics(preds, labs, show=True):
  """Print precision, recall and F1 for each language.
  Assumes a single language per example, i.e. no code switching.
  Args:
    preds: list of predictions
    labs: list of labels
    show: flag to toggle printing
  """
  all_langs = set(preds + labs)
  preds = np.array(preds)
  labs = np.array(labs)
  label_totals = collections.Counter(labs)
  pred_totals = collections.Counter(preds)
  confusion_matrix = collections.Counter(zip(preds, labs))
  num_correct = 0
  for lang in all_langs:
    num_correct += confusion_matrix[(lang, lang)]
  acc = num_correct / float(len(preds))
  print('accuracy = {0:.3f}'.format(acc))
  if show:
    print(' Lang     Prec.   Rec.   F1')
    print('------------------------------')
  scores = []
  fmt_str = '  {0:6}  {1:6.2f} {2:6.2f} {3:6.2f}'
  for lang in sorted(all_langs):
    idx = preds == lang
    total = max(1.0, pred_totals[lang])
    precision = 100.0 * confusion_matrix[(lang, lang)] / total
    idx = labs == lang
    total = max(1.0, label_totals[lang])
    recall = 100.0 * confusion_matrix[(lang, lang)] / total
    if precision + recall == 0.0:
      f1 = 0.0
    else:
      f1 = 2.0 * precision * recall / (precision + recall)
    scores.append([precision, recall, f1])
    if show:
      print(fmt_str.format(lang, precision, recall, f1))
  totals = np.array(scores).mean(axis=0)
  if show:
    print('------------------------------')
    print(fmt_str.format('Total:', totals[0], totals[1], totals[2]))
  return totals[2]

class MovingAvg(object):
  def __init__(self, p):
    self.val = None
    self.p = p

  def Update(self, v):
    if self.val is None:
      self.val = v
      return v
    self.val = self.p * self.val + (1.0 - self.p) * v
    return self.val

Metrics(pred, test_lans[:5000,0])

accuracy = 0.199
  Lang     Prec.   Rec.   F1
 ------------------------------
      0.0   30.16  57.34  39.53
      1.0    0.00   0.00   0.00
      2.0    2.78  10.91   4.43
      3.0    0.00   0.00   0.00
      4.0    5.91  45.83  10.46
      5.0    0.00   0.00   0.00
      6.0    3.83  16.88   6.25
      7.0    0.00   0.00   0.00
      8.0    0.00   0.00   0.00
 ------------------------------
   Total:    2.85   8.73   4.04
4.044627144291371
