In [1]:
import torch
import torch.nn as nn  # neural network modules
import torch.nn.functional as F  # activation functions
import torch.optim as optim  # optimizer
from torch.autograd import Variable # add gradients to tensors
from torch.nn import Parameter # model parameter functionality

In [2]:
import pandas as pd
import numpy as np
import csv
import itertools

from bert_embedding import BertEmbedding

In [3]:
# Read in data
df_orig = pd.read_csv("train.csv")
df = df_orig.iloc[:1200].reset_index(drop=True)

# Fix target label
label_encodings = {
    'pants-fire': 0, 
    'false':      0, 
    'barely-true':1, 
    'half-true':  1, 
    'mostly-true':2,
    'true':       2
}
df['target'] = df['label'].apply(lambda x: label_encodings[x])

### Glove Model

In [23]:
# Read Glove file
words = pd.read_table("glove.6B.100d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
# Generate Glove dictionary
glove = {word:words.iloc[idx].values for (word,idx) in zip(words.index,range(words.shape[0]))}

In [24]:
# Split the text into words
df['words'] = df['statement'].apply(lambda x: x.replace('?',' ?').replace('.',' .').\
                                    lower().split())
# Generate the list of all vocab words in our data
target_vocab = list(itertools.chain.from_iterable(df['words']))
target_vocab = list(set(target_vocab))

In [34]:
# Generate the weights_matrix, which is the matrix of word embeddings that we
# pass into Pytorch. It contains len(target_vocab) rows for each of the words,
# each represented by a length-100 vector – taken from the Glove embedding
weights_matrix = np.zeros((len(target_vocab),100))

error_count = 0
word_to_idx = {}
for i, word in enumerate(target_vocab):
    word_to_idx[word] = i
    try: 
        weights_matrix[i] = glove[word]
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(100,))
        error_count += 1
        
print(f"We found {error_count} out of {len(target_vocab)} words not in the Glove model")

We found 945 out of 4876 words not in the Glove model


In [42]:
# Encode text using the indices
df['text_idx'] = df['words'].apply(lambda lst: np.array([word_to_idx[w] for w in lst]))

In [44]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [36]:
class ToyNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers):
        super(self).__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers, batch_first=True)
        
    def forward(self, inp, hidden):
        return self.gru(self.embedding(inp), hidden)
    
    def init_hidden(self, batch_size):
        return Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size))

In [38]:
mod = ToyNN(weights_matrix, 100, 3)

TypeError: super() argument 1 must be type, not ToyNN

### BERT Model

In [9]:
# Generate BERT embeddings
bert_embedding = BertEmbedding()
df['emb'] = df['statement'].apply(lambda x: bert_embedding(x.lower().split('\n')))

# Returns one BERT embedding (i.e. list of 768 numbers) per word
# Per sentence, average over all the words
df['emb_avg'] = df['emb'].apply(lambda x: np.array(x[0][1], dtype=float).mean(axis=0))


### FakeBERT

In [93]:
class FakeBERT(nn.Module):
    def __init__(self):
        super(FakeBERT, self).__init__()
        
        # Conv1D Layer 1 + Maxpool
        self.conv_1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3, stride=1)
        self.sigm_1 = nn.ReLU()
        self.pool_1 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Conv1D Layer 2 + Maxpool
        self.conv_2 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=4, stride=1)
        self.sigm_2 = nn.ReLU()
        self.pool_2 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Conv1D Layer 3 + Maxpool
        self.conv_3 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=5, stride=1)
        self.sigm_3 = nn.ReLU()
        self.pool_3 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Conv1D Layer 4 + Maxpool
        self.conv_4 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=5, stride=1)
        self.sigm_4 = nn.ReLU()
        self.pool_4 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Conv1D Layer 5 + Maxpool
        self.conv_5 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=5, stride=1)
        self.sigm_5 = nn.ReLU()
        self.pool_5 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Fully connected layer
        self.full_1 = nn.Linear(17,3)
        self.soft_1 = nn.Softmax()

    def forward(self, x):
        # Generate the 3 1D conv layers
        conv_1 = self.pool_1(self.sigm_1(self.conv_1(x)))        
        conv_2 = self.pool_2(self.sigm_2(self.conv_2(x)))        
        conv_3 = self.pool_3(self.sigm_3(self.conv_3(x)))
        
        # Concatenate the 3 layers
        cat = torch.cat((conv_1,conv_2,conv_3),2)
        
        # Pass the concatenated output through 2 1D conv layers
        conv_4 = self.pool_4(self.sigm_4(self.conv_4(cat)))        
        conv_5 = self.pool_5(self.sigm_5(self.conv_5(conv_4)))  
        
        # Flatten the output
        flat_1 = conv_5.flatten()
        
        # Pass through fully connected layer
        full_1 = self.soft_1(self.full_1(flat_1))
        
        return full_1

In [94]:
def get_accuracy(output, targets):

    predicted = [int(y_pred.detach().argmax(-1)) for y_pred in output]
    targets = [int(y) for y in targets]
    correct = sum(a==b for (a,b) in zip(predicted, targets))
    accuracy = 100*correct/len(targets) 

    return accuracy

In [95]:
def format_x(x):
    return torch.Tensor(x).unsqueeze(0).unsqueeze(0)
def format_y(y):
    return torch.Tensor([y]).to(dtype=torch.long)

In [99]:
def train(data,
          test,
          num_epochs = 10,
          batch_size = 100,
          learning_rate = 0.01):
    
    # Instantiate model & optimization 
    model = FakeBERT()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    
    # Iterate over epochs
    for ep in range(num_epochs):
        
        model.train()

        # Iterate over batches
        for i in range(data.shape[0]//batch_size):
            
            # Declare loss
            loss_func = nn.CrossEntropyLoss()
            loss = torch.tensor(0.0)
            
            for idx in range(i*batch_size, (i+1)*batch_size):
                
                # Declare features and target labels
                X = format_x(data['emb_avg'][idx])
                y = format_y(data['target'][idx])

                # Get predictions from model
                pred = model(X).unsqueeze(0)

                # Calculate loss
                loss += loss_func(pred, y)
            
            # Backpropagate
            loss.backward()

            # Update parameters
            optimizer.step()

            # Reset gradients
            optimizer.zero_grad()

        # Evaluate model
        model.eval()
        
        # Prepare test data
        test_X = [format_x(x) for x in test['emb_avg']]
        test_y = [format_y(y) for y in test['target']]
            
        # Evaluate on test data
        test_pred = [model(x) for x in test_X]
        test_accuracy = get_accuracy(test_pred, test_y)
        
        # Print accuracy
        print(f"Test accuracy: {test_accuracy} at epoch: {ep}")

    return test_pred

In [100]:
data = df.iloc[:1000].reset_index(drop=True)
test = df.iloc[1000:].reset_index(drop=True)

In [101]:
test_data = train(data, test)



Test accuracy: 35.5 at epoch: 0
Test accuracy: 35.5 at epoch: 1
Test accuracy: 35.5 at epoch: 2
Test accuracy: 35.5 at epoch: 3
Test accuracy: 35.5 at epoch: 4
Test accuracy: 35.5 at epoch: 5
Test accuracy: 35.5 at epoch: 6
Test accuracy: 35.5 at epoch: 7
Test accuracy: 35.5 at epoch: 8
Test accuracy: 35.5 at epoch: 9


In [90]:
test_data

[tensor([0.2134, 0.3758, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor([0.2136, 0.3757, 0.4107], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3758, 0.4107], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3758, 0.4107], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3757, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3757, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3757, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3757, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3757, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3758, 0.4107], grad_fn=<SoftmaxBackward>),
 tensor([0.2136, 0.3757, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3757, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3758, 0.4107], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3758, 0.4107], grad_fn=<SoftmaxBackward>),
 tensor([0.2135, 0.3758, 0.4107], grad_fn=<SoftmaxBackward>),
 tensor([0.2136, 0.3757, 0.4108], grad_fn=<SoftmaxBackward>),
 tensor(