In [1]:
import torch
import torch.nn as nn  # neural network modules
import torch.nn.functional as F  # activation functions
import torch.optim as optim  # optimizer
from torch.autograd import Variable # add gradients to tensors
from torch.nn import Parameter # model parameter functionality 

In [2]:
import pandas as pd
import numpy as np
import csv
import itertools

from bert_embedding import BertEmbedding

In [3]:
# Read in data
df_orig = pd.read_csv("train.csv")
df = df_orig

# Fix target label
label_encodings = {
    'pants-fire': 0, 
    'false':      0, 
    'barely-true':1, 
    'half-true':  1, 
    'mostly-true':2,
    'true':       2
}
df['target'] = df['label'].apply(lambda x: label_encodings[x])

### Glove Model

In [4]:
# Read Glove file
words = pd.read_table("../glove.6B.100d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

# Generate Glove dictionary
glove = {word:words.iloc[idx].values for (word,idx) in zip(words.index,range(words.shape[0]))}

In [5]:
# Split the text into words
df['words'] = df['statement'].apply(lambda x: x.replace('?',' ?').replace('.',' .').\
                                    lower().split())

# Generate the list of all vocab words in our data
target_vocab = list(itertools.chain.from_iterable(df['words']))
target_vocab = list(set(target_vocab))

In [6]:
# Generate the weights_matrix, which is the matrix of word embeddings that we
# pass into Pytorch. It contains len(target_vocab) rows for each of the words,
# each represented by a length-100 vector – taken from the Glove embedding
weights_matrix = np.zeros((len(target_vocab)+1,100))

# Add in vocab
error_count = 0
word_to_idx = {}
for i, word in enumerate(target_vocab):
    word_to_idx[word] = i
    try: 
        weights_matrix[i] = glove[word]
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(100,))
        error_count += 1

# Add in "empty" token
word_to_idx[""] = len(target_vocab)
        
print(f"We found {error_count} out of {len(target_vocab)} words not in the Glove model")

We found 5391 out of 17019 words not in the Glove model


In [7]:
# Pad the text to have length 100
df['words'] = df['words'].apply(lambda x: x+([""]*(100-len(x))))

# Encode text using the indices
df['text_idx'] = df['words'].apply(lambda lst: np.array([word_to_idx[w] for w in lst]))

In [8]:
def create_emb_layer(weights_matrix, 
                     non_trainable=False):
    weights_matrix = torch.Tensor(weights_matrix)
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

### FakeBERT

In [38]:
def get_accuracy(output, targets):

    predicted = [int(y_pred.detach().argmax(-1)) for y_pred in output]
    targets = [int(y) for y in targets]
    correct = sum(a==b for (a,b) in zip(predicted, targets))
    accuracy = 100*correct/len(targets) 

    return accuracy

def train(data,
          test,
          weights_matrix,
          num_epochs = 10,
          batch_size = 100,
          learning_rate = 0.01):
    
    # Instantiate model & optimization 
    model = FakeBERT(weights_matrix)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    
    # Iterate over epochs
    for ep in range(num_epochs):
        
        model.train()

        # Iterate over batches
        for i in range(data.shape[0]//batch_size):
                            
            # Declare features and target labels
            X = torch.Tensor([i for i in data['text_idx'][i*batch_size:(i+1)*batch_size]]).to(dtype=torch.long)
            y = data['target'][i*batch_size:(i+1)*batch_size].values
            y = torch.Tensor(y).to(dtype=torch.long)

            # Get predictions from model
            pred = model(X)

            # Calculate loss
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(pred, y)

            # Backpropagate
            loss.backward()

            # Update parameters
            optimizer.step()

            # Reset gradients
            optimizer.zero_grad()

        # Evaluate model
        model.eval()
        
        # Prepare test data
        test_X = torch.Tensor([i for i in test['text_idx']]).to(dtype=torch.long)
        test_y = test['target'].values
        test_y = torch.Tensor(test_y).to(dtype=torch.long)
            
        # Evaluate on test data
        test_pred = model(test_X)
        test_accuracy = get_accuracy(test_pred, test_y)
        
        # Print accuracy
        print(f"Test accuracy: {test_accuracy} at epoch: {ep}")

    return test_pred

In [39]:
class FakeBERT(nn.Module):
    def __init__(self, weights_matrix):
        super(FakeBERT, self).__init__()
        
        # Layer 0: Embedding Layer
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)

        # Layer 1: Conv1D + Maxpool
        self.conv_1 = nn.Conv1d(in_channels=100, out_channels=128, kernel_size=3, stride=1)
        self.sigm_1 = nn.ReLU()
        self.pool_1 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Layer 2: Conv1D + Maxpool
        self.conv_2 = nn.Conv1d(in_channels=100, out_channels=128, kernel_size=4, stride=1)
        self.sigm_2 = nn.ReLU()
        self.pool_2 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Layer 3: Conv1D + Maxpool
        self.conv_3 = nn.Conv1d(in_channels=100, out_channels=128, kernel_size=5, stride=1)
        self.sigm_3 = nn.ReLU()
        self.pool_3 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Layer 4: Conv1D + Maxpool
        self.conv_4 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5, stride=1)
        self.sigm_4 = nn.ReLU()
        self.pool_4 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Layer 5: Conv1D + Maxpool
        self.conv_5 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5, stride=1)
        self.sigm_5 = nn.ReLU()
        self.pool_5 = nn.MaxPool1d(kernel_size=5, stride=5)
        
        # Layer 6: Fully Connected Layer 
        self.full_6 = nn.Linear(128,32)
        self.sigm_6 = nn.Sigmoid()
        
        # Layer 7: Fully Connected Layer 
        self.full_7 = nn.Linear(32,3)
        self.soft_7 = nn.Softmax()
        
    def forward(self, x):
        # Generate the embeddings with Glove
        emb = self.embedding(x)

        # Generate the 3 1D conv layers
        conv_1 = self.pool_1(self.sigm_1(self.conv_1(emb)))        
        conv_2 = self.pool_2(self.sigm_2(self.conv_2(emb)))        
        conv_3 = self.pool_3(self.sigm_3(self.conv_3(emb)))
        
        # Concatenate the 3 layers
        cat = torch.cat((conv_1,conv_2,conv_3),2)
        
        # Pass the concatenated output through 2 1D conv layers
        conv_4 = self.pool_4(self.sigm_4(self.conv_4(cat)))        
        conv_5 = self.pool_5(self.sigm_5(self.conv_5(conv_4)))  

        # Flatten the output
        flat = conv_5.flatten(start_dim=1)

        # Pass through 2 fully connected layers
        full_6 = self.sigm_6(self.full_6(flat))
        full_7 = self.soft_7(self.full_7(full_6))
        
        return full_7

In [41]:
# Keep sentences to 100 words max
df['text_idx'] = [i[:100] for i in df['text_idx']]

# Separate train and test
data = df.iloc[:10000].reset_index(drop=True)
test = df.iloc[10000:].reset_index(drop=True)

In [42]:
# Train the model
test_data = train(data, test, weights_matrix, num_epochs=1000)



Test accuracy: 36.666666666666664 at epoch: 0
Test accuracy: 36.666666666666664 at epoch: 1
Test accuracy: 36.666666666666664 at epoch: 2
Test accuracy: 36.666666666666664 at epoch: 3
Test accuracy: 35.0 at epoch: 4
Test accuracy: 35.0 at epoch: 5
Test accuracy: 35.0 at epoch: 6
Test accuracy: 35.0 at epoch: 7
Test accuracy: 35.0 at epoch: 8
Test accuracy: 35.0 at epoch: 9
Test accuracy: 35.0 at epoch: 10
Test accuracy: 35.0 at epoch: 11
Test accuracy: 35.0 at epoch: 12
Test accuracy: 35.0 at epoch: 13
Test accuracy: 35.0 at epoch: 14
Test accuracy: 35.0 at epoch: 15
Test accuracy: 35.0 at epoch: 16
Test accuracy: 35.0 at epoch: 17
Test accuracy: 35.0 at epoch: 18
Test accuracy: 35.0 at epoch: 19
Test accuracy: 35.0 at epoch: 20
Test accuracy: 35.0 at epoch: 21
Test accuracy: 35.0 at epoch: 22
Test accuracy: 35.0 at epoch: 23
Test accuracy: 35.0 at epoch: 24
Test accuracy: 35.0 at epoch: 25
Test accuracy: 35.0 at epoch: 26
Test accuracy: 35.0 at epoch: 27
Test accuracy: 35.0 at epoch: 