In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

import torch
import torch.nn as nn  # neural network modules
import torch.nn.functional as F  # activation functions
import torch.optim as optim  # optimizer
from torch.autograd import Variable # add gradients to tensors
from torch.nn import Parameter # model parameter functionality 

In [2]:
# Target labels
label_encodings3 = {
    'pants-fire': 0, 
    'false':      0, 
    'barely-true':1, 
    'half-true':  1, 
    'mostly-true':2,
    'true':       2
}
label_encodings6 = {
    'pants-fire': 0, 
    'false':      1, 
    'barely-true':2, 
    'half-true':  3, 
    'mostly-true':4,
    'true':       5
}

In [3]:
# Load
df = pd.read_csv("Data/liar_dataset/train.csv")

# Relabel
df['target'] = df['label'].apply(lambda x: label_encodings3[x])

In [4]:
# TF-IDF on Unigrams
corpus = list(df['statement'])
vectorizer = TfidfVectorizer(stop_words = 'english')
X1 = vectorizer.fit_transform(corpus).toarray()
X1 = torch.Tensor(X1)

vectorizer2 = TfidfVectorizer(stop_words = 'english', 
                              ngram_range=(1,2))
X2 = vectorizer2.fit_transform(corpus).toarray()
X2 = torch.Tensor(X2)

In [5]:
class DNN(nn.Module):
    def __init__(self, corpus_size, num_classes):
        super(DNN, self).__init__()
        
        self.full_1 = nn.Linear(corpus_size, 1024)
        self.drop_1 = nn.Dropout(p=0.2)
        self.nonl_1 = nn.ReLU()
        
        self.full_2 = nn.Linear(1024, 256)
        self.drop_2 = nn.Dropout(p=0.2)
        self.nonl_2 = nn.ReLU()

        self.full_3 = nn.Linear(256, 64)
        self.drop_3 = nn.Dropout(p=0.2)
        self.nonl_3 = nn.ReLU()

        self.full_4 = nn.Linear(64, 16)
        self.drop_4 = nn.Dropout(p=0.2)
        self.nonl_4 = nn.ReLU()
        
        self.full_5 = nn.Linear(16, num_classes)
        self.drop_5 = nn.Dropout(p=0.2)
        self.nonl_5 = nn.Softmax()
        
    def forward(self, x):
        result = self.nonl_1(self.drop_1(self.full_1(x)))
        result = self.nonl_2(self.drop_2(self.full_2(result)))
        result = self.nonl_3(self.drop_3(self.full_3(result)))
        result = self.nonl_4(self.drop_4(self.full_4(result)))
        result = self.nonl_5(self.drop_5(self.full_5(result)))
        return result

In [6]:
def get_accuracy(output, targets):

    predicted = [int(y_pred.detach().argmax(-1)) for y_pred in output]
    targets = [int(y) for y in targets]
    correct = sum(a==b for (a,b) in zip(predicted, targets))
    accuracy = 100*correct/len(targets) 

    return accuracy

def train(data_X,
          test_X,
          data_y,
          test_y,
          corpus_size,
          num_classes = 3,
          num_epochs = 10,
          batch_size = 100,
          learning_rate = 0.01):
    
    # Instantiate model & optimization 
    model = DNN(corpus_size, num_classes)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Prepare test data
    test_y = torch.Tensor(test_y.values).to(dtype=torch.long)
    
    # Iterate over epochs
    for ep in range(num_epochs):
        
        model.train()

        # Iterate over batches
        for i in range(data_X.shape[0]//batch_size):
            # Reset gradients
            optimizer.zero_grad()
            
            # Declare features and target labels
            X = data_X[i*batch_size:(i+1)*batch_size]
            y = data_y[i*batch_size:(i+1)*batch_size].values
            y = torch.Tensor(y).to(dtype=torch.long)

            # Get predictions from model
            pred = model(X)

            # Calculate loss
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(pred, y)

            # Backpropagate
            loss.backward()

            # Update parameters
            optimizer.step()

        # Evaluate model
        model.eval()
            
        # Evaluate on test data
        test_pred = model(test_X)
        test_accuracy = get_accuracy(test_pred, test_y)
        
        # Print accuracy
        print(f"Test accuracy: {test_accuracy} at epoch: {ep}")

    return test_pred

In [10]:
# Separate train and test
data_X = X2[:10000]
test_X = X2[10000:]
data_y = df['target'][:10000]
test_y = df['target'][10000:]

In [11]:
test_data = train(data_X, test_X, data_y, test_y, 
                  corpus_size=X2.shape[1],
                  num_classes=3,
                  num_epochs=10,
                  learning_rate=1e-4
                  )



Test accuracy: 35.0 at epoch: 0
Test accuracy: 35.0 at epoch: 1
Test accuracy: 43.333333333333336 at epoch: 2
Test accuracy: 45.416666666666664 at epoch: 3
Test accuracy: 43.75 at epoch: 4
Test accuracy: 40.833333333333336 at epoch: 5
Test accuracy: 41.666666666666664 at epoch: 6
Test accuracy: 42.5 at epoch: 7
Test accuracy: 41.666666666666664 at epoch: 8
Test accuracy: 43.75 at epoch: 9
