## Imports

In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import time
import fasttext
import fasttext.util
from pathlib import Path
from google_drive_downloader import GoogleDriveDownloader as gdd

print("PyTorch version : ", torch.__version__)

PyTorch version :  1.6.0


## Import Data

In [2]:
# Load the raw splits and drop the 'id' column right away; only the tweet text
# and its label are used from here on.
df_train = pd.read_csv("Train.csv").drop(columns=['id'])
df_val = pd.read_csv("Val.csv").drop(columns=['id'])

# Report (rows, columns) and the row count of each split.
print("Training Set:", df_train.shape, len(df_train))
print("Validation Set:", df_val.shape, len(df_val))

Training Set: (6420, 2) 6420
Validation Set: (2140, 2) 2140


In [3]:
df_train.head()

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,real
1,States reported 1121 deaths a small rise from ...,real
2,Politically Correct Woman (Almost) Uses Pandem...,fake
3,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,Populous states can generate large case counts...,real


In [4]:
# Sanity check: report when neither the tweet nor the label column of the
# training frame contains a missing value.
if not df_train[['tweet', 'label']].isnull().any().any():
    print('There is no empty tweet or unlabel data')

There is no empty tweet or unlabel data


In [5]:
group_data = df_train.groupby('label').count()
print(group_data)

       tweet
label       
fake    3060
real    3360


## Text Cleaning

### Lowercase text and remove mentions, URLs, digits, and non-alphanumeric characters

In [6]:
import re

def clean_text(df, text_field):
    """Lower-case one text column of `df` and strip noise from it, in place.

    The substitution pattern deletes, in order of alternation: ``@`` followed
    by ASCII letters/digits, any character that is not an ASCII letter, digit,
    space or tab, ``scheme://...`` runs up to the next whitespace, a leading
    ``rt``, ``http`` plus the one character after it, and single digits.

    Args:
        df: DataFrame holding the column to clean; it is modified in place.
        text_field: Name of the string column to clean.

    Returns:
        The same `df` object that was passed in.
    """
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?|[0-9]")
    lowered = df[text_field].str.lower()
    df[text_field] = lowered.map(lambda elem: noise.sub("", elem))
    return df

In [7]:
df_val  = clean_text(df_val, "tweet")
df_train = clean_text(df_train, "tweet")

df_train

Unnamed: 0,tweet,label
0,the cdc currently reports deaths in general t...,real
1,states reported deaths a small rise from last...,real
2,politically correct woman almost uses pandemic...,fake
3,indiafightscorona we have covid testing labor...,real
4,populous states can generate large case counts...,real
...,...,...
6415,a tiger tested positive for covid please stay ...,fake
6416,autopsies prove that covid is a blood clot not...,fake
6417,a post claims a covid vaccine has already been...,fake
6418,aamir khan donate cr in pm relief cares fund,fake


### Remove stopwords

In [8]:
stops = set(stopwords.words("english"))

def cleantext(text):
    """Return `text` with every word found in the module-level `stops` removed.

    The string is split on whitespace, so the surviving words come back joined
    by single spaces regardless of the original spacing.
    """
    kept_words = (word for word in text.split() if word not in stops)
    return " ".join(kept_words)

In [9]:
# Strip stopwords from every tweet of both splits.
df_train['tweet'] = df_train['tweet'].map(cleantext)
df_val['tweet'] = df_val['tweet'].map(cleantext)

df_train

Unnamed: 0,tweet,label
0,cdc currently reports deaths general discrepan...,real
1,states reported deaths small rise last tuesday...,real
2,politically correct woman almost uses pandemic...,fake
3,indiafightscorona covid testing laboratories i...,real
4,populous states generate large case counts loo...,real
...,...,...
6415,tiger tested positive covid please stay away p...,fake
6416,autopsies prove covid blood clot pneumonia oug...,fake
6417,post claims covid vaccine already developed ca...,fake
6418,aamir khan donate cr pm relief cares fund,fake


In [10]:
# Summary of the cleaned training tweets; lengths are character counts.
print('For train data')
print("Number of tweets : ", len(df_train))
print("Longest tweet's length : ", df_train.tweet.map(len).max(), sep="")
print("Average length of the tweets : ", df_train.tweet.map(len).mean(), sep="")

For train data
Number of tweets :  6420
Longest tweet's length : 6412
Average length of the tweets : 113.39579439252337


In [11]:
# Summary of the cleaned validation tweets; lengths are character counts.
print('For validation data')
print("Number of tweets : ", len(df_val))
print("Longest tweet's length : ", df_val.tweet.map(len).max(), sep="")
print("Average length of the tweets : ", df_val.tweet.map(len).mean(), sep="")

For validation data
Number of tweets :  2140
Longest tweet's length : 1293
Average length of the tweets : 111.81635514018691


### Drop training tweets longer than the maximum length

In [12]:
def max_text_length(text, length):
    """Return `text` cut down to at most its first `length` items."""
    return text[:length]

In [13]:
MAX_LENGTH = 300
# Row count before filtering, taken from the data itself so the report below
# stays correct for any dataset and when the cell is re-run.
L = len(df_train)

# Flag tweets whose cleaned text is longer than MAX_LENGTH characters; the mask
# is computed once and reused for both the report and the filter.
too_long = df_train.tweet.map(len) > MAX_LENGTH
print(f'Number of posts that exceed {MAX_LENGTH} characters')
print(int(too_long.sum()))

# Keep only the tweets within the limit (over-long rows are dropped, not cut).
df_train = df_train[~too_long]

df_train

Number of posts that exceed 300 characters
13


Unnamed: 0,tweet,label
0,cdc currently reports deaths general discrepan...,real
1,states reported deaths small rise last tuesday...,real
2,politically correct woman almost uses pandemic...,fake
3,indiafightscorona covid testing laboratories i...,real
4,populous states generate large case counts loo...,real
...,...,...
6415,tiger tested positive covid please stay away p...,fake
6416,autopsies prove covid blood clot pneumonia oug...,fake
6417,post claims covid vaccine already developed ca...,fake
6418,aamir khan donate cr pm relief cares fund,fake


In [14]:
df_train.to_csv("train.tsv", sep="\t", index=False)
df_val.to_csv("validation.tsv", sep="\t", index=False)

In [15]:
T = pd.read_csv('train.tsv', sep='\t')
T1, T2 = T.iloc[0, :].values
print(T1)
print(T2)

cdc currently reports deaths general discrepancies death counts different sources small explicable death toll stands roughly people today
real


## Load pre-trained fasttext model

In [16]:
# File name of the pre-trained fastText binary, relative to the working directory.
bin_PATH = 'cc.en.300.bin'
# Download only when the file is not already on disk, so re-running the cell
# does not fetch it again.
if not Path(bin_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1iyOdoE3cYbhTRF-J6psKCH9W8gagd1sm',
        dest_path='./'+bin_PATH,
        )

In [17]:
%%time
# Load the binary through `bin_PATH`, the same path the download cell checks,
# so the file name is defined in one place only.
ft = fasttext.load_model(bin_PATH)
# Reduce the model to dimension 100, the value used as `input_size` of the Bi-LSTM.
fasttext.util.reduce_model(ft, 100)



Wall time: 1min 28s


<fasttext.FastText._FastText at 0x1e7f7f60>

## Make Dataset

In [18]:
tokenizer = nltk.word_tokenize
w2v = ft.get_word_vector

class FakeNewsDataset(Dataset):
    """Tweets read from ``<mode>.tsv``, served as per-token word-vector tensors.

    Each item is a ``(tokens_tensor, label_tensor)`` pair. ``tokens_tensor``
    stacks one word vector per token of the tweet. ``label_tensor`` holds the
    class index in ``"train"`` mode and is ``None`` in the other modes.
    """

    def __init__(self, mode, tokenizer, vectorizer=None):
        """Read the TSV for `mode` and store the text-processing callables.

        Args:
            mode: One of "train", "validation" or "test"; also the stem of the
                TSV file that is read from the working directory.
            tokenizer: Callable that splits a string into a list of tokens.
            vectorizer: Optional callable mapping one token to its word
                vector. When omitted, the module-level `w2v` is looked up each
                time an item is accessed.
        """
        assert mode in ["train", "validation", "test"]
        self.mode = mode
        # Cells that come back as NaN from the TSV are replaced with "".
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {'real': 0, 'fake': 1}
        self.tokenizer = tokenizer
        self.vectorizer = vectorizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, label_tensor)`` for row `idx`."""
        if self.mode == "test" or self.mode == "validation":
            # Only the text (first column) is used; no label is returned.
            text = self.df.iloc[idx, 0]
            label_tensor = None
        else:
            # Training rows are expected to hold exactly (text, label).
            text, label = self.df.iloc[idx, :].values
            label_tensor = torch.tensor(self.label_map[label])

        embed = self.vectorizer if self.vectorizer is not None else w2v
        vectors = [embed(token) for token in self.tokenizer(text)]

        if vectors:
            # Stack into one ndarray first: building a tensor straight from a
            # list of ndarrays converts element by element and is much slower.
            tokens_tensor = torch.from_numpy(np.stack(vectors))
        else:
            # A text with no tokens yields an empty 1-D tensor.
            # NOTE(review): such an item has no embedding dimension, so it may
            # not pad together with non-empty items; confirm empty tweets
            # cannot occur.
            tokens_tensor = torch.tensor(vectors)

        return (tokens_tensor, label_tensor)

    def __len__(self):
        """Return the number of rows read from the TSV."""
        return self.len


### Make trainset

In [19]:
# Dataset
trainset = FakeNewsDataset("train", tokenizer=tokenizer)

## DataLoader

In [20]:
# The input samples of this function is a list,
# every element in it is a sample return by the 'FakeNewsDataset'

# Every sample contains 2 tensors : 
# - tokens_tensor
# - label_tensor

def create_mini_batch(samples):
    """Collate a list of dataset samples into one zero-padded mini-batch.

    Args:
        samples: Non-empty list whose elements hold a token tensor at index 0
            and a label tensor (or None) at index 1.

    Returns:
        ``(tokens_tensors, label_ids)``: the token tensors padded to a common
        length with the batch dimension first, and the stacked labels, or
        None when the first sample carries no label.
    """
    # The first sample decides whether this batch carries labels.
    has_labels = samples[0][1] is not None
    label_ids = torch.stack([sample[1] for sample in samples]) if has_labels else None

    # Zero padding: shorter sequences are filled up to the longest one.
    sequences = [sample[0] for sample in samples]
    tokens_tensors = pad_sequence(sequences, batch_first=True)

    return tokens_tensors, label_ids

### trainloader

In [21]:
# Build the training DataLoader.
# `collate_fn` turns the list of samples drawn for a batch into padded tensors
# via create_mini_batch.
# NOTE(review): `shuffle` is not passed, so batches follow the row order of
# train.tsv in every epoch; confirm this is intended for training.
BATCH_SIZE = 234

trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

In [22]:
data = next(iter(trainloader))

tokens_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}

------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([234, 36, 100]) 
tensor([[[-4.2262e-01, -5.8568e-01,  1.2505e-01,  ...,  2.6792e-01,
          -8.4427e-02, -4.7673e-02],
         [ 1.6658e-02, -1.2981e-02,  5.4732e-02,  ...,  6.2373e-03,
           1.7682e-02,  2.9276e-02],
         [ 5.9510e-02, -3.1535e-02,  1.3201e-02,  ...,  6.3179e-02,
           1.0543e-02, -6.6070e-02],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[ 8.5433e-02, -6.4166e-02, -7.8663e-03,  ..., -8.4811e-03,
           2.1073e-02,  8.2014e-02],
         [ 3.1490e-02, -7.2415e-03, -3.5448e-03,  ...,  1.2513e-02,
           1.1206e-02, -2.7145e-02],
         [ 9.8651e-03, -8.7715e-02, -1.0489e-01,  ...,  5.2346e-02,
          -6.6690e-03,

## Bi-LSTM model

In [23]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
num_classes = 2  # the dataset's label_map has two entries: real / fake
EPOCHS = 15
learning_rate = 0.01

input_size = 100  # same value that is passed to fasttext.util.reduce_model
hidden_size = 128
num_layers = 2

In [24]:
class BiLSTM(nn.Module):
    """Bidirectional multi-layer LSTM followed by a linear classifier.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence 2 * hidden_size input features.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for a batch-first input `x`."""
        # Zero initial states, allocated on the input's own device and dtype so
        # the module does not depend on a module-level `device` variable.
        state_shape = (self.num_layers * 2, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (h0, c0))
        # Classify from the output at the last time step.
        # NOTE(review): with zero-padded batches the last step of a shorter
        # sequence is padding; confirm this readout is intended.
        out = self.fc(out[:, -1, :])

        return out

In [25]:
model = BiLSTM(input_size, hidden_size, num_layers, num_classes).to(device)

## Prediction

In [26]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` and return the predicted class indices.

    Args:
        model: Module mapping a batch of token tensors to per-class scores.
        dataloader: Iterable of batches; index 0 of a batch is the token
            tensor and index 1 the label tensor (only read when
            `compute_acc` is True).
        compute_acc: When True, also return the fraction of correct
            predictions.

    Returns:
        The predictions of all batches concatenated into one tensor, or None
        when `dataloader` yields no batch. With `compute_acc=True` a tuple
        ``(predictions, acc)`` is returned; `acc` is NaN when no sample was
        seen.
    """
    # Run on whatever device the model's parameters live on rather than a
    # hard-coded "cuda:0".
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    batch_preds = []
    correct = 0
    total = 0

    # Predict in eval mode, then restore the caller's mode so that calling
    # this inside a training loop does not leave the model in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in dataloader:
                tokens_tensors = data[0]
                if model_device is not None:
                    tokens_tensors = tokens_tensors.to(model_device)
                outputs = model(tokens_tensors)

                # Index of the highest score along the class dimension.
                _, pred = torch.max(outputs, 1)

                if compute_acc:
                    labels = data[1]
                    if model_device is not None:
                        labels = labels.to(model_device)
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                # Collect per-batch results and concatenate once at the end.
                batch_preds.append(pred)
    finally:
        model.train(was_training)

    predictions = torch.cat(batch_preds) if batch_preds else None

    if compute_acc:
        acc = correct / total if total else float("nan")
        return predictions, acc
    return predictions
    

## Start training

In [27]:
%%time

# Put the model in training mode before optimizing.
model.train()

# Adam optimizer over all model parameters, with cross-entropy on the logits
# returned by the model.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()


for epoch in range(EPOCHS):
    
    # Sum (not mean) of the per-batch losses seen during this epoch.
    running_loss = 0.0
    for data in trainloader:
        
        # Move the padded token batch and its labels to the training device.
        tokens_tensors, labels = [t.to(device) for t in data]

        # set the gradients to zero
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(tokens_tensors)

        loss = criterion(outputs, labels)
        
        # record batch loss
        running_loss += loss.item()
        
        # backward pass, then one optimizer update
        loss.backward()
        optimizer.step()
        
    # Accuracy is measured on the training data itself (trainloader), after
    # the epoch's updates.
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

[epoch 1] loss: 19.463, acc: 0.525
[epoch 2] loss: 19.445, acc: 0.527
[epoch 3] loss: 19.306, acc: 0.561
[epoch 4] loss: 18.807, acc: 0.700
[epoch 5] loss: 17.086, acc: 0.741
[epoch 6] loss: 14.123, acc: 0.825
[epoch 7] loss: 10.425, acc: 0.846
[epoch 8] loss: 9.295, acc: 0.872
[epoch 9] loss: 8.214, acc: 0.888
[epoch 10] loss: 7.671, acc: 0.886
[epoch 11] loss: 7.175, acc: 0.872
[epoch 12] loss: 7.031, acc: 0.894
[epoch 13] loss: 6.361, acc: 0.922
[epoch 14] loss: 5.933, acc: 0.927
[epoch 15] loss: 5.480, acc: 0.936
Wall time: 2min 11s


## Validation

In [28]:
valiset = FakeNewsDataset("validation", tokenizer=tokenizer)

In [29]:
# Batch the validation set with the same collate function used for training.
valiloader = DataLoader(valiset, batch_size=64, collate_fn=create_mini_batch)

# The "validation" dataset mode yields no label tensors, so predictions are
# requested without accuracy (compute_acc keeps its default of False).
with torch.no_grad():
    validations = get_predictions(model, valiloader)
# Release cached GPU memory once inference is done.
torch.cuda.empty_cache()

In [30]:
# Copy the predictions to the CPU as a column vector; -1 lets NumPy infer the
# row count instead of hard-coding the validation-set size.
validations_numpy = validations.cpu().clone().numpy().reshape((-1, 1))

In [31]:
# 1-based ids, one per prediction, as a column vector; the count comes from
# the predictions themselves rather than a hard-coded 2140.
ids = np.arange(1, len(validations_numpy) + 1).reshape((-1, 1))

In [32]:
validations_array = np.concatenate((ids, validations_numpy), axis=1)
validations_array

array([[   1,    1],
       [   2,    1],
       [   3,    1],
       ...,
       [2138,    1],
       [2139,    0],
       [2140,    0]], dtype=int64)

In [33]:
df_validations = pd.DataFrame(data = validations_array, columns=["id", "label"])
df_validations['label'] = df_validations['label'].map({0:'real', 1:'fake'})
# df_validations.to_csv('answer.txt', index = False)

### Calculate F1 score

In [34]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

f1 = f1_score(df_val['label'], df_validations['label'], average = 'weighted')
Acc = accuracy_score(df_val['label'], df_validations['label'])

print('f1: %.3f, acc: %.3f' % (f1, Acc))

f1: 0.883, acc: 0.883
