In [7]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report


pd.set_option('display.max_columns', None)


#######################

df2 = pd.read_csv("liar_dataset/train.tsv", sep='\t')
df2.columns = ['Name', 'fraudulent', 'description', 'Weight', 'a', 'b', 'c', 'Name', 'Code', 'Age', 'Weight', 'a', 'b', 'c']

df2 = df2[['fraudulent', 'description']]

for i in range(len(df2)):
    if(df2.at[i,'fraudulent'] == 'half-true'):
        df2.at[i,'fraudulent'] = 0
    elif(df2.at[i,'fraudulent'] == 'barely-true'):
        df2.at[i,'fraudulent'] = 0
    elif(df2.at[i,'fraudulent'] == 'true'):
        df2.at[i,'fraudulent'] = 0    
    else:
        df2.at[i,'fraudulent'] = 1

df = df2

from collections import Counter
print(Counter(df['fraudulent'].values))

df_fraudulent= df[df['fraudulent'] == 1] 
df_normal = df[df['fraudulent'] == 0] 
df_normal = df_normal.sample(n=len(df_fraudulent))
df = df_normal.append(df_fraudulent)
df = df.sample(frac=1, random_state = 24).reset_index(drop=True)

print(Counter(df['fraudulent'].values))


train_data = df.head(10)
print(train_data)
test_data = df.tail(10)
print(test_data)

train_data = [{'description': description, 'fraudulent': fraudulent } for description in list(train_data['description']) for fraudulent in list(train_data['fraudulent'])]
test_data = [{'description': description, 'fraudulent': fraudulent } for description in list(test_data['description']) for fraudulent in list(test_data['fraudulent'])]

train_texts, train_labels = list(zip(*map(lambda d: (d['description'], d['fraudulent']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['description'], d['fraudulent']), test_data)))

Counter({0: 5444, 1: 4795})
Counter({0: 4795, 1: 4795})
  fraudulent                                        description
0          0  Montana Democratic Sen. Jon Tester is the larg...
1          0  Obama "promises more taxes on small business, ...
2          1  Newt Gingrich was fined $300,000 for ethics vi...
3          0  Since the time of the Civil War, weve made a d...
4          1  Newly Elected Republican Senators Sign Pledge ...
5          1  Of our 98,000 teachers who are K-12, over 53,0...
6          1  The Pentagon made up the since-debunked heroic...
7          0  To say a family has to buy a product from a pr...
8          0   90 percent of fires in Arizona are human-caused.
9          1  Says Steve Jobs was responsible for creating m...
     fraudulent                                        description
9580          0  Says that hes responsible for Austinincluding ...
9581          1   Says Rick Perry wanted to secede from the union.
9582          0  Says Ron Johnsons comp

In [8]:
print(train_data[0]['description'])

Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.


In [9]:
train_data

[{'description': 'Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.',
  'fraudulent': 0},
 {'description': 'Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.',
  'fraudulent': 0},
 {'description': 'Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.',
  'fraudulent': 1},
 {'description': 'Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.',
  'fraudulent': 0},
 {'description': 'Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.',
  'fraudulent': 1},
 {'description': 'Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.',
  'fraudulent': 1},
 {'description': 'Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.',
  'fraudulent': 1},
 {'description': 'Montana Democratic Sen. Jon Tester is the largest recipient of lobbyist money.',
  'fraudulent': 0},
 {'description': 'Montana Democratic Sen. Jon Te

In [10]:
df = pd.read_csv("data/Constraint_Train.csv")
df.rename(columns={'label':'fraudulent', 'tweet':'description'}, inplace=True)
df = df[['description', 'fraudulent']]
for i in range(len(df)):
    if(df.at[i,'fraudulent'] == 'real'):
        df.at[i,'fraudulent'] = 0
    else:
        df.at[i,'fraudulent'] = 1
    
    


df

Unnamed: 0,description,fraudulent
0,The CDC currently reports 99031 deaths. In gen...,0
1,States reported 1121 deaths a small rise from ...,0
2,Politically Correct Woman (Almost) Uses Pandem...,1
3,#IndiaFightsCorona: We have 1524 #COVID testin...,0
4,Populous states can generate large case counts...,0
...,...,...
6415,A tiger tested positive for COVID-19 please st...,1
6416,???Autopsies prove that COVID-19 is??� a blood...,1
6417,_A post claims a COVID-19 vaccine has already ...,1
6418,Aamir Khan Donate 250 Cr. In PM Relief Cares Fund,1


In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))
train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")

train_y = np.array(train_labels) == 1
test_y = np.array(test_labels) == 1


class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba
    
    
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()


BATCH_SIZE = 1
train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)


EPOCHS = 1
bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))
        
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
print(classification_report(test_y, bert_predicted))

Epoch:  1
0/100.0 loss: 0.5626635551452637 
Epoch:  1
1/100.0 loss: 0.670863538980484 
Epoch:  1
2/100.0 loss: 0.6657837430636088 
Epoch:  1
3/100.0 loss: 0.6658264994621277 
Epoch:  1
4/100.0 loss: 0.6882667422294617 
Epoch:  1
5/100.0 loss: 0.7081780433654785 
Epoch:  1
6/100.0 loss: 0.7193942240306309 
Epoch:  1
7/100.0 loss: 0.708170585334301 
Epoch:  1
8/100.0 loss: 0.6965248783429464 
Epoch:  1
9/100.0 loss: 0.6873061180114746 
Epoch:  1
10/100.0 loss: 0.6918655308810148 
Epoch:  1
11/100.0 loss: 0.6937223970890045 
Epoch:  1
12/100.0 loss: 0.6818903317818275 
Epoch:  1
13/100.0 loss: 0.6793080057416644 
Epoch:  1
14/100.0 loss: 0.6833211501439412 
Epoch:  1
15/100.0 loss: 0.6861630789935589 
Epoch:  1
16/100.0 loss: 0.6912383507279789 
Epoch:  1
17/100.0 loss: 0.708060277832879 
Epoch:  1
18/100.0 loss: 0.7081224636027688 
Epoch:  1
19/100.0 loss: 0.7038881778717041 
Epoch:  1
20/100.0 loss: 0.6946225109554472 
Epoch:  1
21/100.0 loss: 0.7021523984995756 
Epoch:  1
22/100.0 loss