In [None]:
import numpy as np
import pandas as pd
import torch as torch
from pytorch_pretrained_bert import BertModel
from pytorch_pretrained_bert import BertTokenizer

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import statistics

In [None]:
import time

In [None]:
from functools import partial
# from dataclasses import dataclass
from collections import OrderedDict

In [None]:
import torch.nn as nn

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the BERT tokenizer and encoder weights from 'bertPytorch/bert-base-cased'.
# NOTE(review): the Embedder class further down loads a BERT model from a different
# (absolute) path; confirm both locations hold the same checkpoint.
bert_tokenizer=BertTokenizer.from_pretrained('bertPytorch/bert-base-cased')
embedding_model=BertModel.from_pretrained('bertPytorch/bert-base-cased')

In [None]:
# Move the module-level BERT encoder to the GPU (this call needs a CUDA device).
embedding_model.cuda()

In [None]:
def BertEmbedding(ids,masks):
    """Run the module-level ``embedding_model`` on a batch and return its first output.

    Args:
        ids: token ids for the batch (presumably a LongTensor of shape
            (batch, seq_len) -- TODO confirm against callers).
        masks: attention mask aligned with ``ids`` (non-zero for real tokens).

    Returns:
        The first element of the model output (in pytorch_pretrained_bert this
        is the encoded-layers output; the pooled output is discarded).
    """
    # pytorch_pretrained_bert's BertModel.forward is
    # (input_ids, token_type_ids=None, attention_mask=None, ...), so the mask
    # has to be passed by keyword; positionally it would be used as segment ids.
    embeddings = embedding_model(ids, attention_mask=masks)
    return embeddings[0]

In [None]:
class Conv2dAuto(nn.Conv2d):
    """``nn.Conv2d`` whose padding is derived from its kernel size.

    After the regular ``nn.Conv2d`` construction, ``padding`` is overwritten
    with half of each kernel dimension (integer division), replacing whatever
    padding value was passed in.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        kernel_h, kernel_w = self.kernel_size[0], self.kernel_size[1]
        # Padding follows the kernel size, one entry per spatial dimension.
        self.padding = (kernel_h // 2, kernel_w // 2)


# Factory for 3x3 auto-padded convolutions without a bias term.
conv3x3 = partial(Conv2dAuto, kernel_size=3, bias=False)

In [None]:
# Smoke check: build one 3x3 auto-padded convolution, print its repr, then
# drop the temporary name so it does not linger in the notebook namespace.
conv = conv3x3(in_channels=32, out_channels=64)
print(conv)
del conv

In [None]:
def activation_func(activation):
    """Return a newly constructed activation module selected by name.

    Args:
        activation: one of 'relu', 'leaky_relu', 'selu', 'sigmoid', 'none'.

    Returns:
        The matching ``nn.Module`` ('none' maps to ``nn.Identity``).

    Raises:
        KeyError: if ``activation`` is not one of the supported names.
    """
    available = nn.ModuleDict({
        'relu': nn.ReLU(inplace=True),
        'leaky_relu': nn.LeakyReLU(negative_slope=0.01, inplace=True),
        'selu': nn.SELU(inplace=True),
        'sigmoid': nn.Sigmoid(),
        'none': nn.Identity(),
    })
    return available[activation]

In [None]:
class ResidualBlock(nn.Module):
    """Generic residual block: ``activate(blocks(x) + shortcut_or_identity(x))``.

    ``blocks`` and ``shortcut`` default to ``nn.Identity``; subclasses replace
    them with real layers.

    Args:
        in_channels: number of input channels (stored; used to decide on the shortcut).
        out_channels: number of output channels (stored; used to decide on the shortcut).
        activation: name understood by ``activation_func``.
    """

    def __init__(self, in_channels, out_channels, activation='relu'):
        super().__init__()
        self.in_channels, self.out_channels, self.activation = in_channels, out_channels, activation
        self.blocks = nn.Identity()
        self.activate = activation_func(activation)
        self.shortcut = nn.Identity()

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum with the residual."""
        residual = x
        if self.should_apply_shortcut:
            residual = self.shortcut(x)
        x = self.blocks(x)
        # Out-of-place add: when ``blocks`` is an Identity, ``x`` is the caller's
        # tensor, and ``x += residual`` would silently modify the input.
        x = x + residual
        x = self.activate(x)
        return x

    @property
    def should_apply_shortcut(self):
        """True when input and output channel counts differ."""
        return self.in_channels != self.out_channels

In [None]:
# Smoke check: run a bare ResidualBlock (Identity blocks and shortcut) on a
# 1x1x1x1 tensor of ones; the cell output shows the result.
dummy = torch.ones((1, 1, 1, 1))

block = ResidualBlock(1, 64)
block(dummy)

In [None]:
class ResNetResidualBlock(ResidualBlock):
    """Residual block with channel expansion and a projection shortcut.

    When the input channel count differs from ``out_channels * expansion``, the
    shortcut is a 1x1 convolution (stride ``downsampling``, no bias) followed by
    batch normalisation; otherwise ``shortcut`` is set to ``None`` and the
    input itself is used as the residual.

    Args:
        in_channels: input channel count.
        out_channels: base output channel count (before expansion).
        expansion: multiplier applied to ``out_channels`` for the block output.
        downsampling: stride used by the shortcut convolution.
        conv: convolution factory stored on the block for subclasses to use.
        *args, **kwargs: forwarded to ``ResidualBlock``.
    """

    def __init__(self, in_channels, out_channels, expansion=1, downsampling=1, conv=conv3x3, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, **kwargs)
        self.expansion = expansion
        self.downsampling = downsampling
        self.conv = conv
        if self.should_apply_shortcut:
            projection = nn.Conv2d(self.in_channels, self.expanded_channels, kernel_size=1,
                                   stride=self.downsampling, bias=False)
            self.shortcut = nn.Sequential(projection, nn.BatchNorm2d(self.expanded_channels))
        else:
            # Never called in this case: forward() only uses it when the check is true.
            self.shortcut = None

    @property
    def expanded_channels(self):
        """Channel count produced by the block: ``out_channels * expansion``."""
        return self.out_channels * self.expansion

    @property
    def should_apply_shortcut(self):
        """True when the input channels differ from the expanded output channels."""
        return self.in_channels != self.expanded_channels


In [None]:
def conv_bn(in_channels, out_channels, conv, *args, **kwargs):
    """Build a convolution followed by 2-D batch normalisation.

    Args:
        in_channels: input channels passed to ``conv``.
        out_channels: output channels passed to ``conv`` and to ``nn.BatchNorm2d``.
        conv: callable producing the convolution layer.
        *args, **kwargs: extra arguments forwarded to ``conv``.

    Returns:
        ``nn.Sequential`` of the convolution and the batch-norm layer.
    """
    convolution = conv(in_channels, out_channels, *args, **kwargs)
    normalization = nn.BatchNorm2d(out_channels)
    return nn.Sequential(convolution, normalization)


In [None]:
class ResNetBasicBlock(ResNetResidualBlock):
    """
    Basic ResNet block: conv/batch-norm, activation, conv/batch-norm.

    Both convolutions come from ``self.conv`` (bias disabled); the first one
    carries the ``downsampling`` stride.
    """
    expansion = 1

    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, **kwargs)
        first_stage = conv_bn(self.in_channels, self.out_channels, conv=self.conv,
                              bias=False, stride=self.downsampling)
        second_stage = conv_bn(self.out_channels, self.expanded_channels, conv=self.conv,
                               bias=False)
        self.blocks = nn.Sequential(
            first_stage,
            activation_func(self.activation),
            second_stage,
        )

In [None]:
class ResNetBottleNeckBlock(ResNetResidualBlock):
    """Bottleneck residual block: 1x1, 3x3 and 1x1 conv/batch-norm stages.

    The block output has ``out_channels * 4`` channels; the 3x3 stage carries
    the ``downsampling`` stride, and an activation follows each of the first
    two stages.
    """
    expansion = 4

    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, expansion=4, **kwargs)
        reduce_stage = conv_bn(self.in_channels, self.out_channels, self.conv, kernel_size=1)
        spatial_stage = conv_bn(self.out_channels, self.out_channels, self.conv,
                                kernel_size=3, stride=self.downsampling)
        expand_stage = conv_bn(self.out_channels, self.expanded_channels, self.conv, kernel_size=1)
        self.blocks = nn.Sequential(
            reduce_stage,
            activation_func(self.activation),
            spatial_stage,
            activation_func(self.activation),
            expand_stage,
        )

In [None]:
class ResNetLayer(nn.Module):
    """
    A ResNet layer: ``n`` blocks of the same type applied in sequence.

    The first block maps ``in_channels`` to the layer's width and uses stride 2
    when ``in_channels != out_channels`` (stride 1 otherwise); the remaining
    ``n - 1`` blocks keep the resolution.
    """

    def __init__(self, in_channels, out_channels, block=ResNetBasicBlock, n=1, *args, **kwargs):
        super().__init__()
        # Downsampling is done by the strided convolution of the first block.
        if in_channels != out_channels:
            first_stride = 2
        else:
            first_stride = 1
        first_block = block(in_channels, out_channels, *args, downsampling=first_stride, **kwargs)
        following_blocks = [
            block(out_channels * block.expansion, out_channels, *args, downsampling=1, **kwargs)
            for _ in range(n - 1)
        ]
        self.blocks = nn.Sequential(first_block, *following_blocks)

    def forward(self, x):
        """Run ``x`` through every block of the layer."""
        return self.blocks(x)

In [None]:
class ResNetEncoder(nn.Module):
    """
    ResNet feature extractor: an input gate followed by layers of growing width.

    The gate is a 7x7 stride-2 convolution, batch norm, the chosen activation
    and a 3x3 stride-2 max pool. One ``ResNetLayer`` is then built per entry of
    ``blocks_sizes``, with ``deepths`` giving the number of blocks per layer.
    Extra ``*args``/``**kwargs`` are forwarded to every ``ResNetLayer``.
    """

    def __init__(self, in_channels=3, blocks_sizes=[64, 128, 256, 512], deepths=[2,2,2,2], 
                 activation='relu', block=ResNetBasicBlock, *args, **kwargs):
        super().__init__()
        self.blocks_sizes = blocks_sizes
        stem_width = self.blocks_sizes[0]

        self.gate = nn.Sequential(
            nn.Conv2d(in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(stem_width),
            activation_func(activation),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

        # Consecutive (previous width, next width) pairs for layers after the first.
        self.in_out_block_sizes = list(zip(blocks_sizes, blocks_sizes[1:]))

        first_layer = ResNetLayer(blocks_sizes[0], blocks_sizes[0], n=deepths[0],
                                  activation=activation, block=block, *args, **kwargs)
        later_layers = [
            ResNetLayer(previous_width * block.expansion, next_width, n=depth,
                        activation=activation, block=block, *args, **kwargs)
            for (previous_width, next_width), depth in zip(self.in_out_block_sizes, deepths[1:])
        ]
        self.blocks = nn.ModuleList([first_layer, *later_layers])

    def forward(self, x):
        """Apply the gate and then each layer in order."""
        features = self.gate(x)
        for layer in self.blocks:
            features = layer(features)
        return features

In [None]:
class ResnetDecoder(nn.Module):
    """
    Tail of the ResNet: global average pooling, a fully connected layer and a
    sigmoid applied to its output.

    Args:
        in_features: number of channels of the incoming feature map.
        n_classes: number of outputs of the fully connected layer.
    """

    def __init__(self, in_features, n_classes):
        super().__init__()
        self.avg = nn.AdaptiveAvgPool2d((1, 1))
        self.decoder = nn.Linear(in_features, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Pool ``x`` to 1x1, flatten per sample, project and squash to (0, 1)."""
        pooled = self.avg(x).flatten(start_dim=1)
        return self.sigmoid(self.decoder(pooled))


In [None]:
class ResNet(nn.Module):
    """Full network: a ``ResNetEncoder`` followed by a ``ResnetDecoder``.

    Args:
        in_channels: channel count of the input tensor.
        n_classes: number of outputs produced by the decoder.
        *args, **kwargs: forwarded to ``ResNetEncoder``.
    """

    def __init__(self, in_channels, n_classes, *args, **kwargs):
        super().__init__()
        self.encoder = ResNetEncoder(in_channels, *args, **kwargs)
        # The decoder width is read from the last block of the last encoder layer.
        final_block = self.encoder.blocks[-1].blocks[-1]
        self.decoder = ResnetDecoder(final_block.expanded_channels, n_classes)

    def forward(self, x):
        """Encode ``x`` and decode the resulting feature map."""
        return self.decoder(self.encoder(x))

In [None]:
def resnet152(in_channels, n_classes, block=ResNetBottleNeckBlock, *args, **kwargs):
    """Build a ``ResNet`` with layer depths [3, 8, 36, 3].

    Args:
        in_channels: channel count of the input tensor.
        n_classes: number of outputs of the network.
        block: residual block class used in every layer.
        *args, **kwargs: forwarded to ``ResNet``.
    """
    layer_depths = [3, 8, 36, 3]
    return ResNet(in_channels, n_classes, *args, block=block, deepths=layer_depths, **kwargs)

In [None]:
# Build a ResNet-152 with 2 input channels and 1 output, move it to the GPU and
# print its structure. `model` is reassigned in a later cell.
# NOTE(review): the trailing (3, 224, 224) tuple is only printed next to the
# model; it looks like a leftover from a model-summary call -- confirm.
model = resnet152(2, 1, activation='sigmoid')
print(model.cuda(), (3, 224, 224))

In [None]:
class FakeNewsClassifier(nn.Module):
    """Thin wrapper that forwards its input to the supplied network.

    Args:
        resnet_architecture: module stored as ``self.resnet`` and called in ``forward``.
        embedding_func: accepted for interface compatibility; it is neither
            stored nor called by this class.
    """

    def __init__(self, resnet_architecture, embedding_func):
        super().__init__()
        self.resnet = resnet_architecture

    def forward(self, x):
        """Return the wrapped network's output for ``x``."""
        return self.resnet(x)

In [None]:
# Classifier used for training: a ResNet-152 with 128 input channels and a single
# output. BertEmbedding is passed as the second argument, but FakeNewsClassifier
# does not store or call it.
model = FakeNewsClassifier(resnet152(128, 1, activation='sigmoid'), BertEmbedding)

In [None]:
def getIds(inp):
    """Tokenize sentences with the module-level ``bert_tokenizer`` and pad to 128 ids.

    NOTE: the following cell defines ``getIds`` again; that later definition is
    the one in effect after the notebook runs top to bottom.

    Args:
        inp: iterable of sentences (strings).

    Returns:
        ``(ids, masks)``: per sentence, ``ids`` holds the padded id array
        returned by ``pad_sequences`` for a one-element batch, and ``masks``
        holds a one-element list with the 0/1 mask (1 where the id is > 0).
    """
    ids, masks = [], []
    for sent in inp:
        tokens = ['[CLS]'] + bert_tokenizer.tokenize(sent) + ['[SEP]']
        token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
        # Pad/truncate at the end; sequences longer than 128 are cut from the tail.
        padded = pad_sequences([token_ids], maxlen=128, dtype="long", truncating="post", padding="post")
        flags = [1 if token_id > 0 else 0 for row in padded for token_id in row]
        masks.append([flags])
        ids.append(padded)
    return (ids, masks)

In [None]:
def getIds(inp):
    """Convert sentences into padded BERT token ids plus 0/1 attention masks.

    Uses the module-level ``bert_tokenizer``; every sentence is wrapped in
    '[CLS]' / '[SEP]' and padded or truncated (at the end) to 128 ids.

    Args:
        inp: iterable of sentences (strings).

    Returns:
        ``(ids, masks)`` with one entry per sentence: the padded id array as
        returned by ``pad_sequences`` for a one-element batch, and a
        one-element list holding the mask (1 where the id is > 0, else 0).
    """
    def _encode(sentence):
        # Token ids for one sentence, padded as a batch of size one.
        pieces = ['[CLS]'] + bert_tokenizer.tokenize(sentence) + ['[SEP]']
        piece_ids = [bert_tokenizer.convert_tokens_to_ids(pieces)]
        padded_ids = pad_sequences(piece_ids, maxlen=128, dtype="long", truncating="post", padding="post")
        mask = []
        for row in padded_ids:
            mask.extend(int(value > 0) for value in row)
        return padded_ids, [mask]

    encoded = [_encode(sent) for sent in inp]
    ids = [padded_ids for padded_ids, _ in encoded]
    masks = [mask for _, mask in encoded]
    return (ids, masks)

In [None]:
class DataGenerator:
    """Build train/validation/test DataLoaders from a frame with a 'split' column.

    Rows are selected by ``data['split']`` being 'TRAIN', 'VALID' or 'TEST';
    the 'text' column is tokenized with ``getIds`` and 'truth' supplies labels.

    Args:
        data: DataFrame with 'split', 'text' and 'truth' columns.

    Attributes set: ``train_dataloader``, ``valid_dataloader``, ``test_dataloader``.
    """

    def __init__(self,data):
        batch_size=40

        trainDf=data[data['split']=='TRAIN']
        valDf=data[data['split']=='VALID']
        testDf=data[data['split']=='TEST']
        print(trainDf.head())
        print(valDf.head())
        print(testDf.head())

        # Each loader gets a sampler built over its OWN dataset; sharing the test
        # sampler with the validation set would draw indices for the wrong dataset.
        self.train_dataloader = self._build_dataloader(trainDf, batch_size)
        self.valid_dataloader = self._build_dataloader(valDf, batch_size)
        self.test_dataloader = self._build_dataloader(testDf, batch_size)

    @staticmethod
    def _build_dataloader(split_df, batch_size):
        """Tokenize one split and wrap (ids, masks, labels) in a randomly sampled DataLoader."""
        ids, attention_masks = getIds(split_df['text'].values)
        # NOTE(review): getIds returns one (1, 128) array per sentence, so these
        # tensors presumably carry an extra singleton axis -- confirm downstream use.
        dataset = TensorDataset(
            torch.tensor(ids),
            torch.tensor(attention_masks),
            torch.tensor(split_df['truth'].values),
        )
        return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

    @staticmethod
    def _batches_on_device(dataloader):
        """Yield every batch of ``dataloader`` with its tensors moved to the global ``device``."""
        for batch in dataloader:
            yield tuple(t.to(device) for t in batch)

    def generate_train_batch_data(self):
        """Yield training batches as (ids, masks, labels) tuples on ``device``."""
        yield from self._batches_on_device(self.train_dataloader)

    def generate_valid_batch_data(self):
        """Yield validation batches as (ids, masks, labels) tuples on ``device``."""
        yield from self._batches_on_device(self.valid_dataloader)

    def generate_test_batch_data(self):
        """Yield test batches as (ids, masks, labels) tuples on ``device``."""
        yield from self._batches_on_device(self.test_dataloader)

In [None]:
class DataGeneratorv2:
    """Load the news CSV, split it 70/15/15 and expose DataLoaders per split.

    The CSV is shuffled, the first 1000 rows are kept, and the 'text' / 'truth'
    columns are split into train, validation and test sets. Text is converted
    with ``prepare_for_bert``.

    Attributes set: ``train_dataloader``, ``valid_dataloader``, ``test_dataloader``.
    """

    def __init__(self):
        MAX_LEN = 128
        batch_size=40
        data=pd.read_csv('/home/gridsan/svattam/upama/FakeNews/combined_relevant_data.csv')
        # Shuffle, then keep a 1000-row subset. No seed is set, so the subset
        # and the splits differ between runs.
        df = data.sample(frac=1).reset_index(drop=True)
        data=df.iloc[0:1000]
        text=data[['text']]
        # Double brackets keep labels 2-D: one column per row.
        labels=data[['truth']].to_numpy()
        # NOTE(review): do_lower_case=True is combined with a checkpoint directory
        # named 'bert-base-cased'; a cased vocabulary normally expects
        # do_lower_case=False -- confirm which is intended.
        tokenizer= BertTokenizer.from_pretrained('/home/gridsan/svattam/bert/bert-base-cased', do_lower_case=True)

        # 70% train, then the held-out 30% is halved into validation and test.
        x_train, x_holdout, y_train, y_holdout = train_test_split(text,labels, test_size=0.3)
        x_val, x_test, y_val, y_test = train_test_split(x_holdout, y_holdout, test_size=0.5)

        # Each loader gets a sampler built over its OWN dataset; sharing the test
        # sampler with the validation set would draw indices for the wrong dataset.
        self.train_dataloader = self._build_dataloader(x_train, y_train, tokenizer, MAX_LEN, batch_size)
        self.valid_dataloader = self._build_dataloader(x_val, y_val, tokenizer, MAX_LEN, batch_size)
        self.test_dataloader = self._build_dataloader(x_test, y_test, tokenizer, MAX_LEN, batch_size)

    @staticmethod
    def _build_dataloader(text_df, labels, tokenizer, max_len, batch_size):
        """Tokenize one split and wrap (ids, masks, labels) in a randomly sampled DataLoader."""
        inputs, attention_masks = prepare_for_bert(text_df, tokenizer, max_len)
        dataset = TensorDataset(
            torch.tensor(inputs),
            torch.tensor(attention_masks),
            torch.tensor(labels),
        )
        return DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)

    @staticmethod
    def _batches_on_device(dataloader):
        """Yield every batch of ``dataloader`` with its tensors moved to the global ``device``."""
        for batch in dataloader:
            yield tuple(t.to(device) for t in batch)

    def generate_train_batch_data(self):
        """Yield training batches as (ids, masks, labels) tuples on ``device``."""
        yield from self._batches_on_device(self.train_dataloader)

    def generate_valid_batch_data(self):
        """Yield validation batches as (ids, masks, labels) tuples on ``device``."""
        yield from self._batches_on_device(self.valid_dataloader)

    def generate_test_batch_data(self):
        """Yield test batches as (ids, masks, labels) tuples on ``device``."""
        yield from self._batches_on_device(self.test_dataloader)


In [None]:
# Load the combined dataset from an absolute, machine-specific path.
# DataGeneratorv2 reads this same file on its own inside __init__.
data=pd.read_csv('/home/gridsan/svattam/upama/FakeNews/combined_relevant_data.csv')

In [None]:
# Shuffle all rows (no random seed is set) and renumber the index from zero.
df = data.sample(frac=1).reset_index(drop=True)


In [None]:
# Keep the first 100 shuffled rows.
# NOTE(review): sample_data is not referenced by any other cell shown here.
sample_data=df.iloc[0:100]

In [None]:
def prepare_for_bert(df, tokenizer, MAX_LEN):
    """Turn the 'text' column of ``df`` into padded token ids and attention masks.

    Each sentence is wrapped as "[CLS] ... [SEP]", tokenized, converted to ids
    and padded/truncated at the end to ``MAX_LEN``.

    Args:
        df: frame with a 'text' column of strings.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        MAX_LEN: target sequence length.

    Returns:
        ``(input_ids, attention_masks)``: the padded id array from
        ``pad_sequences`` and a list of per-sequence float masks (1.0 where the
        id is > 0, otherwise 0.0).
    """
    wrapped = ["[CLS] " + sentence + " [SEP]" for sentence in df['text'].values]
    id_sequences = []
    for sentence in wrapped:
        tokens = tokenizer.tokenize(sentence)
        id_sequences.append(tokenizer.convert_tokens_to_ids(tokens))
    input_ids = pad_sequences(id_sequences, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # Padding positions carry id 0, so they are masked out.
    attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]
    return input_ids, attention_masks

In [None]:
# Build the data loaders (reads the CSV and tokenizes the text). Iterating the
# batch generators additionally needs the global `device` from the next cell.
data_gen=DataGeneratorv2()

In [None]:
# Device used by the batch generators: GPU when available, otherwise CPU.
# NOTE(review): other cells call .cuda() directly, so a CPU-only run would
# still fail there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
class Embedder:
    """Holds a pretrained BERT model used as a fixed feature extractor.

    The model is loaded from a local checkpoint directory, moved to the GPU
    and switched to evaluation mode.
    """

    def __init__(self):
        self.embedding_model = BertModel.from_pretrained('/home/gridsan/svattam/bert/bert-base-cased', from_tf=False)
        self.embedding_model.cuda()
        # Evaluation mode disables dropout, so the extracted embeddings are
        # deterministic for a given input.
        self.embedding_model.eval()

In [None]:
# Instantiate the BERT feature extractor used by the training loop.
embedder = Embedder()


In [None]:
# Move the classifier to the GPU before the optimizer is created in the next cell.
model.cuda()

In [None]:
# Training configuration.
n_epochs = 10
batch_size = 40  # NOTE(review): not read below; the data generator sets its own batch size
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.BCELoss()
# One row per epoch: [train accuracy, train loss, validation accuracy, validation loss].
metrics=[]


def _embed_batch(input_ids, input_mask):
    """Return the BERT features for one batch as a float tensor, without gradients."""
    with torch.no_grad():
        embeddings = embedder.embedding_model(input_ids, attention_mask=input_mask)
        # The first model output is a sequence of per-layer tensors; they are
        # stacked along a new axis at position 3.
        stacked = torch.stack(embeddings[0], dim=3)
    return stacked.float()


def _batch_accuracy(y_pred, y_true):
    """Accuracy after thresholding predictions (and labels) at 0.5.

    The model has a single sigmoid output, so an argmax over that one column
    would always return class 0 and report a meaningless accuracy of 1.0.
    Assumes labels are 0/1 -- TODO confirm against the 'truth' column.
    """
    predicted = (y_pred.detach().cpu().numpy() >= 0.5).astype(int).ravel()
    expected = (y_true.detach().cpu().numpy() >= 0.5).astype(int).ravel()
    return accuracy_score(expected, predicted)


def _mean_or_nan(values):
    """Mean of ``values``, or NaN when no batch contributed."""
    return statistics.mean(values) if values else float("nan")


for epoch in range(n_epochs):
    start_time = time.time()
    train_loss = []
    val_loss=[]
    train_acc=[]
    val_acc=[]

    # Training pass
    model.train(True)
    for b_input_ids, b_input_mask, labels in data_gen.generate_train_batch_data():
        X_batch = _embed_batch(b_input_ids, b_input_mask)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        # BCELoss needs prediction and target of identical shape.
        y_batch = labels.float().reshape(y_pred.shape)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
        train_acc.append(_batch_accuracy(y_pred, y_batch))

    # Validation pass (no gradients, no parameter updates)
    model.eval()
    with torch.no_grad():
        for b_input_ids, b_input_mask, labels in data_gen.generate_valid_batch_data():
            X_batch = _embed_batch(b_input_ids, b_input_mask)
            y_pred = model(X_batch)
            y_batch = labels.float().reshape(y_pred.shape)
            val_loss.append(loss_fn(y_pred, y_batch).item())
            val_acc.append(_batch_accuracy(y_pred, y_batch))

    trainingAccuracy=_mean_or_nan(train_acc)
    validationAccuracy=_mean_or_nan(val_acc)
    trainingLoss=_mean_or_nan(train_loss)
    validationLoss=_mean_or_nan(val_loss)
    metrics.append([trainingAccuracy,trainingLoss,validationAccuracy,validationLoss])
    print("epoch {}/{} ({:.1f}s): train acc {:.4f}, train loss {:.4f}, val acc {:.4f}, val loss {:.4f}".format(
        epoch + 1, n_epochs, time.time() - start_time,
        trainingAccuracy, trainingLoss, validationAccuracy, validationLoss))
print(metrics)
stats = np.array(metrics)
np.savez("trainingstats.npz", stats)


In [None]:
# Serialize the whole trained model object to PATH.
# NOTE(review): PATH is not defined in any cell shown here -- define it before
# running this cell, otherwise it raises NameError.
torch.save(model,PATH)