In [None]:
!pip install transformers
!pip install captum

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
import torchtext
from torchtext.vocab import GloVe
from torchtext.vocab import vocab
from torch.utils.data import random_split
import torch.nn.functional as F
from collections import Counter, OrderedDict
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import re
from captum.attr import IntegratedGradients, LayerIntegratedGradients

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: ", device)

Device:  cuda


## Load the Data

In [None]:
fake_news_df = pd.read_csv('Fake.csv', error_bad_lines=False)
fake_news_df = fake_news_df.drop(['subject', 'date'], axis=1)
fake_news_df['label'] = 0



  fake_news_df = pd.read_csv('Fake.csv', error_bad_lines=False)


In [None]:
real_news_df = pd.read_csv('True.csv', error_bad_lines=False)
real_news_df = real_news_df.drop(['subject', 'date'], axis=1)
real_news_df['label'] = 1



  real_news_df = pd.read_csv('True.csv', error_bad_lines=False)


In [None]:
## For Fake.csv vs. True.csv
# all_data_df = pd.concat([fake_news_df, real_news_df], axis=0)
## For news_articles.csv
all_data_df = pd.read_csv('news_articles.csv', error_bad_lines=False)
all_data_df = all_data_df.drop(['author', 'published', 'language', 'site_url', 'main_img_url', 'type', 'title_without_stopwords', 'text_without_stopwords', 'hasImage'], axis=1)
all_data_df.dropna(axis=0, how='any', inplace=True)
all_data_df = all_data_df.replace({'Real': 1, 'Fake': 0})
non_string_rows = all_data_df['text'].apply(lambda x: not isinstance(x, str))
all_data_df = all_data_df.drop(index=all_data_df[non_string_rows].index)

## For both
fake_count = (all_data_df['label'] == 0).sum()
real_count = (all_data_df['label'] == 1).sum()
print("Fake articles: ", fake_count)
print("Real articles: ", real_count)
all_data_df

Fake articles:  1292
Real articles:  756




  all_data_df = pd.read_csv('news_articles.csv', error_bad_lines=False)


Unnamed: 0,title,text,label
0,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,1
1,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,1
2,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,1
3,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,1
4,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,1
...,...,...,...
2045,trump vs clinton a risk vs a disaster,check out hillarythemed haunted house anticlin...,1
2046,gingrich slutshames megyn kelly,good samaritan wearing indian headdress disarm...,1
2047,youtube bans clintons black son,skype sex scam a fortune built on shame moroc...,1
2048,wikileaks bombshells on hillary you need to know,posted by eddie while the skyhigh potency may ...,1


## Build the Vocab

In [None]:
def clean_text(text):
    # This regular expression pattern will match the unnecessary text at the beginning of the text column
    pattern = r'^\s*[\w/,]*\s*\((Reuters|AP|AFP|CNN)\)\s*-\s*'
    cleaned_text = re.sub(pattern, '', text)
    # This regular expression pattern will match every instance of "Reuters" in the text, regardless of capitalization
    pattern = re.compile(r'reuters', re.IGNORECASE)
    cleaned_text = re.sub(pattern, '', cleaned_text)
    return cleaned_text

In [None]:
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
all_data_df['title'] = all_data_df['title'].apply(lambda x: clean_text(x))
all_data_df['text'] = all_data_df['text'].apply(lambda x: clean_text(x))
all_data_df['title_tokens'] = all_data_df['title'].apply(tokenizer)
all_data_df['text_tokens'] = all_data_df['text'].apply(tokenizer)

In [None]:
tokenized_data = all_data_df['title_tokens'].tolist() + all_data_df['text_tokens'].tolist()
token_counter = Counter(token for article_tokens in tokenized_data for token in article_tokens)
sorted_by_freq_tuples = sorted(token_counter.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)
built_vocab = vocab(ordered_dict)
word_to_index = built_vocab.get_stoi()
index_to_word = built_vocab.get_itos()
print("Vocab Size: ", len(built_vocab))

Vocab Size:  47740


In [None]:
all_data_df['title_token_indices'] = all_data_df['title_tokens'].apply(lambda tokens: torch.LongTensor([word_to_index[token] for token in tokens]).to(device))
all_data_df['text_token_indices'] = all_data_df['text_tokens'].apply(lambda tokens: torch.LongTensor([word_to_index[token] for token in tokens]).to(device))
all_data_df

Unnamed: 0,title,text,label,title_tokens,text_tokens,title_token_indices,text_token_indices
0,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,1,"[muslims, busted, they, stole, millions, in, g...","[print, they, should, pay, all, the, back, all...","[tensor(486, device='cuda:0'), tensor(5488, de...","[tensor(1090, device='cuda:0'), tensor(23, dev..."
1,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,1,"[re, why, did, attorney, general, loretta, lyn...","[why, did, attorney, general, loretta, lynch, ...","[tensor(3391, device='cuda:0'), tensor(136, de...","[tensor(136, device='cuda:0'), tensor(130, dev..."
2,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,1,"[breaking, weiner, cooperating, with, fbi, on,...","[red, state, fox, news, sunday, reported, this...","[tensor(586, device='cuda:0'), tensor(1624, de...","[tensor(680, device='cuda:0'), tensor(85, devi..."
3,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,1,"[pin, drop, speech, by, father, of, daughter, ...","[email, kayla, mueller, was, a, prisoner, and,...","[tensor(9858, device='cuda:0'), tensor(2929, d...","[tensor(115, device='cuda:0'), tensor(26908, d..."
4,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,1,"[fantastic, trumps, point, plan, to, reform, h...","[email, healthcare, reform, to, make, america,...","[tensor(5233, device='cuda:0'), tensor(169, de...","[tensor(115, device='cuda:0'), tensor(2157, de..."
...,...,...,...,...,...,...,...
2045,trump vs clinton a risk vs a disaster,check out hillarythemed haunted house anticlin...,1,"[trump, vs, clinton, a, risk, vs, a, disaster]","[check, out, hillarythemed, haunted, house, an...","[tensor(28, device='cuda:0'), tensor(2814, dev...","[tensor(1012, device='cuda:0'), tensor(56, dev..."
2046,gingrich slutshames megyn kelly,good samaritan wearing indian headdress disarm...,1,"[gingrich, slutshames, megyn, kelly]","[good, samaritan, wearing, indian, headdress, ...","[tensor(2387, device='cuda:0'), tensor(13929, ...","[tensor(165, device='cuda:0'), tensor(47704, d..."
2047,youtube bans clintons black son,skype sex scam a fortune built on shame moroc...,1,"[youtube, bans, clintons, black, son]","[skype, sex, scam, a, fortune, built, on, sham...","[tensor(819, device='cuda:0'), tensor(5998, de...","[tensor(47715, device='cuda:0'), tensor(581, d..."
2048,wikileaks bombshells on hillary you need to know,posted by eddie while the skyhigh potency may ...,1,"[wikileaks, bombshells, on, hillary, you, need...","[posted, by, eddie, while, the, skyhigh, poten...","[tensor(337, device='cuda:0'), tensor(13930, d...","[tensor(644, device='cuda:0'), tensor(15, devi..."


## Build the Dataset

In [None]:
class NewsDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        raw_title = self.data.iloc[index]['title']
        raw_text = self.data.iloc[index]['text']
        title_tokens = self.data.iloc[index]['title_token_indices']
        text_tokens = self.data.iloc[index]['text_token_indices']
        label = self.data.iloc[index]['label']
        return raw_title, title_tokens, raw_text, text_tokens, label

In [None]:
class CollateBatch:
    def __init__(self, padding_idx=0):
        self.padding_idx = padding_idx

    def __call__(self, batch):
        raw_title, title_tokens, raw_text, text_tokens, label = zip(*batch)
        padded_title_tokens = torch.nn.utils.rnn.pad_sequence(title_tokens, batch_first=True, padding_value=self.padding_idx).to(device)
        padded_text_tokens = torch.nn.utils.rnn.pad_sequence(text_tokens, batch_first=True, padding_value=self.padding_idx).to(device)
        label = torch.LongTensor(label)
        return raw_title, padded_title_tokens, raw_text, padded_text_tokens, label

In [None]:
dataset = NewsDataset(all_data_df)
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

## Setup Model Training and Evaluation

In [None]:
batch_size = 32
collate_fn = CollateBatch()
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [None]:
def eval_model(model, data, loss_fn, use_raw_data=False):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    one_predictions = 0
    zero_predictions = 0

    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(data), total=len(data)):
            if use_raw_data:
              titles, _, text, _, labels = batch
            else:
              _, titles, _, text, labels = batch
              titles = titles.to(device)
              text = text.to(device)
            labels = labels.to(device)

            outs = model.forward(title=titles, text=text).to(device)
            predictions = torch.argmax(outs, dim=-1).to(device)
            one_predictions += torch.sum(predictions == 1)
            zero_predictions += torch.sum(predictions == 0)
            loss = loss_fn(outs, labels)
            total_loss += loss.item()
            correct_predictions += torch.sum(predictions == labels)

    print("One: ", one_predictions)
    print("Zero: ", zero_predictions)
    epoch_loss = total_loss / len(data)
    accuracy = correct_predictions / len(data.dataset)
    return epoch_loss, accuracy

def train_epoch(model, data, loss_fn, optimizer, use_raw_data=False):
    total_loss = 0
    correct_predictions = 0
    model.train()
    for batch_idx, batch in tqdm(enumerate(data), total=len(data)):
        if use_raw_data:
          titles, _, text, _, labels = batch
        else:
          _, titles, _, text, labels = batch
          titles = titles.to(device)
          text = text.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outs = model.forward(title=titles, text=text).to(device)
        predictions = torch.argmax(outs, dim=-1).to(device)
        loss = loss_fn(outs, labels)
        total_loss += loss.item()
        correct_predictions += torch.sum(predictions == labels)
        loss.backward()
        optimizer.step()

    epoch_loss = total_loss / len(data)
    accuracy = correct_predictions / len(data.dataset)
    return epoch_loss, accuracy

def train_model(model, num_epochs, train_data, val_data, loss_fn, optimizer, use_raw_data=False):
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss, train_acc = train_epoch(model, train_data, loss_fn, optimizer, use_raw_data)
        print(f"Train loss: {train_loss:.4f}, Train accuracy: {train_acc:.4f}")

        val_loss, val_acc = eval_model(model, val_data, loss_fn, use_raw_data)
        print(f"Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}")

In [None]:
def evaluate_model_features(model, path_to_saved_model_weights, use_raw_data=False):
    model.load_state_dict(torch.load(path_to_saved_model_weights))
    ig = LayerIntegratedGradients(model, model.embedding_layer)
    score_dict = dict()
    for batch_idx, batch in tqdm(enumerate(test_loader), total=len(test_loader)):
      if use_raw_data:
        return None, None
      else:
        _, title_indices, _, text_indices, _ = batch
        title_indices = title_indices.to(device)
        text_indices = text_indices.to(device)
      input_indices = (title_indices, text_indices)
      attr_scores = ig.attribute(inputs=input_indices, target=1, internal_batch_size=32)
      word_scores = torch.abs(attr_scores.sum(axis=2))
      word_scores /= word_scores.sum()
      word_scores_list = word_scores.tolist()[0]
      input_text_list = all_data_df.iloc[0]['text_tokens']
      attribution_dict = dict(zip(input_text_list, word_scores_list))
      for word, score in attribution_dict.items():
        if word not in score_dict:
          score_dict[word] = 0
        score_dict[word] += score
    for word, score in score_dict.items():
      score_dict[word] = score_dict[word] / len(test_loader.dataset)
    top_true_features = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    top_true_features = top_true_features[:10]
    top_false_features = sorted(score_dict.items(), key=lambda x: x[1], reverse=False)
    top_false_features = top_false_features[:10]
    return top_true_features, top_false_features

## Build, Train, and Test NewsPretrainedBertModel

In [None]:
class NewsPretrainedBertModel(nn.Module):
    def __init__(self):
        super(NewsPretrainedBertModel, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
        self.max_len = 512
        self.bert = BertModel.from_pretrained("prajjwal1/bert-tiny")
        for param in self.bert.embeddings.parameters():
            param.requires_grad = False
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, title, text):
        title_and_text = [a + ' ' + b for a, b in zip(title, text)]
        encoding = self.tokenizer.batch_encode_plus(
            title_and_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)
        embeddings = self.bert(input_ids=input_ids, attention_mask=attention_mask)[1].to(device)
        output = self.dropout(embeddings).to(device)
        output = self.linear(output).to(device)
        return output

In [None]:
model = NewsPretrainedBertModel().to(device)
print(model)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
train_model(model, 18, train_loader, val_loader, loss_fn, optimizer, use_raw_data=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

NewsPretrainedBertModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, element

100%|██████████| 45/45 [00:26<00:00,  1.68it/s]


Train loss: 0.6678, Train accuracy: 0.5897


100%|██████████| 10/10 [00:03<00:00,  2.61it/s]


One:  tensor(137, device='cuda:0')
Zero:  tensor(170, device='cuda:0')
Validation loss: 0.6261, Validation accuracy: 0.5896
Epoch 2/18


100%|██████████| 45/45 [00:19<00:00,  2.36it/s]


Train loss: 0.6332, Train accuracy: 0.6183


100%|██████████| 10/10 [00:05<00:00,  1.75it/s]


One:  tensor(205, device='cuda:0')
Zero:  tensor(102, device='cuda:0')
Validation loss: 0.6457, Validation accuracy: 0.5244
Epoch 3/18


100%|██████████| 45/45 [00:19<00:00,  2.34it/s]


Train loss: 0.5875, Train accuracy: 0.6664


100%|██████████| 10/10 [00:04<00:00,  2.07it/s]


One:  tensor(185, device='cuda:0')
Zero:  tensor(122, device='cuda:0')
Validation loss: 0.6365, Validation accuracy: 0.5831
Epoch 4/18


100%|██████████| 45/45 [00:23<00:00,  1.94it/s]


Train loss: 0.5622, Train accuracy: 0.6895


100%|██████████| 10/10 [00:04<00:00,  2.11it/s]


One:  tensor(137, device='cuda:0')
Zero:  tensor(170, device='cuda:0')
Validation loss: 0.6082, Validation accuracy: 0.6091
Epoch 5/18


100%|██████████| 45/45 [00:19<00:00,  2.26it/s]


Train loss: 0.5324, Train accuracy: 0.7097


100%|██████████| 10/10 [00:03<00:00,  2.59it/s]


One:  tensor(148, device='cuda:0')
Zero:  tensor(159, device='cuda:0')
Validation loss: 0.6126, Validation accuracy: 0.6319
Epoch 6/18


100%|██████████| 45/45 [00:21<00:00,  2.07it/s]


Train loss: 0.4877, Train accuracy: 0.7676


100%|██████████| 10/10 [00:03<00:00,  2.59it/s]


One:  tensor(125, device='cuda:0')
Zero:  tensor(182, device='cuda:0')
Validation loss: 0.6087, Validation accuracy: 0.6743
Epoch 7/18


100%|██████████| 45/45 [00:19<00:00,  2.34it/s]


Train loss: 0.4557, Train accuracy: 0.7760


100%|██████████| 10/10 [00:05<00:00,  1.79it/s]


One:  tensor(178, device='cuda:0')
Zero:  tensor(129, device='cuda:0')
Validation loss: 0.6691, Validation accuracy: 0.6059
Epoch 8/18


100%|██████████| 45/45 [00:18<00:00,  2.37it/s]


Train loss: 0.4488, Train accuracy: 0.7858


100%|██████████| 10/10 [00:04<00:00,  2.28it/s]


One:  tensor(112, device='cuda:0')
Zero:  tensor(195, device='cuda:0')
Validation loss: 0.6382, Validation accuracy: 0.6840
Epoch 9/18


100%|██████████| 45/45 [00:20<00:00,  2.20it/s]


Train loss: 0.4148, Train accuracy: 0.8158


100%|██████████| 10/10 [00:03<00:00,  2.56it/s]


One:  tensor(129, device='cuda:0')
Zero:  tensor(178, device='cuda:0')
Validation loss: 0.6352, Validation accuracy: 0.6547
Epoch 10/18


100%|██████████| 45/45 [00:20<00:00,  2.17it/s]


Train loss: 0.3711, Train accuracy: 0.8283


100%|██████████| 10/10 [00:03<00:00,  2.60it/s]


One:  tensor(99, device='cuda:0')
Zero:  tensor(208, device='cuda:0')
Validation loss: 0.6722, Validation accuracy: 0.7134
Epoch 11/18


100%|██████████| 45/45 [00:19<00:00,  2.35it/s]


Train loss: 0.3614, Train accuracy: 0.8367


100%|██████████| 10/10 [00:05<00:00,  1.77it/s]


One:  tensor(132, device='cuda:0')
Zero:  tensor(175, device='cuda:0')
Validation loss: 0.6321, Validation accuracy: 0.6971
Epoch 12/18


100%|██████████| 45/45 [00:19<00:00,  2.26it/s]


Train loss: 0.3503, Train accuracy: 0.8346


100%|██████████| 10/10 [00:04<00:00,  2.43it/s]


One:  tensor(71, device='cuda:0')
Zero:  tensor(236, device='cuda:0')
Validation loss: 0.6494, Validation accuracy: 0.7264
Epoch 13/18


100%|██████████| 45/45 [00:20<00:00,  2.18it/s]


Train loss: 0.2913, Train accuracy: 0.8744


100%|██████████| 10/10 [00:03<00:00,  2.59it/s]


One:  tensor(122, device='cuda:0')
Zero:  tensor(185, device='cuda:0')
Validation loss: 0.7862, Validation accuracy: 0.6971
Epoch 14/18


100%|██████████| 45/45 [00:20<00:00,  2.17it/s]


Train loss: 0.2800, Train accuracy: 0.8814


100%|██████████| 10/10 [00:03<00:00,  2.59it/s]


One:  tensor(100, device='cuda:0')
Zero:  tensor(207, device='cuda:0')
Validation loss: 0.7558, Validation accuracy: 0.7231
Epoch 15/18


100%|██████████| 45/45 [00:19<00:00,  2.37it/s]


Train loss: 0.2478, Train accuracy: 0.8939


100%|██████████| 10/10 [00:05<00:00,  1.75it/s]


One:  tensor(120, device='cuda:0')
Zero:  tensor(187, device='cuda:0')
Validation loss: 0.8024, Validation accuracy: 0.7231
Epoch 16/18


100%|██████████| 45/45 [00:21<00:00,  2.11it/s]


Train loss: 0.2412, Train accuracy: 0.8981


100%|██████████| 10/10 [00:05<00:00,  1.76it/s]


One:  tensor(84, device='cuda:0')
Zero:  tensor(223, device='cuda:0')
Validation loss: 0.7639, Validation accuracy: 0.7101
Epoch 17/18


100%|██████████| 45/45 [00:18<00:00,  2.37it/s]


Train loss: 0.2200, Train accuracy: 0.9093


100%|██████████| 10/10 [00:03<00:00,  2.58it/s]


One:  tensor(102, device='cuda:0')
Zero:  tensor(205, device='cuda:0')
Validation loss: 0.8101, Validation accuracy: 0.7296
Epoch 18/18


100%|██████████| 45/45 [00:20<00:00,  2.16it/s]


Train loss: 0.2174, Train accuracy: 0.9142


100%|██████████| 10/10 [00:04<00:00,  2.19it/s]

One:  tensor(94, device='cuda:0')
Zero:  tensor(213, device='cuda:0')
Validation loss: 0.7328, Validation accuracy: 0.7166





In [None]:
test_loss, test_acc = eval_model(model, test_loader, loss_fn, use_raw_data=True)
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

100%|██████████| 10/10 [00:05<00:00,  2.00it/s]

One:  tensor(100, device='cuda:0')
Zero:  tensor(208, device='cuda:0')
Test loss: 0.6674, Test accuracy: 0.7045





## Build, Train, and Test NewsCNNModel

In [None]:
class NewsCNNModel(nn.Module):
    def __init__(self, num_filters, filter_sizes, dropout, output_dim=2, embedding_dim=300):
        super(NewsCNNModel, self).__init__()
        vocab_size = len(built_vocab)
        embedding_weights = self.get_embedding_weights(vocab_size, embedding_dim)
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.embedding_layer.weight.data.copy_(embedding_weights)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_dim))
            for filter_size in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.prediction_layer = nn.Linear(num_filters * len(filter_sizes), output_dim)

    def get_embedding_weights(self, vocab_size, embedding_dim):
        glove_embeddings = GloVe(name='6B', dim=embedding_dim)
        embedding_weights = []
        for i in range(vocab_size):
            word = index_to_word[i]
            if word in glove_embeddings.stoi:
                embedding_weights.append(glove_embeddings.vectors[glove_embeddings.stoi[word]])
            else:
                embedding_weights.append(torch.randn(embedding_dim))
        embedding_weights = torch.stack(embedding_weights)
        return embedding_weights

    def forward(self, title, text):
        title_embedded = self.embedding_layer(title.long())
        text_embedded = self.embedding_layer(text.long())
        embedding = torch.cat((title_embedded, text_embedded), dim=1)
        embedding = embedding.unsqueeze(1)
        conv_results = [F.relu(conv(embedding)).squeeze(3) for conv in self.convs]
        pooled_results = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conv_results]
        cat = torch.cat(pooled_results, dim=-1)
        out = self.dropout(cat)
        out = self.prediction_layer(cat)
        return out


In [None]:
NUM_FILTERS = 10
FILTER_SIZES = [3, 4, 5]
DROPOUT = 0.6
model = NewsCNNModel(num_filters=NUM_FILTERS, filter_sizes=FILTER_SIZES, dropout=DROPOUT, embedding_dim=100).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
train_model(model, 10, train_loader, val_loader, loss_fn, optimizer)

.vector_cache/glove.6B.zip: 862MB [02:50, 5.07MB/s]                           
100%|█████████▉| 399999/400000 [00:20<00:00, 19505.78it/s]


Epoch 1/10


100%|██████████| 45/45 [00:07<00:00,  5.74it/s]


Train loss: 0.6822, Train accuracy: 0.5736


100%|██████████| 10/10 [00:00<00:00, 43.78it/s]


One:  tensor(14, device='cuda:0')
Zero:  tensor(293, device='cuda:0')
Validation loss: 0.6468, Validation accuracy: 0.6645
Epoch 2/10


100%|██████████| 45/45 [00:03<00:00, 14.34it/s]


Train loss: 0.6097, Train accuracy: 0.6455


100%|██████████| 10/10 [00:00<00:00, 40.71it/s]


One:  tensor(12, device='cuda:0')
Zero:  tensor(295, device='cuda:0')
Validation loss: 0.6288, Validation accuracy: 0.6840
Epoch 3/10


100%|██████████| 45/45 [00:03<00:00, 13.25it/s]


Train loss: 0.5613, Train accuracy: 0.6930


100%|██████████| 10/10 [00:00<00:00, 30.00it/s]


One:  tensor(34, device='cuda:0')
Zero:  tensor(273, device='cuda:0')
Validation loss: 0.6144, Validation accuracy: 0.7101
Epoch 4/10


100%|██████████| 45/45 [00:03<00:00, 13.56it/s]


Train loss: 0.5100, Train accuracy: 0.7886


100%|██████████| 10/10 [00:00<00:00, 45.55it/s]


One:  tensor(52, device='cuda:0')
Zero:  tensor(255, device='cuda:0')
Validation loss: 0.5988, Validation accuracy: 0.7036
Epoch 5/10


100%|██████████| 45/45 [00:03<00:00, 14.33it/s]


Train loss: 0.4522, Train accuracy: 0.8646


100%|██████████| 10/10 [00:00<00:00, 45.39it/s]


One:  tensor(58, device='cuda:0')
Zero:  tensor(249, device='cuda:0')
Validation loss: 0.5808, Validation accuracy: 0.7101
Epoch 6/10


100%|██████████| 45/45 [00:03<00:00, 14.35it/s]


Train loss: 0.3891, Train accuracy: 0.9323


100%|██████████| 10/10 [00:00<00:00, 46.33it/s]


One:  tensor(62, device='cuda:0')
Zero:  tensor(245, device='cuda:0')
Validation loss: 0.5603, Validation accuracy: 0.7166
Epoch 7/10


100%|██████████| 45/45 [00:03<00:00, 12.97it/s]


Train loss: 0.3242, Train accuracy: 0.9637


100%|██████████| 10/10 [00:00<00:00, 29.83it/s]


One:  tensor(67, device='cuda:0')
Zero:  tensor(240, device='cuda:0')
Validation loss: 0.5423, Validation accuracy: 0.7134
Epoch 8/10


100%|██████████| 45/45 [00:03<00:00, 13.68it/s]


Train loss: 0.2620, Train accuracy: 0.9798


100%|██████████| 10/10 [00:00<00:00, 43.10it/s]


One:  tensor(70, device='cuda:0')
Zero:  tensor(237, device='cuda:0')
Validation loss: 0.5273, Validation accuracy: 0.7296
Epoch 9/10


100%|██████████| 45/45 [00:03<00:00, 14.07it/s]


Train loss: 0.2073, Train accuracy: 0.9902


100%|██████████| 10/10 [00:00<00:00, 42.31it/s]


One:  tensor(73, device='cuda:0')
Zero:  tensor(234, device='cuda:0')
Validation loss: 0.5181, Validation accuracy: 0.7329
Epoch 10/10


100%|██████████| 45/45 [00:03<00:00, 14.20it/s]


Train loss: 0.1617, Train accuracy: 0.9944


100%|██████████| 10/10 [00:00<00:00, 44.03it/s]

One:  tensor(74, device='cuda:0')
Zero:  tensor(233, device='cuda:0')
Validation loss: 0.5113, Validation accuracy: 0.7362





In [None]:
test_loss, test_acc = eval_model(model, test_loader, loss_fn)
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

100%|██████████| 10/10 [00:00<00:00, 44.67it/s]

One:  tensor(83, device='cuda:0')
Zero:  tensor(225, device='cuda:0')
Test loss: 0.4602, Test accuracy: 0.7922





In [None]:
torch.save(model.state_dict(), 'news_cnn_model.pth')
true_features, false_features = evaluate_model_features(model, 'news_cnn_model.pth')
print("\nTop True Features: ")
for word, score in true_features:
  print(str(word) + ': ' + str(score))

print("Top False Features: ")
for word, score in false_features:
  print(str(word) + ': ' + str(score))

100%|██████████| 10/10 [00:16<00:00,  1.68s/it]


Top True Features: 
stole: 4.527533589262679e-05
government: 1.7380693690668997e-05
from: 1.0525383772364673e-05
pay: 1.0035579295704103e-05
print: 7.3820487760564434e-06
four: 6.769105839194586e-06
did: 6.70608717099143e-06
one: 6.01180432688282e-06
cases: 5.5986239152507375e-06
why: 5.422659492654506e-06
Top False Features: 
entire: 0.0
to: 0.0
group: 0.0
everyone: 9.718516963871176e-08
asap: 1.1012509919290378e-07
family: 1.136343130579054e-07
bust: 1.334110453013289e-07
who: 2.016055994087126e-07
commit: 2.067044073150492e-07
weve: 3.2405505470427127e-07





## Build, Train, and Test NewsLSTMModel

In [None]:
class NewsLSTMModel(nn.Module):
    def __init__(self, hidden_dim, dropout, embedding_dim=300, output_dim=2):
        super(NewsLSTMModel, self).__init__()
        vocab_size = len(built_vocab)
        embedding_weights = self.get_embedding_weights(vocab_size, embedding_dim)
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.embedding_layer.weight.data.copy_(embedding_weights)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim * 2, output_dim)

    def get_embedding_weights(self, vocab_size, embedding_dim):
        glove_embeddings = GloVe(name='6B', dim=embedding_dim)
        embedding_weights = []
        for i in range(vocab_size):
            word = index_to_word[i]
            if word in glove_embeddings.stoi:
                embedding_weights.append(glove_embeddings.vectors[glove_embeddings.stoi[word]])
            else:
                embedding_weights.append(torch.randn(embedding_dim))
        embedding_weights = torch.stack(embedding_weights)
        return embedding_weights

    def forward(self, title, text):
        title_embedded = self.embedding_layer(title)
        text_embedded = self.embedding_layer(text)
        cat = torch.cat((title_embedded, text_embedded), dim=1)
        outs, _ = self.rnn(cat)
        outs = torch.max(outs, dim=1)[0]
        logits = self.fc1(outs)
        return logits


In [None]:
HIDDEN_DIM = 100
DROPOUT = 0.6
model = NewsLSTMModel(hidden_dim=HIDDEN_DIM, dropout=DROPOUT).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
train_model(model, 15, train_loader, val_loader, loss_fn, optimizer)

100%|█████████▉| 399999/400000 [00:57<00:00, 6908.42it/s]


Epoch 1/15


100%|██████████| 45/45 [00:05<00:00,  7.69it/s]


Train loss: 0.6467, Train accuracy: 0.6169


100%|██████████| 10/10 [00:00<00:00, 22.32it/s]


One:  tensor(26, device='cuda:0')
Zero:  tensor(281, device='cuda:0')
Validation loss: 0.6162, Validation accuracy: 0.7166
Epoch 2/15


100%|██████████| 45/45 [00:04<00:00,  9.76it/s]


Train loss: 0.5123, Train accuracy: 0.7753


100%|██████████| 10/10 [00:00<00:00, 22.17it/s]


One:  tensor(62, device='cuda:0')
Zero:  tensor(245, device='cuda:0')
Validation loss: 0.5573, Validation accuracy: 0.7622
Epoch 3/15


100%|██████████| 45/45 [00:04<00:00,  9.10it/s]


Train loss: 0.3427, Train accuracy: 0.9456


100%|██████████| 10/10 [00:00<00:00, 21.90it/s]


One:  tensor(160, device='cuda:0')
Zero:  tensor(147, device='cuda:0')
Validation loss: 0.5656, Validation accuracy: 0.6775
Epoch 4/15


100%|██████████| 45/45 [00:04<00:00,  9.69it/s]


Train loss: 0.1896, Train accuracy: 0.9846


100%|██████████| 10/10 [00:00<00:00, 22.39it/s]


One:  tensor(144, device='cuda:0')
Zero:  tensor(163, device='cuda:0')
Validation loss: 0.5404, Validation accuracy: 0.6971
Epoch 5/15


100%|██████████| 45/45 [00:04<00:00,  9.48it/s]


Train loss: 0.1010, Train accuracy: 0.9930


100%|██████████| 10/10 [00:00<00:00, 18.12it/s]


One:  tensor(74, device='cuda:0')
Zero:  tensor(233, device='cuda:0')
Validation loss: 0.4672, Validation accuracy: 0.7818
Epoch 6/15


100%|██████████| 45/45 [00:04<00:00,  9.29it/s]


Train loss: 0.0532, Train accuracy: 0.9965


100%|██████████| 10/10 [00:00<00:00, 21.30it/s]


One:  tensor(61, device='cuda:0')
Zero:  tensor(246, device='cuda:0')
Validation loss: 0.4918, Validation accuracy: 0.7785
Epoch 7/15


100%|██████████| 45/45 [00:04<00:00,  9.61it/s]


Train loss: 0.0352, Train accuracy: 0.9979


100%|██████████| 10/10 [00:00<00:00, 21.68it/s]


One:  tensor(49, device='cuda:0')
Zero:  tensor(258, device='cuda:0')
Validation loss: 0.5453, Validation accuracy: 0.7850
Epoch 8/15


100%|██████████| 45/45 [00:05<00:00,  8.80it/s]


Train loss: 0.0321, Train accuracy: 0.9965


100%|██████████| 10/10 [00:00<00:00, 14.73it/s]


One:  tensor(98, device='cuda:0')
Zero:  tensor(209, device='cuda:0')
Validation loss: 0.4917, Validation accuracy: 0.7818
Epoch 9/15


100%|██████████| 45/45 [00:05<00:00,  8.51it/s]


Train loss: 0.0186, Train accuracy: 0.9965


100%|██████████| 10/10 [00:00<00:00, 17.15it/s]


One:  tensor(96, device='cuda:0')
Zero:  tensor(211, device='cuda:0')
Validation loss: 0.5060, Validation accuracy: 0.7687
Epoch 10/15


100%|██████████| 45/45 [00:05<00:00,  8.46it/s]


Train loss: 0.0126, Train accuracy: 0.9965


100%|██████████| 10/10 [00:00<00:00, 16.14it/s]


One:  tensor(112, device='cuda:0')
Zero:  tensor(195, device='cuda:0')
Validation loss: 0.5279, Validation accuracy: 0.7687
Epoch 11/15


100%|██████████| 45/45 [00:05<00:00,  8.22it/s]


Train loss: 0.0111, Train accuracy: 0.9965


100%|██████████| 10/10 [00:00<00:00, 15.23it/s]


One:  tensor(85, device='cuda:0')
Zero:  tensor(222, device='cuda:0')
Validation loss: 0.5194, Validation accuracy: 0.7850
Epoch 12/15


100%|██████████| 45/45 [00:04<00:00,  9.17it/s]


Train loss: 0.0095, Train accuracy: 0.9965


100%|██████████| 10/10 [00:00<00:00, 22.18it/s]


One:  tensor(105, device='cuda:0')
Zero:  tensor(202, device='cuda:0')
Validation loss: 0.5460, Validation accuracy: 0.7785
Epoch 13/15


100%|██████████| 45/45 [00:04<00:00,  9.38it/s]


Train loss: 0.0076, Train accuracy: 0.9965


100%|██████████| 10/10 [00:00<00:00, 18.12it/s]


One:  tensor(81, device='cuda:0')
Zero:  tensor(226, device='cuda:0')
Validation loss: 0.5376, Validation accuracy: 0.7785
Epoch 14/15


100%|██████████| 45/45 [00:04<00:00,  9.02it/s]


Train loss: 0.0063, Train accuracy: 0.9972


100%|██████████| 10/10 [00:00<00:00, 21.76it/s]


One:  tensor(91, device='cuda:0')
Zero:  tensor(216, device='cuda:0')
Validation loss: 0.5714, Validation accuracy: 0.7785
Epoch 15/15


100%|██████████| 45/45 [00:04<00:00,  9.38it/s]


Train loss: 0.0070, Train accuracy: 0.9986


100%|██████████| 10/10 [00:00<00:00, 21.55it/s]

One:  tensor(83, device='cuda:0')
Zero:  tensor(224, device='cuda:0')
Validation loss: 0.5123, Validation accuracy: 0.8046





In [None]:
test_loss, test_acc = eval_model(model, test_loader, loss_fn)
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

100%|██████████| 10/10 [00:00<00:00, 24.01it/s]

One:  tensor(101, device='cuda:0')
Zero:  tensor(207, device='cuda:0')
Test loss: 0.3941, Test accuracy: 0.8377





In [None]:
torch.backends.cudnn.enabled=False
torch.save(model.state_dict(), 'news_lstm_model.pth')
true_features, false_features = evaluate_model_features(model, 'news_lstm_model.pth')
print("\nTop True Features: ")
for word, score in true_features:
  print(str(word) + ': ' + str(score))

print("Top False Features: ")
for word, score in false_features:
  print(str(word) + ': ' + str(score))

  0%|          | 0/10 [01:46<?, ?it/s]


KeyboardInterrupt: ignored

## Build, Train, and Test NewsBertCNNModel

In [None]:
class NewsBertCNNModel(nn.Module):
    def __init__(self, num_filters, filter_sizes, dropout, output_dim=2):
        super(NewsBertCNNModel, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
        self.max_len = 512
        self.bert = BertModel.from_pretrained("prajjwal1/bert-tiny")
        for param in self.bert.embeddings.parameters():
            param.requires_grad = False
        self.dropout = nn.Dropout(dropout)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, self.bert.config.hidden_size))
            for filter_size in filter_sizes
        ])
        self.dropout_second = nn.Dropout(dropout)
        self.prediction_layer = nn.Linear(num_filters * len(filter_sizes), output_dim)

    def forward(self, title, text):
        title_and_text = [a + ' ' + b for a, b in zip(title, text)]
        encoding = self.tokenizer.batch_encode_plus(
            title_and_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)
        text_embedding = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0].to(device)
        text_embedding = self.dropout(text_embedding).to(device)
        text_embedding = text_embedding.unsqueeze(1)
        conv_results = [F.relu(conv(text_embedding)).squeeze(3) for conv in self.convs]
        pooled_results = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conv_results]
        cat = torch.cat(pooled_results, dim=-1)
        out = self.dropout_second(cat)
        out = self.prediction_layer(cat)
        return out

In [None]:
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 2
DROPOUT = 0.5
model = NewsBertCNNModel(NUM_FILTERS, FILTER_SIZES, DROPOUT).to(device)
print(model)
# loss_fn = nn.CrossEntropyLoss().to(device)
# optimizer = optim.Adam(model.parameters(), lr=1e-3)
# train_model(model, 15, train_loader, val_loader, loss_fn, optimizer, use_raw_data=True)

NewsBertCNNModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_af

In [None]:
test_loss, test_acc = eval_model(model, test_loader, loss_fn, use_raw_data=True)
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

100%|██████████| 10/10 [00:04<00:00,  2.36it/s]

One:  tensor(308, device='cuda:0')
Zero:  tensor(0, device='cuda:0')
Test loss: 0.8933, Test accuracy: 0.3604





## Build, Train, and Test NewsBertLSTMModel

In [None]:
from unicodedata import bidirectional
class NewsBertLSTMModel(nn.Module):
    def __init__(self, hidden_dim, dropout, output_dim=2):
        super(NewsBertLSTMModel, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
        self.max_len = 512
        self.bert = BertModel.from_pretrained("prajjwal1/bert-tiny")
        for param in self.bert.embeddings.parameters():
            param.requires_grad = False
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, batch_first=True, dropout=dropout, bidirectional=True)
        self.prediction_layer = nn.Linear(2 * hidden_dim, output_dim)

    def forward(self, title, text):
        title_and_text = [a + ' ' + b for a, b in zip(title, text)]
        encoding = self.tokenizer.batch_encode_plus(
            title_and_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)
        text_embedding = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0].to(device)
        text_embedding = self.dropout(text_embedding).to(device)
        lstm_output, _ = self.lstm(text_embedding)
        lstm_output = torch.max(lstm_output, dim=1)[0]
        out = self.prediction_layer(lstm_output)
        return out


In [None]:
HIDDEN_DIM = 100
DROPOUT = 0.5
model = NewsBertLSTMModel(hidden_dim=HIDDEN_DIM, dropout=DROPOUT).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
train_model(model, 5, train_loader, val_loader, loss_fn, optimizer, use_raw_data=True)

In [None]:
test_loss, test_acc = eval_model(model, test_loader, loss_fn, use_raw_data=True)
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

## Build, Train, and Test NewsCNNLSTMModel

In [None]:
class NewsCNNLSTMModel(nn.Module):
    def __init__(self, num_filters, filter_sizes, hidden_dim, dropout, output_dim=2, embedding_dim=300):
        super(NewsCNNLSTMModel, self).__init__()
        vocab_size = len(built_vocab)
        embedding_weights = self.get_embedding_weights(vocab_size, embedding_dim)
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.embedding_layer.weight.data.copy_(embedding_weights)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_dim))
            for filter_size in filter_sizes
        ])
        self.lstm = nn.LSTM(input_size=num_filters * len(filter_sizes), hidden_size=hidden_dim, batch_first=True, bidirectional=True, dropout=dropout)
        self.prediction_layer = nn.Linear(2 * hidden_dim, output_dim)

    def get_embedding_weights(self, vocab_size, embedding_dim):
        glove_embeddings = GloVe(name='6B', dim=embedding_dim)
        embedding_weights = []
        for i in range(vocab_size):
            word = index_to_word[i]
            if word in glove_embeddings.stoi:
                embedding_weights.append(glove_embeddings.vectors[glove_embeddings.stoi[word]])
            else:
                embedding_weights.append(torch.randn(embedding_dim))
        embedding_weights = torch.stack(embedding_weights)
        return embedding_weights

    def forward(self, title, text):
        title_embedded = self.embedding_layer(title)
        text_embedded = self.embedding_layer(text)
        embedding = torch.cat((title_embedded, text_embedded), dim=1)
        embedding = embedding.unsqueeze(1)
        conv_results = [F.relu(conv(embedding)).squeeze(3) for conv in self.convs]
        pooled_results = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conv_results]
        cat = torch.cat(pooled_results, dim=-1)
        lstm_out, _ = self.lstm(cat)
        outs = self.prediction_layer(lstm_out)
        return outs

In [None]:
NUM_FILTERS = 10
FILTER_SIZES = [3, 4, 5]
HIDDEN_DIM = 100
DROPOUT = 0.6
model = NewsCNNLSTMModel(num_filters=NUM_FILTERS, filter_sizes=FILTER_SIZES, hidden_dim=HIDDEN_DIM, dropout=DROPOUT).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
train_model(model, 15, train_loader, val_loader, loss_fn, optimizer)

Epoch 1/15


100%|██████████| 45/45 [00:14<00:00,  3.10it/s]


Train loss: 0.6620, Train accuracy: 0.6183


100%|██████████| 10/10 [00:01<00:00,  6.62it/s]


One:  tensor(0, device='cuda:0')
Zero:  tensor(307, device='cuda:0')
Validation loss: 0.6440, Validation accuracy: 0.6775
Epoch 2/15


100%|██████████| 45/45 [00:14<00:00,  3.05it/s]


Train loss: 0.6144, Train accuracy: 0.6448


100%|██████████| 10/10 [00:01<00:00,  6.92it/s]


One:  tensor(23, device='cuda:0')
Zero:  tensor(284, device='cuda:0')
Validation loss: 0.6168, Validation accuracy: 0.6743
Epoch 3/15


100%|██████████| 45/45 [00:14<00:00,  3.21it/s]


Train loss: 0.4285, Train accuracy: 0.8200


100%|██████████| 10/10 [00:01<00:00,  6.82it/s]


One:  tensor(147, device='cuda:0')
Zero:  tensor(160, device='cuda:0')
Validation loss: 0.6562, Validation accuracy: 0.6026
Epoch 4/15


100%|██████████| 45/45 [00:14<00:00,  3.21it/s]


Train loss: 0.1673, Train accuracy: 0.9637


100%|██████████| 10/10 [00:01<00:00,  7.19it/s]


One:  tensor(158, device='cuda:0')
Zero:  tensor(149, device='cuda:0')
Validation loss: 0.7455, Validation accuracy: 0.5928
Epoch 5/15


100%|██████████| 45/45 [00:13<00:00,  3.22it/s]


Train loss: 0.0500, Train accuracy: 0.9923


100%|██████████| 10/10 [00:01<00:00,  7.31it/s]


One:  tensor(97, device='cuda:0')
Zero:  tensor(210, device='cuda:0')
Validation loss: 0.6878, Validation accuracy: 0.7068
Epoch 6/15


100%|██████████| 45/45 [00:13<00:00,  3.22it/s]


Train loss: 0.0145, Train accuracy: 0.9986


100%|██████████| 10/10 [00:01<00:00,  7.33it/s]


One:  tensor(116, device='cuda:0')
Zero:  tensor(191, device='cuda:0')
Validation loss: 0.7484, Validation accuracy: 0.6840
Epoch 7/15


100%|██████████| 45/45 [00:14<00:00,  3.19it/s]


Train loss: 0.0080, Train accuracy: 0.9986


100%|██████████| 10/10 [00:01<00:00,  6.79it/s]


One:  tensor(154, device='cuda:0')
Zero:  tensor(153, device='cuda:0')
Validation loss: 0.8839, Validation accuracy: 0.6124
Epoch 8/15


100%|██████████| 45/45 [00:14<00:00,  3.19it/s]


Train loss: 0.0068, Train accuracy: 0.9993


100%|██████████| 10/10 [00:01<00:00,  7.32it/s]


One:  tensor(108, device='cuda:0')
Zero:  tensor(199, device='cuda:0')
Validation loss: 0.7988, Validation accuracy: 0.7036
Epoch 9/15


100%|██████████| 45/45 [00:14<00:00,  3.20it/s]


Train loss: 0.0062, Train accuracy: 0.9993


100%|██████████| 10/10 [00:01<00:00,  7.17it/s]


One:  tensor(149, device='cuda:0')
Zero:  tensor(158, device='cuda:0')
Validation loss: 0.9343, Validation accuracy: 0.6287
Epoch 10/15


100%|██████████| 45/45 [00:14<00:00,  3.21it/s]


Train loss: 0.0037, Train accuracy: 0.9993


100%|██████████| 10/10 [00:01<00:00,  6.86it/s]


One:  tensor(95, device='cuda:0')
Zero:  tensor(212, device='cuda:0')
Validation loss: 0.8273, Validation accuracy: 0.7199
Epoch 11/15


100%|██████████| 45/45 [00:14<00:00,  3.11it/s]


Train loss: 0.0020, Train accuracy: 1.0000


100%|██████████| 10/10 [00:01<00:00,  6.75it/s]


One:  tensor(85, device='cuda:0')
Zero:  tensor(222, device='cuda:0')
Validation loss: 0.8504, Validation accuracy: 0.7264
Epoch 12/15


100%|██████████| 45/45 [00:14<00:00,  3.02it/s]


Train loss: 0.0014, Train accuracy: 1.0000


100%|██████████| 10/10 [00:01<00:00,  6.74it/s]


One:  tensor(85, device='cuda:0')
Zero:  tensor(222, device='cuda:0')
Validation loss: 0.8680, Validation accuracy: 0.7199
Epoch 13/15


100%|██████████| 45/45 [00:14<00:00,  3.20it/s]


Train loss: 0.0010, Train accuracy: 1.0000


100%|██████████| 10/10 [00:01<00:00,  6.95it/s]


One:  tensor(90, device='cuda:0')
Zero:  tensor(217, device='cuda:0')
Validation loss: 0.8797, Validation accuracy: 0.7166
Epoch 14/15


100%|██████████| 45/45 [00:14<00:00,  3.19it/s]


Train loss: 0.0008, Train accuracy: 1.0000


100%|██████████| 10/10 [00:01<00:00,  7.29it/s]


One:  tensor(93, device='cuda:0')
Zero:  tensor(214, device='cuda:0')
Validation loss: 0.8905, Validation accuracy: 0.7264
Epoch 15/15


100%|██████████| 45/45 [00:14<00:00,  3.20it/s]


Train loss: 0.0007, Train accuracy: 1.0000


100%|██████████| 10/10 [00:01<00:00,  7.28it/s]

One:  tensor(96, device='cuda:0')
Zero:  tensor(211, device='cuda:0')
Validation loss: 0.9004, Validation accuracy: 0.7231





In [None]:
test_loss, test_acc = eval_model(model, test_loader, loss_fn)
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

100%|██████████| 10/10 [00:01<00:00,  7.43it/s]

One:  tensor(108, device='cuda:0')
Zero:  tensor(200, device='cuda:0')
Test loss: 0.6654, Test accuracy: 0.7565





In [None]:
torch.backends.cudnn.enabled=False
torch.save(model.state_dict(), 'news_cnn_lstm_model.pth')
true_features, false_features = evaluate_model_features(model, 'news_cnn_lstm_model.pth')
print("\nTop True Features: ")
for word, score in true_features:
  print(str(word) + ': ' + str(score))

print("Top False Features: ")
for word, score in false_features:
  print(str(word) + ': ' + str(score))

100%|██████████| 10/10 [01:53<00:00, 11.36s/it]


Top True Features: 
print: 3.3059271474049745e-05
pay: 1.3497917818197894e-05
they: 1.0831680625378735e-05
should: 1.0275291545175875e-05
everyone: 9.781218068055504e-06
control: 9.519836190425666e-06
did: 8.554549664184063e-06
entire: 8.281229717616209e-06
stole: 7.3423176914480846e-06
family: 7.342038291567793e-06
Top False Features: 
the: 0.0
to: 0.0
years: 0.0
another: 0.0
somalis: 0.0
this: 0.0
one: 0.0
where: 0.0
taxpayers: 7.136245616094858e-08
again: 1.1137720924149857e-07





## Build, Train, and Test NewsBertCNNLSTMModel

In [None]:
class NewsBertCNNLSTMModel(nn.Module):
    def __init__(self, num_filters, filter_sizes, hidden_dim, dropout, output_dim=2):
        super(NewsBertCNNLSTMModel, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
        self.max_len = 512
        self.bert = BertModel.from_pretrained("prajjwal1/bert-tiny")
        for param in self.bert.embeddings.parameters():
            param.requires_grad = False
        self.dropout = nn.Dropout(dropout)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, self.bert.config.hidden_size))
            for filter_size in filter_sizes
        ])
        self.lstm = nn.LSTM(input_size=num_filters * len(filter_sizes), hidden_size=hidden_dim, batch_first=True, bidirectional=True, dropout=dropout)
        self.prediction_layer = nn.Linear(2 * hidden_dim, output_dim)

    def forward(self, title, text):
        title_and_text = [a + ' ' + b for a, b in zip(title, text)]
        encoding = self.tokenizer.batch_encode_plus(
            title_and_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)
        text_embedding = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0].to(device)
        text_embedding = self.dropout(text_embedding).to(device)
        text_embedding = text_embedding.unsqueeze(1)
        conv_results = [F.relu(conv(text_embedding)).squeeze(3) for conv in self.convs]
        pooled_results = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conv_results]
        cat = torch.cat(pooled_results, dim=-1)
        lstm_out, _ = self.lstm(cat)
        outs = self.prediction_layer(lstm_out)
        return outs

In [None]:
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
HIDDEN_DIM = 100
DROPOUT = 0.5
model = NewsBertCNNLSTMModel(num_filters=NUM_FILTERS, filter_sizes=FILTER_SIZES, hidden_dim=HIDDEN_DIM, dropout=0.5).to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)
train_model(model, 5, train_loader, val_loader, loss_fn, optimizer, use_raw_data=True)

In [None]:
test_loss, test_acc = eval_model(model, test_loader, loss_fn, use_raw_data=True)
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

##Linear Model

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

# Create vectorizer and fit it to the entire dataset
vectorizer = CountVectorizer()
# all_data_df['title'] + ' ' +
features = vectorizer.fit_transform(all_data_df['text'])

train_size = int(0.8 * len(all_data_df))
train_features, train_labels = features[:train_size], all_data_df['label'][:train_size]
test_features, test_labels = features[train_size:], all_data_df['label'][train_size:]

# Fit logistic regression model to the training data
clf = LogisticRegression(max_iter=1000)
clf.fit(train_features, train_labels)

# Evaluate the model on the test data
pred_labels = clf.predict(test_features)
accuracy = accuracy_score(test_labels, pred_labels)
print(f"Accuracy: {accuracy}")

In [None]:
# Get feature names and coefficients
feature_names = np.array(vectorizer.get_feature_names_out())
coefficients = clf.coef_[0]

# Get top feature indices and their coefficients for title
top_indices = coefficients.argsort()[-20:][::-1]
top_features = feature_names[top_indices]
top_coefficients = coefficients[top_indices]

print("Top Features:")
i = 0
for feature_name in top_features:
  coefficient = top_coefficients[i]
  print(f"{feature_name}: {coefficient}")
  i += 1

In [None]:
# Get bottom feature indices and their coefficients for title
bottom_indices = coefficients.argsort()[:20]
bottom_features = feature_names[bottom_indices]
bottom_coefficients = coefficients[bottom_indices]
print("Bottom Features:")
i = 0
for feature_name in bottom_features:
  coefficient = bottom_coefficients[i]
  print(f"{feature_name}: {coefficient}")
  i += 1