### Fake News Detections

[A Survey on Natural Language Processing for Fake News Detection](https://aclanthology.org/2020.lrec-1.747.pdf)

In [283]:
import pandas as pd
import re
from sklearn.metrics import classification_report

In [403]:
def preprocess(text: str) -> str:
    text = text.lower()
    return re.sub(r"[^\w\s]", "", text)

def get_df(fiepath: str, label_map={ "true": 1, "false": 0 }) -> pd.DataFrame:
    df = pd.read_csv(fiepath, sep='\t', header=None)
    data = pd.DataFrame()
    df = df[df[1].isin(label_map.keys())]
    data["text"] = df[2].apply(preprocess)
    data["label"] = df[1].apply(lambda l: label_map[l])
    return data

train_df = get_df("data/liar_train.tsv")
test_df = get_df("data/liar_test.tsv")

train_df.head()

Unnamed: 0,text,label
0,says the annies list political group supports ...,0
3,health care reform legislation is likely to ma...,0
5,the chicago bears have had more starting quart...,1
12,when mitt romney was governor of massachusetts...,0
16,mccain opposed a requirement that the governme...,1


In [404]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
x_train_features = vectorizer.fit_transform(train_df["text"])
x_test_features = vectorizer.transform(test_df["text"])

In [405]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train_features,  train_df["label"])

train_pred = lr.predict(x_train_features)
test_pred = lr.predict(x_test_features)

print("Train Scores:")
print(classification_report(train_df["label"], train_pred, target_names=["true", "false"]))
print("Test Scores:")
print(classification_report(test_df["label"], test_pred, target_names=["true", "false"]))

Train Scores:
              precision    recall  f1-score   support

        true       0.82      0.90      0.86      1995
       false       0.86      0.76      0.81      1676

    accuracy                           0.84      3671
   macro avg       0.84      0.83      0.83      3671
weighted avg       0.84      0.84      0.83      3671

Test Scores:
              precision    recall  f1-score   support

        true       0.64      0.72      0.68       249
       false       0.61      0.52      0.56       208

    accuracy                           0.63       457
   macro avg       0.62      0.62      0.62       457
weighted avg       0.63      0.63      0.62       457



In [406]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(x_train_features,  train_df["label"])

train_pred = rfc.predict(x_train_features)
test_pred = rfc.predict(x_test_features)

print("Train Scores:")
print(classification_report(train_df["label"], train_pred, target_names=["true", "false"]))
print("Test Scores:")
print(classification_report(test_df["label"], test_pred, target_names=["true", "false"]))

Train Scores:
              precision    recall  f1-score   support

        true       1.00      1.00      1.00      1995
       false       1.00      1.00      1.00      1676

    accuracy                           1.00      3671
   macro avg       1.00      1.00      1.00      3671
weighted avg       1.00      1.00      1.00      3671

Test Scores:
              precision    recall  f1-score   support

        true       0.62      0.78      0.69       249
       false       0.61      0.42      0.50       208

    accuracy                           0.61       457
   macro avg       0.61      0.60      0.59       457
weighted avg       0.61      0.61      0.60       457



In [408]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
x_train_features = count_vectorizer.fit_transform(train_df["text"])
x_test_features = count_vectorizer.transform(test_df["text"])

In [410]:
from sklearn.naive_bayes import GaussianNB

lr = GaussianNB()
lr.fit(x_train_features.toarray(),  train_df["label"])

train_pred = lr.predict(x_train_features.toarray())
test_pred = lr.predict(x_test_features.toarray())

print("Train Scores:")
print(classification_report(train_df["label"], train_pred,target_names=["true", "false"]))
print("Test Scores:")
print(classification_report(test_df["label"], test_pred, target_names=["true", "false"]))

Train Scores:
              precision    recall  f1-score   support

        true       1.00      0.78      0.88      1995
       false       0.79      1.00      0.88      1676

    accuracy                           0.88      3671
   macro avg       0.90      0.89      0.88      3671
weighted avg       0.90      0.88      0.88      3671

Test Scores:
              precision    recall  f1-score   support

        true       0.63      0.40      0.49       249
       false       0.50      0.73      0.59       208

    accuracy                           0.55       457
   macro avg       0.57      0.56      0.54       457
weighted avg       0.57      0.55      0.54       457



### CNN for fake news detection

[“Liar, Liar Pants on Fire”: A New Benchmark Dataset for Fake News Detection](https://arxiv.org/pdf/1705.00648)

[Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882)

In [447]:
import torch
import torch.nn as nn
import nltk 
from nltk.tokenize import word_tokenize

nltk.download("punkt_tab")

multi_class = { "true": 0, "mostly-true": 1, "half-true": 2, "barely-true": 3, "false": 4, "pants-fire": 5 }

train_df = get_df("data/liar_train.tsv", label_map=multi_class)
test_df = get_df("data/liar_test.tsv", label_map=multi_class)

test_df.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lapin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,text,label
0,building a wall on the usmexico border will ta...,0
1,wisconsin is on pace to double the number of l...,4
2,says john mccain has done nothing to help the ...,4
3,suzanne bonamici supports a plan that will cut...,2
4,when asked by a reporter whether hes at the ce...,5


In [448]:
PAD = "<pad>"
UNK = "<unk>"

vocab = { token for statement in pd.concat([train_df["text"], test_df["text"]]) for token in word_tokenize(statement) }
vocab.add(PAD)
vocab.add(UNK)

vocab_idx = { word:idx for idx, word in enumerate(vocab) }

print("Vocab size:", len(vocab_idx))

Vocab size: 14214


In [463]:
type Dim = 50 | 100 | 200 | 300

EMBEDDING_DIM = 100

def get_embeddings(vocab: dict[str, int], dim: Dim = 100) -> nn.Embedding:
    embedding_matrix = torch.zeros(len(vocab), dim)
    with open(f'glove/glove.6B.{dim}d.txt', "r", encoding="utf-8") as file:
        for line in file:
            tokens = line.split()
            word = tokens[0]
            idx = vocab.get(word)
            if idx is not None:
                embedding_matrix[idx] = torch.tensor([float(val) for val in tokens[1:]], dtype=torch.float32)

    return nn.Embedding.from_pretrained(embedding_matrix)

embedding_layer = get_embeddings(vocab_idx, dim=EMBEDDING_DIM)

In [464]:
STATEMENT_DIM = 32

def text_batch_to_idx_tensor(text_batch: list[str], vocab_idx: dict[str, int], length=STATEMENT_DIM) -> torch.Tensor:
    pad_idx = vocab_idx[PAD]
    unk_idx = vocab_idx[UNK]
    or_else = lambda this, that: this if this is not None else that

    def pad_list(lst):
        return lst[:length] + [pad_idx] * max(0, length - len(lst))

    return torch.tensor(
        [pad_list([or_else(vocab_idx.get(word), unk_idx) for word in word_tokenize(text)]) for text in text_batch]
    )


In [471]:
class Transpose(nn.Module):
    def __init__(self, dim1, dim2):
        super().__init__()
        self.dim1 = dim1
        self.dim2 = dim2

    def forward(self, x):
        return x.transpose(self.dim1, self.dim2)

model = nn.Sequential(
    embedding_layer,
    Transpose(1, 2),
    nn.Conv1d(in_channels=EMBEDDING_DIM, out_channels=64, kernel_size=5),
    nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3),
    nn.Conv1d(in_channels=32, out_channels=16, kernel_size=3),
    nn.MaxPool1d(kernel_size=3),
    nn.Flatten(),
    nn.Linear(16 * 8, 6),
    nn.Softmax(1)
)


# data = text_batch_to_idx_tensor(train_df["text"], vocab_idx)
# print(model(data).shape)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [474]:
EPOCHS = 100

data = text_batch_to_idx_tensor(train_df["text"], vocab_idx)
labels = torch.tensor(train_df["label"].tolist())

for epoch in range(EPOCHS):
    outputs = model(data)
    loss = loss_fn(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss)


tensor(1.6735, grad_fn=<NllLossBackward0>)
tensor(1.6726, grad_fn=<NllLossBackward0>)
tensor(1.6716, grad_fn=<NllLossBackward0>)
tensor(1.6689, grad_fn=<NllLossBackward0>)
tensor(1.6670, grad_fn=<NllLossBackward0>)
tensor(1.6664, grad_fn=<NllLossBackward0>)
tensor(1.6648, grad_fn=<NllLossBackward0>)
tensor(1.6627, grad_fn=<NllLossBackward0>)
tensor(1.6615, grad_fn=<NllLossBackward0>)
tensor(1.6602, grad_fn=<NllLossBackward0>)
tensor(1.6583, grad_fn=<NllLossBackward0>)
tensor(1.6568, grad_fn=<NllLossBackward0>)
tensor(1.6558, grad_fn=<NllLossBackward0>)
tensor(1.6542, grad_fn=<NllLossBackward0>)
tensor(1.6526, grad_fn=<NllLossBackward0>)
tensor(1.6518, grad_fn=<NllLossBackward0>)
tensor(1.6511, grad_fn=<NllLossBackward0>)
tensor(1.6503, grad_fn=<NllLossBackward0>)
tensor(1.6489, grad_fn=<NllLossBackward0>)
tensor(1.6461, grad_fn=<NllLossBackward0>)
tensor(1.6445, grad_fn=<NllLossBackward0>)
tensor(1.6444, grad_fn=<NllLossBackward0>)
tensor(1.6428, grad_fn=<NllLossBackward0>)
tensor(1.64

In [475]:
model.eval()

with torch.no_grad():
    data = text_batch_to_idx_tensor(train_df["text"], vocab_idx)
    labels = torch.tensor(train_df["label"].tolist())
    outputs = model(data)
    preds = torch.argmax(outputs, dim=1)

    print(classification_report(labels, preds, labels=[0,1,2,3,4,5], target_names=["true", "mostly-true", "half-true", "barely-true", "false", "pants-fire"]))

    data = text_batch_to_idx_tensor(test_df["text"], vocab_idx)
    labels = torch.tensor(test_df["label"].tolist())
    outputs = model(data)
    preds = torch.argmax(outputs, dim=1)

    print(classification_report(labels, preds, labels=[0,1,2,3,4,5], target_names=["true", "mostly-true", "half-true", "barely-true", "false", "pants-fire"]))

              precision    recall  f1-score   support

        true       0.56      0.33      0.42      1676
 mostly-true       0.52      0.60      0.56      1962
   half-true       0.49      0.63      0.55      2114
 barely-true       0.48      0.47      0.47      1654
       false       0.46      0.60      0.52      1995
  pants-fire       0.00      0.00      0.00       839

    accuracy                           0.49     10240
   macro avg       0.42      0.44      0.42     10240
weighted avg       0.46      0.49      0.47     10240

              precision    recall  f1-score   support

        true       0.25      0.14      0.18       208
 mostly-true       0.26      0.30      0.28       241
   half-true       0.22      0.31      0.26       265
 barely-true       0.20      0.19      0.19       212
       false       0.24      0.29      0.26       249
  pants-fire       0.00      0.00      0.00        92

    accuracy                           0.23      1267
   macro avg       0.20

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
