### Data Cleaning and Processing

In [121]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [122]:
df = pd.read_csv('../scraper/datasets/latest.csv')
le = LabelEncoder()
cdf = df.copy()
cdf['truth_value'] = le.fit_transform(cdf['truth_value'])

In [123]:
stemmer = SnowballStemmer("english")
def stemm_text(text):
    return ' '.join([stemmer.stem(w) for w in text.split(' ')])

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split(' ')])

T = cdf['claim'].str.split(' \n\n---\n\n').str[0]
T = T.str.replace('-',' ').str.replace('[^\w\s]','').str.replace('\n',' ').str.lower()
stop = stopwords.words('english')
T = T.apply(lambda x: ' '.join([y for y in x.split() if not y.isdigit()]))
T = T.apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))
cdf['claim'] = T

T = cdf['simple_sentence'].str.split(' \n\n---\n\n').str[0]
T = T.str.replace('-',' ').str.replace('[^\w\s]','').str.replace('\n',' ').str.lower()
stop = stopwords.words('english')
T = T.apply(lambda x: ' '.join([y for y in x.split() if not y.isdigit()]))
T = T.apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))
cdf['simple_sentence'] = T
cdf.head(10)

  T = T.str.replace('-',' ').str.replace('[^\w\s]','').str.replace('\n',' ').str.lower()
  T = T.str.replace('-',' ').str.replace('[^\w\s]','').str.replace('\n',' ').str.lower()


Unnamed: 0.1,Unnamed: 0,claim,truth_value,source,simple_sentence
0,0,ukraine theft homicide levels rose due power o...,0,vox-ukraine,ukraine theft homicide levels rose due power o...
1,1,ukrainians beat two berlin residents speaking ...,0,vox-ukraine,ukrainians beat two berlin residents speaking ...
2,2,quote paul goebbels banderites,0,vox-ukraine,quote paul goebbels banderites
3,3,culture good neighborliness course ukrainian s...,0,vox-ukraine,culture good neighborliness course ukrainian s...
4,4,us research ukraine led increase incidence tic...,0,vox-ukraine,us research ukraine led increase incidence tic...
5,5,chile law rights mutants genetically modified ...,0,vox-ukraine,chile law rights mutants approved ostap stakhi...
6,6,covid incidence rate became zero late may,0,vox-ukraine,covid incidence rate became zero late may
7,7,risk death among children vaccinated covid tim...,0,vox-ukraine,risk death among children vaccinated covid tim...
8,8,russias army destroyed,0,politifact,russias army destroyed
9,9,war ukraine,0,politifact,war ukraine


In [124]:
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     certificate has expired (_ssl.c:997)>


False

In [125]:
adf = cdf.copy()
grouped = adf.groupby('truth_value')

# Sample 900 rows from each group
sampled = grouped.apply(lambda x: x.sample(n=566))

# Reset the index of the sampled data
sampled = sampled.reset_index(drop=True)
sampled.head(), sampled['truth_value'].value_counts()

(   Unnamed: 0                                              claim  truth_value  \
 0         409  missiles launched american forces iraq iran fu...            0   
 1         303  ukrainians bled died congressman budd excused ...            0   
 2         514  state union address democrats even positive ne...            0   
 3         413  says democratic leadership presidential candid...            0   
 4         165  says president donald trumps hold ukraine aid ...            0   
 
        source                                    simple_sentence  
 0  politifact  missiles launched american forces iraq iran fu...  
 1  politifact  ukrainians bled congressman budd excused kille...  
 2  politifact  state union address democrats even positive ne...  
 3  politifact  says democratic leadership mourning loss qasse...  
 4  politifact  says president donald trump hold ukraine aid l...  ,
 0    566
 1    566
 Name: truth_value, dtype: int64)

In [126]:
X = sampled[['claim', 'simple_sentence']]
y = sampled['truth_value']

X['claim'] = X['claim'].apply(lambda w: lemmatize_text(w))
X['simple_sentence'] = X['simple_sentence'].apply(lambda w: lemmatize_text(w))
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['claim'] = X['claim'].apply(lambda w: lemmatize_text(w))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['simple_sentence'] = X['simple_sentence'].apply(lambda w: lemmatize_text(w))


Unnamed: 0,claim,simple_sentence
0,missile launched american force iraq iran fund...,missile launched american force iraq iran fund...
1,ukrainian bled died congressman budd excused k...,ukrainian bled congressman budd excused killer...
2,state union address democrat even positive new...,state union address democrat even positive new...
3,say democratic leadership presidential candida...,say democratic leadership mourning loss qassem...
4,say president donald trump hold ukraine aid li...,say president donald trump hold ukraine aid li...


### Training the Model

In [127]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras.preprocessing import sequence, text
from sklearn.metrics import accuracy_score
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix

In [128]:
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [129]:
def tokenizeAndGenerateSequences(X, y):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

    tk1 = text.Tokenizer(num_words=2000)
    tk2 = text.Tokenizer(num_words=2000)
    tk1.fit_on_texts(xtrain['claim'])
    tk2.fit_on_texts(xtrain['simple_sentence'])
    tokenized_train_claim = tk1.texts_to_sequences(xtrain['claim'])
    tokenized_train_ss = tk2.texts_to_sequences(xtrain['simple_sentence'])
    X_train_claim = torch.tensor(tf.keras.preprocessing.sequence.pad_sequences(tokenized_train_claim, maxlen=60)).to(device)
    X_train_ss = torch.tensor(tf.keras.preprocessing.sequence.pad_sequences(tokenized_train_ss, maxlen=60)).to(device)
    tokenized_test_claim = tk1.texts_to_sequences(xtest['claim'])
    tokenized_test_ss = tk1.texts_to_sequences(xtest['simple_sentence'])
    X_test_claim = torch.tensor(tf.keras.preprocessing.sequence.pad_sequences(tokenized_test_claim, maxlen=60)).to(device)
    X_test_ss = torch.tensor(tf.keras.preprocessing.sequence.pad_sequences(tokenized_test_ss, maxlen=60)).to(device)

    # Convert labels to tensors
    y_train = torch.tensor(ytrain.values).float().to(device)
    y_test = torch.tensor(ytest.values).float().to(device)
    
    return X_train_claim, X_train_ss, y_train, X_test_claim, X_test_ss, y_test

# X_train_txt, y_train_txt, X_test_txt, y_test_txt = xtrain, xtest, ytrain, ytest = train_test_split(cdf['claim'], cdf['truth_value'], test_size=0.2, random_state=42)


In [None]:
def tokenizeSentence(X, y, sentence):
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
    tk1 = text.Tokenizer(num_words=2000)
    tk2 = text.Tokenizer(num_words=2000)
    tk1.fit_on_texts(xtrain['claim'])
    tk2.fit_on_texts(xtrain['simple_sentence'])
    tokenized_train_claim = tk1.texts_to_sequences([sentence, ])
    tokenized_train_ss = tk2.texts_to_sequences([simple_sentence, ])
    X_train_claim = torch.tensor(tf.keras.preprocessing.sequence.pad_sequences(tokenized_train_claim, maxlen=60)).to(device)
    X_train_ss = torch.tensor(tf.keras.preprocessing.sequence.pad_sequences(tokenized_train_ss, maxlen=60)).to(device)
    y_train = torch.tensor(ytrain.values).float().to(device)

    X_train_claim, X_train_ss, y_train

In [130]:
X_train_claim, X_train_ss, y_train, X_test_claim, X_test_ss, y_test = tokenizeAndGenerateSequences(X, y)

In [131]:

# Define the model architecture
class BiLSTM(nn.Module):
    def __init__(self, num_words, embed_size, hidden_size, fc_out_size, output_size, dropout_rate):
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(num_words, embed_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.bilstm1 = nn.LSTM(embed_size, hidden_size, batch_first=True, bidirectional=True)
        self.bilstm2 = nn.LSTM(hidden_size * 2, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.bilstm1(x)
        x = self.dropout(x)
        x, _ = self.bilstm2(x)
        x = self.dropout(x)
        x = self.fc(x[:, -1, :])
        return self.sigmoid(x)
    


In [132]:

# Train the model
te = 20
acc = []
tracc = []
dataset = torch.utils.data.TensorDataset(X_train_ss, y_train)
dataloader = torch.utils.data.DataLoader(
    dataset, batch_size=1024, shuffle=True)
curtraacc = []
model = None
for e in range(1, te+1):
    ctracc = 0
    model = BiLSTM(num_words=2000, embed_size=60, hidden_size=64, fc_out_size=5000, output_size=1, dropout_rate=0.2).to(device)
    model.train()
    criterion = nn.BCELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    model.train()
    for epoch in range(e):
        optimizer.zero_grad()
        outputs = model(X_train_claim.to(device))
        loss = criterion(outputs.squeeze(), y_train.to(device))
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            predictions = model(X_train_claim.to(device))
            predictions = (predictions > 0.5).to('cpu').int().squeeze().numpy()
        train_accuracy = accuracy_score(y_train.to(device).to('cpu'), predictions)
        ctracc += train_accuracy
        curtraacc.append(train_accuracy)

    ctracc /= e

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_ss)
        predictions = (predictions > 0.5).to('cpu').int().squeeze().numpy()

    print(f"Total Epochs: {e}, Train Accuracy: {ctracc} Test Accuracy: {accuracy_score(y_test.to('cpu'), predictions)}")
    acc.append(accuracy_score(y_test.to('cpu'), predictions))
    tracc.append(ctracc)
print('Max acc -', max(acc), ' with epochs -', acc.index(max(acc)))

Total Epochs: 1, Train Accuracy: 0.666077738515901 Test Accuracy: 0.6236749116607774
Total Epochs: 2, Train Accuracy: 0.7040636042402827 Test Accuracy: 0.7137809187279152
Total Epochs: 3, Train Accuracy: 0.7031802120141343 Test Accuracy: 0.6784452296819788
Total Epochs: 4, Train Accuracy: 0.7093639575971732 Test Accuracy: 0.7084805653710248
Total Epochs: 5, Train Accuracy: 0.7321554770318022 Test Accuracy: 0.7438162544169611
Total Epochs: 6, Train Accuracy: 0.7638398115429919 Test Accuracy: 0.765017667844523
Total Epochs: 7, Train Accuracy: 0.8066633013629481 Test Accuracy: 0.7526501766784452
Total Epochs: 8, Train Accuracy: 0.7972614840989399 Test Accuracy: 0.7526501766784452
Total Epochs: 9, Train Accuracy: 0.8168433451118964 Test Accuracy: 0.7473498233215548
Total Epochs: 10, Train Accuracy: 0.800530035335689 Test Accuracy: 0.7579505300353356
Total Epochs: 11, Train Accuracy: 0.8355284291680051 Test Accuracy: 0.7632508833922261
Total Epochs: 12, Train Accuracy: 0.8596878680800942 Te

In [None]:
x_test_new = X_test_claim.detach().cpu().numpy().tolist()
y_test_new = y_test.detach().cpu().numpy().tolist()

In [None]:
m1 = BiLSTM(num_words=2000, embed_size=60, hidden_size=64, fc_out_size=5000, output_size=1, dropout_rate=0.2).to(device)
m2 = BiLSTM(num_words=2000, embed_size=60, hidden_size=64, fc_out_size=5000, output_size=1, dropout_rate=0.2).to(device)
s1 = torch.load(r'C:\Users\karun\Documents\Code\FYP-Fake-News\BiLSTM\rs.pt', map_location=torch.device('cpu'))
s2 = torch.load(r'C:\Users\karun\Documents\Code\FYP-Fake-News\BiLSTM\ss.pt', map_location=torch.device('cpu'))
m1.load_state_dict(s1)
m2.load_state_dict(s2)

<All keys matched successfully>

In [None]:
svmx = []
svmy = []

ytr = []
cury = y_test.detach().cpu().numpy()
xtr = []
for i in range(len(cury)):
    ytr.append(cury[i])
    xtr.append(X_test_ss[i].detach().cpu().numpy())
    
m1.eval()
m2.eval()
with torch.no_grad():
    p1 = m1(torch.tensor(np.array(xtr)).to(device))
    p1 = (p1 > 0.5).to('cpu').int().squeeze().numpy().tolist()
    p2 = m2(torch.tensor(np.array(xtr)).to(device))
    p2 = (p2 > 0.5).to('cpu').int().squeeze().numpy().tolist()
    svmx = [X_test_ss[i].detach().cpu().numpy().tolist() + [p1[i], p2[i]] for i in range(len(p1))]
    
    Y_TEST = torch.tensor(np.array(ytr)) 
    

    print(classification_report(Y_TEST, p1))
    print(confusion_matrix(Y_TEST, p1))
    print(classification_report(Y_TEST, p2))
    print(confusion_matrix(Y_TEST, p2))

              precision    recall  f1-score   support

         0.0       0.68      0.49      0.57       269
         1.0       0.63      0.79      0.70       297

    accuracy                           0.65       566
   macro avg       0.66      0.64      0.64       566
weighted avg       0.66      0.65      0.64       566

[[132 137]
 [ 61 236]]
              precision    recall  f1-score   support

         0.0       0.66      0.53      0.59       269
         1.0       0.64      0.76      0.69       297

    accuracy                           0.65       566
   macro avg       0.65      0.64      0.64       566
weighted avg       0.65      0.65      0.64       566

[[142 127]
 [ 72 225]]


In [None]:
from sklearn import svm

clf = svm.SVC()
clf.fit(svmx, ytr)

In [None]:
preds = clf.predict(svmx)
print(classification_report(ytr, preds))
print(confusion_matrix(ytr, preds))

              precision    recall  f1-score   support

         0.0       0.71      0.81      0.76       269
         1.0       0.80      0.70      0.75       297

    accuracy                           0.75       566
   macro avg       0.76      0.76      0.75       566
weighted avg       0.76      0.75      0.75       566

[[218  51]
 [ 88 209]]
