## Import Data

In [None]:
# Import pandas and pull the LIAR-PLUS training split (tab-separated) straight from GitHub.
import pandas as pd

# No `header` argument is passed, so read_csv uses the first line of the file as
# the column labels.
LIAR_PLUS_TRAIN_URL = 'https://raw.githubusercontent.com/Tariq60/LIAR-PLUS/refs/heads/master/dataset/tsv/train2.tsv'
train_data = pd.read_csv(LIAR_PLUS_TRAIN_URL, sep="\t")
train_data.head(1)

In [None]:
# Column 1: the ID of the statement ([ID].json).
# Column 2: the label.
# Column 3: the statement.
# Column 4: the subject(s).
# Column 5: the speaker.
# Column 6: the speaker's job title.
# Column 7: the state info.
# Column 8: the party affiliation.
# Columns 9-13: the total credit history count, including the current statement.
# 9: barely true counts.
# 10: false counts.
# 11: half true counts.
# 12: mostly true counts.
# 13: pants on fire counts.
# Column 14: the context (venue / location of the speech or statement).
# Column 15: the extracted justification

In [None]:
# The file has no header row, so read_csv used the first record as the column labels.
# Keep those labels here so that record can be put back into the dataframe later.
# NOTE(review): pandas renames duplicate header labels ("0", "0.1", "0.2", ...), so repeated
# values in that first record may not match the file exactly - confirm before relying on them.
first_data = train_data.columns

In [None]:
# Name the columns and put the first record (which read_csv consumed as the header
# row) back at the end of the dataframe.
column_names = ['index', 'ID of statement', 'label', 'statement', 'subject', 'speaker', "speaker's job title",
                'state info', 'party affiliation', 'barely true counts', 'false counts', 'half true counts',
                'mostly true counts', 'pants on fire counts', 'context', 'extracted justification']
count_columns = ['barely true counts', 'false counts', 'half true counts', 'mostly true counts',
                 'pants on fire counts']

# pandas de-duplicates header labels by appending ".1", ".2", ..., so repeated counts in the
# first record come back as "0.1", "0.2", ... instead of "0". The count fields are assumed to
# be whole numbers in the file, so everything from the first "." onwards is mangling; strip it.
first_row = [
    str(value).split('.')[0] if name in count_columns else value
    for name, value in zip(column_names, first_data)
]
train_data.loc[train_data.shape[0]] = first_row
train_data.columns = column_names

# The index column serves the same purpose as the statement ID, so drop it.
train_data = train_data.drop(columns=['index'])

# Appending the restored record as strings turned the count columns into object dtype;
# convert them back to numbers (anything unparseable becomes NaN rather than raising).
train_data[count_columns] = train_data[count_columns].apply(pd.to_numeric, errors='coerce')

## Data Cleaning

In [None]:
# label -> clean :)
# subject -> maybe change str to list
# speaker -> clean :)
# speaker's job title -> need to fix big and little letters
# State info -> clean :)
# party affiliation -> clean :)
# barely true counts -> not gonna modify
# false counts -> not gonna modify
# half true counts -> not gonna modify
# mostly true counts -> not gonna modify
# pants on fire counts -> not gonna modify
# context -> assume clean :)
# extracted -> str to list

In [None]:
# Clean the text columns in one step:
#   subject                 -> list of subjects (split on ",")
#   speaker's job title     -> lower-cased so capitalisation variants match
#   extracted justification -> list of tokens (split on a single space)
train_data = train_data.assign(**{
    'subject': train_data['subject'].str.split(","),
    "speaker's job title": train_data["speaker's job title"].str.lower(),
    "extracted justification": train_data["extracted justification"].str.split(" "),
})

train_data.head(3)

## Factuality Factor

* Social Credibility: People are more likely to perceive a source as credible if others perceive the source is credible
* Stance Detection: What is the political or issue stance of the article or text corpus? How does that affect the veracity of the article or text?

#### Social Credibility
* Source History: Delve into the past of the post or source to understand its track record
* Endorsement checks: a post or source that has been endorsed or validated by external reputable entities gains credibility
* Revision analysis: check if the content has been revised, updated, or retracted in the past

* Basic info:
    * 10243 total rows in df
    * 4346 types of unique context
    * top five context
        * a news release                                                  241
        * an interview                                                    229
        * a press release                                                 223
        * a speech                                                        214
        * a TV ad                                                         180

In [None]:
# all imports
import numpy as np
import scipy
import sklearn
import keras

from sklearn.preprocessing import OneHotEncoder
# Dense (non-sparse) one-hot encoder that encodes unseen categories as all zeros.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, so try the current name first and fall back to the old one on older installs.
try:
    ohe = OneHotEncoder(handle_unknown = "ignore", sparse_output = False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown = "ignore", sparse = False)

# citation: https://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [None]:
# dropping null
# NOTE: dropna() without `subset` removes a row when *any* column is null, including
# columns that are not used as features below.
train_data = train_data.dropna()

# keeping only the three feature columns, plus the label
training_data = train_data[["speaker", "context", "party affiliation"]]
train_label = train_data[['label']]


def _one_hot(column):
    """One-hot encode a single column with the shared `ohe` encoder.

    Returns the dense encoded array together with the category names in the order
    of the encoded columns. The names are read from `ohe.categories_` rather than
    `column.unique()`: the encoder orders its output by `categories_`, which is
    not the order in which values first appear in the data.
    """
    encoded = ohe.fit_transform(np.asarray(column).reshape(-1, 1))
    return encoded, list(ohe.categories_[0])


# ohe (labels and features)
ohe_label, label_names = _one_hot(train_label['label'])
ohe_label_df = pd.DataFrame(ohe_label, columns = label_names)
ohe_speaker, speaker_names = _one_hot(training_data['speaker'])
ohe_context, context_names = _one_hot(training_data['context'])
ohe_party, party_names = _one_hot(training_data['party affiliation'])

# one wide feature matrix: speaker columns, then context columns, then party columns
ohe_speaker_ohe_context_ohe_party = np.hstack((ohe_speaker, ohe_context, ohe_party))

# grouping the ohe of the training data back
small_testing_ohe_speaker_context_party_df = pd.DataFrame(
    ohe_speaker_ohe_context_ohe_party,
    columns = speaker_names + context_names + party_names,
)


# splitting the data into 8:2 for training and testing data
# (positional split: the first 80% of rows train, the remaining rows test; no shuffling)
split_at = int(small_testing_ohe_speaker_context_party_df.shape[0]*0.8)

training_data = small_testing_ohe_speaker_context_party_df[:split_at]
training_label = ohe_label_df[:split_at]

test_data = small_testing_ohe_speaker_context_party_df[split_at:]
test_label = ohe_label_df[split_at:]
# citation: https://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [None]:
# algorithm/method #1 (keras simple feed forward neural network)
from keras.models import Sequential
from keras import layers
from keras.layers import Dense

# Simple feed-forward classifier: three small sigmoid hidden layers followed by a
# softmax over the one-hot label columns.
n_features = training_data.shape[1]
n_classes = training_label.shape[1]  # one output unit per one-hot label column

nn = keras.Sequential([
    # `shape` must be a tuple; `(n)` without the trailing comma is just an int,
    # which Keras 3 rejects.
    keras.Input(shape=(n_features,)),
    layers.Dense(10, activation='sigmoid'),
    layers.Dense(10, activation='sigmoid'),
    layers.Dense(10, activation='sigmoid'),
    # softmax makes each output row a probability distribution over the labels
    layers.Dense(n_classes, activation='softmax'),
])

# Categorical cross-entropy is the matching loss for one-hot targets with a softmax output.
# `metrics` is passed as a list: Keras 3 no longer accepts a bare string here.
nn.compile(
    optimizer = keras.optimizers.Adam(learning_rate = 0.01),
    loss = 'categorical_crossentropy',
    metrics = ["accuracy"],
)

nn.fit(training_data, training_label, batch_size = 10, epochs = 100)

# citation: https://keras.io/guides/sequential_model/

In [None]:
# Run the trained keras model on the held-out test features.
# The result holds one row of scores per test row (presumably one score per label
# column - confirm against the model's output layer).
predictions = nn.predict(test_data)

In [None]:
# The model emits one score per label for every test row. Turn each row into a one-hot
# list with a single 1 at the highest-scoring label and 0 elsewhere. argmax picks the
# first label on a tie, so every row carries exactly one 1 (comparing against max() per
# element marked every tied position and recomputed the maximum for each element).
scores = np.asarray(predictions)
cleaned_output = np.eye(scores.shape[1], dtype=int)[scores.argmax(axis=1)].tolist()

In [None]:
# Turn the one-hot test labels into plain nested lists so each row can be compared
# directly with the corresponding predicted row when checking accuracy.
cleaned_test_label = test_label.to_numpy().tolist()

In [None]:
# Tally the test rows whose predicted one-hot row equals the true one-hot row exactly.
correct = sum(
    1
    for row_number, predicted_row in enumerate(cleaned_output)
    if predicted_row == cleaned_test_label[row_number]
)

correct

In [None]:
# Testing accuracy as a fraction of correct predictions (multiply by 100 for a percentage).
correct / len(cleaned_output)

#### The rest is for other model testing, but we decided to keep the one above since it worked the best.

In [None]:
# # algorithm/method #2
# nn = keras.Sequential([
#     keras.Input(shape=(small_training_ohe_speaker_context_df.shape[1])),
#     layers.Dense(10, activation='relu'),
#     layers.Dense(10, activation='relu'),
#     layers.Dense(10, activation='relu'),
#     layers.Dense(6),
# ])

# nn.compile(optimizer = keras.optimizers.Adam(learning_rate = 0.01), loss = 'mse', metrics = "accuracy")

# nn.fit(small_training_ohe_speaker_context_df, small_training_ohe_label_df, batch_size = 10, epochs = 50)

# # citation: https://keras.io/guides/sequential_model/

In [None]:
# # algorithm/method #3
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import cross_val_score
# dt = DecisionTreeClassifier(criterion= "entropy", splitter = "best")

# accuracy = cross_val_score(dt, small_training_ohe_speaker_context_df, small_training_ohe_label_df, cv=5)

# accuracy

# # citation: https://scikit-learn.org/dev/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# # https://scikit-learn.org/stable/modules/cross_validation.html

#### Stance Detection (Political Affiliation) -> we will be working on this using GenAI
* Language Inspection: Scrutinize content for language indicative of political inclination
* Disclosure Checks: Ensure any affiliations by the author or source are openly disclosed
* Fact-checker comparison: contrast content claims against neutral, non-partisan fact-checkers