# Fake News Detection <font color='ecb400'> LIAR Dataset </font>

In [1]:
import csv

import numpy as np
import pandas as pd

# Using pandas here, although I generally prefer polars.

### <font color='AFBADC'> Preprocessing </font>

In [2]:
# Names assigned to the 14 columns of the LIAR TSV files, in file order.
column_names = [
    # Columns 1-8: statement metadata.
    'id',                 # Column 1: the ID of the statement ([ID].json).
    'label',              # Column 2: the label.
    'statement',          # Column 3: the statement.
    'subjects',           # Column 4: the subject(s).
    'speaker',            # Column 5: the speaker.
    'speaker_job_title',  # Column 6: the speaker's job title.
    'state_info',         # Column 7: the state info.
    'party_affiliation',  # Column 8: the party affiliation.

    # Columns 9-13: the total credit history count, including the current
    # statement:
    #   count_1 = barely true, count_2 = false, count_3 = half true,
    #   count_4 = mostly true, count_5 = pants on fire.
    *(f'count_{n}' for n in range(1, 6)),

    # Column 14: the context (venue / location of the speech or statement).
    'context',
]

# The data is already split into train, test, and validation sets.
# [REFERENCE]: https://www.cs.ucsb.edu/~william/data/liar_dataset.zip
# We are using this for convenience and consistency
#
# The files have no header row, so the column names are supplied explicitly.
# Statements can contain literal double-quote characters. With pandas' default
# quoting, a field that starts with `"` is parsed as a quoted field and can
# swallow tabs/newlines up to the next `"`, silently merging rows.
# csv.QUOTE_NONE treats every `"` as an ordinary character instead.
# NOTE(review): this may change row counts relative to earlier runs -- re-run
# the value_counts() cells below and refresh their outputs.
train_data = pd.read_csv('./liar_dataset/train.tsv', sep='\t', header=None, names=column_names, quoting=csv.QUOTE_NONE)
test_data  = pd.read_csv('./liar_dataset/test.tsv',  sep='\t', header=None, names=column_names, quoting=csv.QUOTE_NONE)
valid_data = pd.read_csv('./liar_dataset/valid.tsv', sep='\t', header=None, names=column_names, quoting=csv.QUOTE_NONE)

In [3]:
# Show the first three training rows to sanity-check the parsed columns.
train_data.head(3)

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job_title,state_info,party_affiliation,count_1,count_2,count_3,count_4,count_5,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver


<font size=5> For the initial baseline, we'll start with binary classification. </font>

<font size=3.5>[PolitiFact descriptions of the labels](https://www.politifact.com/article/2018/feb/12/principles-truth-o-meter-politifacts-methodology-i/#Truth-O-Meter%20ratings)</font>

<font size=3.5>[Example of a binary label grouping used in a prior paper](https://aclanthology.org/W18-5513.pdf)</font>

In [4]:
# Count how many training rows carry each label value.
train_data['label'].value_counts()

label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64

In [8]:
def binarize_labels(data: pd.DataFrame) -> pd.DataFrame:
    """Reduce the six-way classification labels to binary labels.

    'true', 'mostly-true' and 'half-true' become 1; every other value
    (for this dataset: 'barely-true', 'false', 'pants-fire') becomes 0.

    The input frame is not modified; a new frame is returned. The function
    is idempotent: if the 'label' column is already numeric and contains
    only 0/1, the labels are passed through unchanged, so re-running the
    notebook cell does not collapse every label to 0.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a 'label' column.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` whose 'label' column holds the binary labels.
    """
    # We may change this grouping if we want.
    true_labels = ['true', 'mostly-true', 'half-true']

    labels = data['label']

    # Already binarized (e.g. the cell was executed twice): keep the labels.
    if pd.api.types.is_numeric_dtype(labels) and labels.isin([0, 1]).all():
        return data.copy()

    # Vectorized membership test; assign() returns a new frame, so the
    # caller's original DataFrame keeps its string labels.
    return data.assign(label=labels.isin(true_labels).astype('int64'))

# Apply the same label binarization to every split, in train/test/valid order.
train_data, test_data, valid_data = map(
    binarize_labels, (train_data, test_data, valid_data)
)


In [6]:
# Class balance of the training split after binarize_labels (1 = true group, 0 = the rest).
train_data['label'].value_counts()

label
1    5752
0    4488
Name: count, dtype: int64