# Fake News Detection Using Natural Language Processing and Machine Learning

# 1. Setup and imports

In [3]:
import pandas as pd
import json
import glob
import os


# 2. Load Datasets

## To make combined datset for all three datasets LIAR, FakeNewsNet, and ISOT datsets, load them separately and choose relevant coloumn for analysis. Here our primary objective is binary classification, and labels are marked accordingly. We also have retained the original labels, for future scope of multiclass classification. 

# 3. Load LIAR Dataset

In [6]:
def load_liar_data():
    cols = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state', 'party',
            'barely_true_count', 'false_count', 'half_true_count',
            'mostly_true_count', 'pants_on_fire_count', 'context']

    liar_parts = []
    for part in ['train', 'test', 'valid']:
        df = pd.read_csv(f'../data/LIAR/{part}.tsv', sep='\t', header=None, names=cols)
        df = df[['statement', 'label']]
        liar_parts.append(df)

    liar_df = pd.concat(liar_parts).rename(columns={'statement': 'text'})
    liar_df['original_label'] = liar_df['label']
    liar_df['label'] = liar_df['label'].map(lambda x: 1 if x in ['true', 'mostly-true'] else 0)
    liar_df['dataset'] = 'LIAR'
    return liar_df[['text', 'label', 'original_label', 'dataset']]

In [7]:
load_liar_data()

Unnamed: 0,text,label,original_label,dataset
0,Says the Annies List political group supports ...,0,false,LIAR
1,When did the decline of coal start? It started...,0,half-true,LIAR
2,"Hillary Clinton agrees with John McCain ""by vo...",1,mostly-true,LIAR
3,Health care reform legislation is likely to ma...,0,false,LIAR
4,The economic turnaround started at the end of ...,0,half-true,LIAR
...,...,...,...,...
1279,"For the first time in more than a decade, impo...",0,half-true,LIAR
1280,Says Donald Trump has bankrupted his companies...,1,mostly-true,LIAR
1281,"John McCain and George Bush have ""absolutely n...",1,true,LIAR
1282,A new poll shows 62 percent support the presid...,0,false,LIAR


# 4. Load ISOT Dataset

In [None]:
def load_isot_data():
    real_df = pd.read_csv("../data/ISOT/True.csv")
    fake_df = pd.read_csv("../data/ISOT/Fake.csv")

    real_df['label'] = 1
    fake_df['label'] = 0

    isot_df = pd.concat([real_df[['text', 'label']], fake_df[['text', 'label']]])
    isot_df['original_label'] = isot_df['label']
    isot_df['dataset'] = 'ISOT'
    return isot_df[['text', 'label', 'original_label', 'dataset']]

In [12]:
load_isot_data()

Unnamed: 0,text,label,original_label,dataset
0,WASHINGTON (Reuters) - The head of a conservat...,1,1,ISOT
1,WASHINGTON (Reuters) - Transgender people will...,1,1,ISOT
2,WASHINGTON (Reuters) - The special counsel inv...,1,1,ISOT
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1,1,ISOT
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1,1,ISOT
...,...,...,...,...
23476,21st Century Wire says As 21WIRE reported earl...,0,0,ISOT
23477,21st Century Wire says It s a familiar theme. ...,0,0,ISOT
23478,Patrick Henningsen 21st Century WireRemember ...,0,0,ISOT
23479,21st Century Wire says Al Jazeera America will...,0,0,ISOT


# 5. Load FakeNewsNet Dataset

### Note : This is the minimilistic version of dataset for text analysis 

In [18]:
def load_minimal_fakenewsnet():
    real_pol = pd.read_csv("../data/FakeNewsNet/politifact_real.csv")
    fake_pol = pd.read_csv("../data/FakeNewsNet/politifact_fake.csv")
    real_gos = pd.read_csv("../data/FakeNewsNet/gossipcop_real.csv")
    fake_gos = pd.read_csv("../data/FakeNewsNet/gossipcop_fake.csv")

    real_pol['label'] = 1
    fake_pol['label'] = 0
    real_gos['label'] = 1
    fake_gos['label'] = 0

    fakenewsnet_df = pd.concat([
        real_pol[['title', 'label']].rename(columns={'title': 'text'}),
        fake_pol[['title', 'label']].rename(columns={'title': 'text'}),
        real_gos[['title', 'label']].rename(columns={'title': 'text'}),
        fake_gos[['title', 'label']].rename(columns={'title': 'text'})
    ])
    fakenewsnet_df['original_label'] = fakenewsnet_df['label']
    fakenewsnet_df['dataset'] = 'FakeNewsNet_Minimal'
    return fakenewsnet_df


In [20]:
load_minimal_fakenewsnet()

Unnamed: 0,text,label,original_label,dataset
0,National Federation of Independent Business,1,1,FakeNewsNet_Minimal
1,comments in Fayetteville NC,1,1,FakeNewsNet_Minimal
2,"Romney makes pitch, hoping to close deal : Ele...",1,1,FakeNewsNet_Minimal
3,Democratic Leaders Say House Democrats Are Uni...,1,1,FakeNewsNet_Minimal
4,"Budget of the United States Government, FY 2008",1,1,FakeNewsNet_Minimal
...,...,...,...,...
5318,September 11: Celebrities Remember 9/11 (TWEETS),0,0,FakeNewsNet_Minimal
5319,NASCAR owners threaten to fire drivers who pro...,0,0,FakeNewsNet_Minimal
5320,The 7 signs that David Beckham is definitely h...,0,0,FakeNewsNet_Minimal
5321,Ryan Gosling and Eva Mendes Did Not Get Marrie...,0,0,FakeNewsNet_Minimal


# 6. Combine All datasets, LIAR, ISOT, and FakeNewsNet

In [25]:
def combine_datasets():
    liar = load_liar_data()
    isot = load_isot_data()
    fakenewsnet = load_minimal_fakenewsnet()

    combined_df = pd.concat([liar, isot, fakenewsnet], ignore_index=True)
    combined_df = combined_df.dropna(subset=['text'])
    combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Output folder
    output_dir = "../data/interim"
    os.makedirs(output_dir, exist_ok=True)

    # Save the file
    output_path = os.path.join(output_dir, "combined_fake_news_dataset.csv")
    combined_df.to_csv(output_path, index=False)
    print(f"Combined dataset saved to: {output_path}")
    print(f"Total samples: {len(combined_df)}")

In [26]:
combine_datasets()

Combined dataset saved to: ../data/interim\combined_fake_news_dataset.csv
Total samples: 80885
