In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# --- Load the raw source datasets -------------------------------------------
hfdnd = pd.read_csv("../dataset/dataset-merged (HFDND).csv")
# IFND is decoded as latin1 rather than pandas' default encoding.
ifnd = pd.read_csv("../dataset/IFND.csv", encoding="latin1")

# Column names applied to the LIAR TSV files when they are read
# (passed via `names=` below).
liar_columns = [
    'json_id', 'label', 'statement', 'subject', 'speaker',
    'job_title', 'state', 'party', 'barely_true', 'false',
    'half_true', 'mostly_true', 'pants_on_fire', 'context',
]

# Reader options shared by the three LIAR splits.
# NOTE(review): pandas' default quote handling is in effect; if statements
# contain stray double quotes, rows could be merged -- confirm the row
# counts against the dataset's published split sizes.
liar_read_options = {"sep": "\t", "names": liar_columns}

liar_train = pd.read_csv("../dataset/LIAR_train.tsv", **liar_read_options)
# NOTE(review): the valid/test splits are loaded here; confirm whether
# they are meant to be used downstream.
liar_valid = pd.read_csv("../dataset/LIAR_valid.tsv", **liar_read_options)
liar_test = pd.read_csv("../dataset/LIAR_test.tsv", **liar_read_options)

In [3]:
# Report (rows, columns) for each raw frame; only the LIAR train split is shown.
shape_report = (
    ("HFDND Dataset Shape: ", hfdnd),
    ("IFND Dataset Shape: ", ifnd),
    ("LIAR Dataset Shape: ", liar_train),
)
for caption, frame in shape_report:
    print(caption, frame.shape)

HFDND Dataset Shape:  (17124, 4)
IFND Dataset Shape:  (56714, 7)
LIAR Dataset Shape:  (10240, 14)


In [4]:
# 1. Inspect column names of each raw frame
column_report = (
    ("HFDND Columns:", hfdnd),
    ("IFND Columns:", ifnd),
    ("LIAR Columns:", liar_train),
)
for caption, frame in column_report:
    print(caption, frame.columns.tolist())

HFDND Columns: ['Unnamed: 0', 'text', 'label', 'wcount']
IFND Columns: ['id', 'Statement', 'Image', 'Web', 'Category', 'Date', 'Label']
LIAR Columns: ['json_id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state', 'party', 'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'context']


In [5]:
# HFDND pre-processing: keep the text/label columns and tag every row as "hi".
hfdnd_df = hfdnd.loc[:, ["text", "label"]].assign(language="hi")

print("HFDND Preview:\n", hfdnd_df.head(2))

HFDND Preview:
                                                 text  label language
0  ‘मोदी के शासन के दौरान गंगा’  गंगा नदी नरेन्द्...      1       hi
1  यह खबर आने से पहले छवि क्रेडिट जस्टिन सुलिवान/...      1       hi


In [6]:
# IFND pre-processing: keep the statement and its label under the shared
# column names used by the other datasets ("text", "label").
ifnd_df = ifnd[["Statement", "Label"]].rename(
    columns={"Statement": "text", "Label": "label"}
)

# Map the string labels to integers: "TRUE" -> 0, "Fake" -> 1.
ifnd_label_map = {"TRUE": 0, "Fake": 1}

# Series.map turns every value missing from the mapping into NaN without any
# warning, which would leak unlabeled rows into the unified dataset. Fail
# loudly instead so an unexpected spelling or a missing label is noticed here.
ifnd_unmapped = ifnd_df.loc[~ifnd_df["label"].isin(list(ifnd_label_map)), "label"]
if not ifnd_unmapped.empty:
    raise ValueError(
        "IFND contains labels outside the expected mapping: "
        f"{ifnd_unmapped.unique().tolist()} ({len(ifnd_unmapped)} rows)"
    )

ifnd_df["label"] = ifnd_df["label"].map(ifnd_label_map)

ifnd_df["language"] = "en"
print("\nIFND Preview:\n", ifnd_df.head(2))


IFND Preview:
                                                 text  label language
0  WHO praises India's Aarogya Setu app, says it ...      0       en
1  In Delhi, Deputy US Secretary of State Stephen...      0       en


In [7]:
# Binary convention: Real = 0, Fake = 1
def map_liar_to_binary(label):
    """Collapse a LIAR truthfulness rating into a binary label.

    Returns 0 for 'true' / 'mostly-true', 1 for 'false' / 'barely-true' /
    'pants-fire', and None for any other value.
    """
    real_ratings = ('true', 'mostly-true')
    fake_ratings = ('false', 'barely-true', 'pants-fire')

    if label in real_ratings:
        return 0
    # Anything that is neither real nor fake has no binary equivalent.
    return 1 if label in fake_ratings else None

# Attach the binary label to the train split (adds a column to liar_train).
liar_train['label_binary'] = liar_train['label'].apply(map_liar_to_binary)

# Rows whose rating has no binary equivalent come back as missing values;
# drop them, then store the remaining labels as integers.
liar_train_clean = (
    liar_train
    .dropna(subset=['label_binary'])
    .astype({'label_binary': int})
)

binary_label_counts = liar_train_clean['label_binary'].value_counts()
print("LIAR Binary Label Distribution:", binary_label_counts, sep="\n")

LIAR Binary Label Distribution:
label_binary
1    4488
0    3638
Name: count, dtype: int64


In [8]:
# LIAR pre-processing (English): statement + binary label under the shared
# column names, tagged with language "en".
liar_df = (
    liar_train_clean[['statement', 'label_binary']]
    .rename(columns={'statement': 'text', 'label_binary': 'label'})
    .assign(language='en')
)
print("\nLIAR Preview:\n", liar_df.head(2))


LIAR Preview:
                                                 text  label language
0  Says the Annies List political group supports ...      1       en
2  Hillary Clinton agrees with John McCain "by vo...      0       en


In [9]:
# Stack the three processed frames into one multilingual frame with a fresh
# 0..n-1 index.
processed_frames = [hfdnd_df, ifnd_df, liar_df]
unified_df = pd.concat(processed_frames, ignore_index=True)

language_counts = unified_df["language"].value_counts()
print("Unified Dataset Shape:", unified_df.shape)
print("Language Counts:\n", language_counts)
unified_df.head()

Unified Dataset Shape: (81964, 3)
Language Counts:
 language
en    64840
hi    17124
Name: count, dtype: int64


Unnamed: 0,text,label,language
0,‘मोदी के शासन के दौरान गंगा’ गंगा नदी नरेन्द्...,1,hi
1,यह खबर आने से पहले छवि क्रेडिट जस्टिन सुलिवान/...,1,hi
2,गुलाब गेंद वाल डे-नाइट टेस्ट मैच कप्ता विराट क...,0,hi
3,उत्तर कोरिया रॉकेट प्रक्षेपण योजनाएं 71 0 15 0...,1,hi
4,राष्ट्रपति डोनाल्ड ट्रम्प और प्रथम महिला मेलान...,0,hi


In [10]:
# Write the unified frame to disk (without the index column) so later
# steps can load it directly.
output_path = "../dataset/unified_dataset.csv"
unified_df.to_csv(path_or_buf=output_path, index=False)
print(f"Unified dataset saved to {output_path}")

Unified dataset saved to ../dataset/unified_dataset.csv
