In [356]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split


In [357]:
# Locations of the three tab-separated input splits, relative to the notebook's directory.
train_path, val_path, test_path = (
    "../data/train2.tsv",
    "../data/val2.tsv",
    "../data/test2.tsv",
)

In [358]:
# Names assigned positionally to the 16 fields of each header-less TSV split.
cols = [
    # row bookkeeping and target
    "index",
    "id",
    "label",
    # statement and speaker metadata
    "statement",
    "subject",
    "speaker",
    "job",
    "state",
    "party",
    # per-rating tallies (presumably the speaker's rating history -- TODO confirm)
    "barely_true",
    "false",
    "half_true",
    "mostly_true",
    "pants_on_fire",
    # free-text fields
    "context",
    "justification",
]


In [359]:
def load_liar_split(path):
    """Read one header-less TSV split and return it as a tidied DataFrame.

    Steps: name the columns from `cols`, drop the stored "index" column,
    strip the ".json" suffix from "id", and trim surrounding whitespace from
    every text column.

    Parameters
    ----------
    path : str
        Path to a tab-separated file with one field per entry in `cols`.

    Returns
    -------
    pandas.DataFrame
        The cleaned split with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the file's column count does not match `len(cols)`.
    """
    # NOTE(review): pandas' default quote handling can merge physical lines when a
    # field contains an unbalanced double quote. Compare the loaded row counts with
    # the raw line counts and consider quoting=csv.QUOTE_NONE -- TODO confirm.
    frame = pd.read_csv(path, sep="\t", header=None)
    frame.columns = cols

    # The first field only repeats the row position, so it is not kept.
    frame = frame.drop(columns=["index"])

    # Literal (non-regex) removal of the ".json" suffix from the identifier.
    frame["id"] = frame["id"].str.replace(".json", "", regex=False)

    # Select both "object" and "string" columns: pandas versions that infer a
    # dedicated string dtype no longer report text columns as "object", which
    # would make an object-only selection skip the strip silently.
    text_columns = frame.select_dtypes(include=["object", "string"]).columns
    for name in text_columns:
        frame[name] = frame[name].str.strip()

    return frame.reset_index(drop=True)


# One cleaned frame per split, in train / validation / test order.
dfs = [load_liar_split(path) for path in (train_path, val_path, test_path)]

In [360]:
# (rows, columns) of the frame loaded from train_path.
dfs[0].shape

(10240, 15)

In [361]:
# (rows, columns) of the frame loaded from val_path.
dfs[1].shape

(1284, 15)

In [362]:
# (rows, columns) of the frame loaded from test_path.
dfs[2].shape

(1267, 15)

In [363]:
# Stack the three split frames into a single table with a fresh 0..n-1 index.
liarplus = pd.concat(objs=dfs, ignore_index=True)

In [364]:
# Remove the metadata columns that are not part of the merged dataset's schema.
clean_liarplus = liarplus.drop(["subject", "state", "party"], axis="columns")

In [365]:
# Load the scraped PolitiFact table (comma-separated, first row is the header).
scraped_politifact = pd.read_csv(filepath_or_buffer="../data/politifact.csv")

In [366]:
# List the scraped column names so the positional rename that follows can be checked against them.
scraped_politifact.columns

Index(['Unnamed: 0', 'speaker', 'quote', 'rating', 'date', 'context',
       'justification', 'link', 'party_affiliation', 'true_count',
       'mostly_true_count', 'half_true_count', 'mostly_false_count',
       'false_count', 'pants_on_fire_count'],
      dtype='object')

In [367]:
# Replacement names for the scraped table, applied by position (order must match the file).
renamed_cols = [
    "id",
    "speaker",
    "statement",
    "label",
    "date",
    "context",
    "justification",
    "link",
    "party_affiliation",
    # per-rating count columns
    "true",
    "mostly_true",
    "half_true",
    "mostly_false",
    "false",
    "pants_on_fire",
]

In [368]:
# Relabel the scraped columns positionally; a length mismatch raises ValueError.
scraped_politifact = scraped_politifact.set_axis(renamed_cols, axis=1)

In [369]:
# Remove the scraped columns that are not part of the merged dataset's schema.
clean_politifact = scraped_politifact.drop(
    ["date", "link", "party_affiliation"], axis="columns"
)

In [370]:
def normalize_speaker(name):
    """Return a canonical lowercase form of a speaker name for joining datasets.

    Hyphens become spaces, every other punctuation character is removed, and
    runs of whitespace collapse to one space, so "donald-trump" and
    "Donald  Trump." both become "donald trump".

    Parameters
    ----------
    name : str or missing value
        Raw speaker value. Non-string values (e.g. numbers) are converted
        with str() instead of raising AttributeError.

    Returns
    -------
    str
        The normalized name, or "" when `name` is missing (None / NaN / pd.NA).
    """
    if pd.isna(name):
        return ""
    # str() keeps a stray numeric cell from crashing the whole .apply() call.
    name = str(name).lower()
    # replace hyphens with spaces: "donald-trump" -> "donald trump"
    name = name.replace("-", " ")
    # remove punctuation (anything that is neither a word character nor whitespace)
    name = re.sub(r"[^\w\s]", "", name)
    # collapse multiple spaces
    name = re.sub(r"\s+", " ", name)
    return name.strip()

In [371]:
# Bring the speaker names of both sources into the same canonical form so they can be joined.
for frame in (clean_liarplus, clean_politifact):
    frame["speaker"] = frame["speaker"].apply(normalize_speaker)


In [372]:
# Build a speaker -> job table with exactly ONE row per speaker.
# Deduplicating on (speaker, job) pairs would keep several rows for a speaker
# who appears with more than one job value (including a missing one), and the
# left merge below would then duplicate that speaker's PolitiFact rows.
# The empty speaker "" (produced for missing names) is excluded so that
# unnamed rows do not inherit an unrelated job.
job_lookup = (
    clean_liarplus.loc[clean_liarplus["speaker"] != "", ["speaker", "job"]]
    .dropna(subset=["job"])
    # keep the first non-missing job seen for each speaker, in row order
    .drop_duplicates(subset="speaker")
)

# Dropping any existing "job" column first keeps the cell idempotent: re-running
# it will not produce job_x / job_y. validate= fails loudly if the lookup ever
# stops being unique per speaker.
clean_politifact = clean_politifact.drop(columns=["job"], errors="ignore").merge(
    job_lookup, on="speaker", how="left", validate="many_to_one"
)


In [373]:
# Speakers without a match in the job lookup get the placeholder "Unknown".
missing_job = clean_politifact["job"].isna()
clean_politifact.loc[missing_job, "job"] = "Unknown"


In [374]:
# The scraped table has no "barely_true" count column; add it as a constant 0
# so both sources share one schema.
# NOTE(review): the scraped table carries "mostly_false" instead -- confirm whether
# these two columns describe the same rating and should be combined.
clean_politifact = clean_politifact.assign(barely_true=0)


In [375]:
# (rows, columns) after the job merge and the added barely_true column.
clean_politifact.shape

(570, 14)

In [376]:
# The TSV splits have no "true" or "mostly_false" count columns; add both as
# constant 0 so the schema matches the scraped table.
clean_liarplus = clean_liarplus.assign(**{"true": 0, "mostly_false": 0})


In [377]:
# (rows, columns) after adding the two placeholder count columns.
clean_liarplus.shape

(12791, 14)

In [378]:
# Column names of the scraped table, to compare against clean_liarplus before aligning order.
clean_politifact.columns

Index(['id', 'speaker', 'statement', 'label', 'context', 'justification',
       'true', 'mostly_true', 'half_true', 'mostly_false', 'false',
       'pants_on_fire', 'job', 'barely_true'],
      dtype='object')

In [379]:
# Column names of the TSV-derived table; same set as clean_politifact, different order.
clean_liarplus.columns

Index(['id', 'label', 'statement', 'speaker', 'job', 'barely_true', 'false',
       'half_true', 'mostly_true', 'pants_on_fire', 'context', 'justification',
       'true', 'mostly_false'],
      dtype='object')

In [380]:
# Shared column order applied to both sources before they are concatenated.
final_cols = [
    # identity, target and text
    "id",
    "label",
    "statement",
    "speaker",
    "job",
    # per-rating count columns
    "true",
    "mostly_true",
    "half_true",
    "barely_true",
    "mostly_false",
    "false",
    "pants_on_fire",
    # free-text fields
    "context",
    "justification",
]



In [381]:
# Project both tables onto the shared column order; a missing column raises KeyError.
final_liarplus = clean_liarplus.loc[:, final_cols]
final_politifact = clean_politifact.loc[:, final_cols]


In [382]:
# (rows, columns) of the TSV-derived table in the shared column order.
final_liarplus.shape

(12791, 14)

In [383]:
# (rows, columns) of the scraped table in the shared column order.
final_politifact.shape

(570, 14)

In [384]:
# Append the scraped rows below the TSV-derived rows and renumber the index.
# NOTE(review): "id" comes from different sources in the two tables (file-name
# stem vs. the scraped file's first column) -- confirm the values cannot collide.
merged_parts = [final_liarplus, final_politifact]
final_df = pd.concat(merged_parts, axis=0, ignore_index=True)
print(final_df.shape)

(13361, 14)


In [385]:
# Preview the first five rows of the combined table.
final_df.head()

Unnamed: 0,id,label,statement,speaker,job,true,mostly_true,half_true,barely_true,mostly_false,false,pants_on_fire,context,justification
0,2635,false,Says the Annies List political group supports ...,dwayne bohac,State representative,0.0,0.0,0.0,0.0,0.0,1.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,10540,half-true,When did the decline of coal start? It started...,scott surovell,State delegate,0.0,1.0,1.0,0.0,0.0,0.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2,324,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",barack obama,President,0.0,163.0,160.0,70.0,0.0,71.0,9.0,Denver,Obama said he would have voted against the ame...
3,1123,false,Health care reform legislation is likely to ma...,blog posting,,0.0,5.0,3.0,7.0,0.0,19.0,44.0,a news release,The release may have a point that Mikulskis co...
4,9028,half-true,The economic turnaround started at the end of ...,charlie crist,,0.0,19.0,20.0,15.0,0.0,9.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."


In [386]:
# Number of rows per label in the combined table (this column drives the stratified split).
final_df['label'].value_counts()

label
false          2874
half-true      2653
mostly-true    2482
barely-true    2156
true           2063
pants-fire     1127
full-flop         4
half-flip         2
Name: count, dtype: int64

In [387]:
# Hold out 20% of the rows; stratifying on the label keeps the class
# proportions the same in the training part and the held-out part.
train_df, temp_df = train_test_split(
    final_df,
    stratify=final_df["label"],
    test_size=0.2,
    shuffle=True,
    random_state=32,
)

In [388]:
# Cut the held-out rows in half: one half for validation, one for testing.
# This split is NOT stratified and uses a different seed than the first one.
# NOTE(review): stratifying here would likely fail for the rarest labels, which
# can be left with a single row in temp_df -- decide whether to filter them.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)


In [389]:
# Write each split to its own CSV file, without the DataFrame index column.
split_outputs = {
    "../data/train_set.csv": train_df,
    "../data/val_set.csv": val_df,
    "../data/test_set.csv": test_df,
}
for out_path, split in split_outputs.items():
    split.to_csv(out_path, index=False)