In [None]:
import sys
from pathlib import Path
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Ensure src modules are importable.
# `from src.utils import ...` looks for a top-level package named `src`, so the
# directory that CONTAINS `src` (the project root, one level above the
# notebook's working directory) must be on sys.path. Adding only the `src`
# directory itself would expose `utils`, not `src.utils`.
project_root = str(Path.cwd().parent.resolve())
src_path = str((Path.cwd().parent / 'src').resolve())
# `src_path` is still appended so flat imports of modules inside `src` keep working.
for import_root in (project_root, src_path):
    if import_root not in sys.path:
        sys.path.append(import_root)

from src.utils import basic_clean, preprocess_text

In [3]:
# Fix the random state up front so repeated runs give the same results.
# One shared seed value; it is also passed as random_state when splitting.
SEED = 42
np.random.seed(SEED)  # NumPy's legacy global generator
random.seed(SEED)     # Python's built-in `random` module

In [4]:
# Locate the raw dataset relative to the notebook's working directory
# (one level up, under data/raw) and load it into a DataFrame.
data_dir = Path(Path.cwd().parent, 'data', 'raw').resolve()
csv_path = Path(data_dir, 'WELFake_Dataset.csv')

news = pd.read_csv(filepath_or_buffer=csv_path)
# Preview the first few rows as the cell output.
news.head(n=3)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1


In [5]:
# Clean and dedupe rows
# `news` is rebuilt as one chain instead of being mutated in place, so this
# cell can be re-run safely: errors="ignore" stops the drop from raising
# KeyError once 'Unnamed: 0' is already gone, and the rename is a no-op when
# "label" has already become "is_fake".
news = (
    news
    .drop(columns="Unnamed: 0", errors="ignore")  # appears to be a row index saved into the CSV
    .rename(columns={"label": "is_fake"})
    .dropna()                  # drop rows with a missing value in any remaining column
    .drop_duplicates()         # drop rows that are exact duplicates
    .reset_index(drop=True)    # contiguous 0..n-1 index after filtering
)
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63121 entries, 0 to 63120
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    63121 non-null  object
 1   text     63121 non-null  object
 2   is_fake  63121 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [6]:
# Build the three text variants stored on `news`:
#   raw_text       - title and body joined with a single space
#   clean_text     - raw_text passed through basic_clean
#   processed_text - clean_text passed through preprocess_text
# basic_clean / preprocess_text come from src.utils; their exact behavior is
# not visible in this notebook.
combined = news["title"] + " " + news["text"]
cleaned = combined.apply(basic_clean)

news["raw_text"] = combined
news["clean_text"] = cleaned
news["processed_text"] = cleaned.apply(preprocess_text)
news.head(3)

Unnamed: 0,title,text,is_fake,raw_text,clean_text,processed_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,law enforcement on high alert following threat...,law enforcement high alert following threat co...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,unbelievable obama s attorney general says mos...,unbelievable obama attorney general say charlo...
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,"Bobby Jindal, raised Hindu, uses story of Chri...",bobby jindal raised hindu uses story of christ...,bobby jindal raised hindu us story christian c...


In [8]:
# Two-stage stratified split: hold out 30% of the rows first, then cut that
# hold-out in half to get validation and test sets. Both stages stratify on
# the label and reuse SEED so the partition is repeatable.
train, temp = train_test_split(
    news, test_size=0.30, random_state=SEED, stratify=news["is_fake"]
)
val, test = train_test_split(
    temp, test_size=0.50, random_state=SEED, stratify=temp["is_fake"]
)
print("Sizes:", train.shape, val.shape, test.shape)

Sizes: (44184, 6) (9468, 6) (9469, 6)


In [9]:
# Save splits to CSV
# The output directory is built the same way as the raw-data path (relative to
# the parent of the working directory) rather than from a bare "../" string.
processed_dir = (Path.cwd().parent / 'data' / 'processed').resolve()
# to_csv does not create missing directories, so make sure the target exists.
processed_dir.mkdir(parents=True, exist_ok=True)

# Only the text variants and the label are written; the separate
# title/text columns are left out of the exported files.
export_columns = ["raw_text", "clean_text", "processed_text", "is_fake"]
for split, name in [(train, "train"), (val, "val"), (test, "test")]:
    out = split[export_columns]
    out.to_csv(processed_dir / f"{name}.csv", index=False)

# Shapes are those of the in-memory splits (all columns), not of the exported files.
print("Sizes:", train.shape, val.shape, test.shape)

Sizes: (44184, 6) (9468, 6) (9469, 6)
