In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
csv_path = r'H:\\AIML_project\\Datasets\\FakeNewsCorpus\\news.csv\\news_cleaned_2018_02_13.csv'  # your path
usecols = ['title', 'content', 'type']
# Fast sample peek
df_sample = pd.read_csv(csv_path, nrows=20000, usecols=usecols, low_memory=False, on_bad_lines='skip')

df_sample.head(10)

Unnamed: 0,type,content,title
0,rumor,"Life is an illusion, at least on a quantum lev...",Is life an ILLUSION? Researchers prove 'realit...
1,hate,"Unfortunately, he hasn’t yet attacked her for ...",Donald Trump
2,hate,The Los Angeles Police Department has been den...,Donald Trump
3,hate,The White House has decided to quietly withdra...,"MORE WINNING! Israeli intelligence source, DEB..."
4,hate,“The time has come to cut off the tongues of t...,"“Oh, Trump, you coward, you just wait, we will..."
5,hate,The Central American nation and six other stat...,Following Guatemala’s decision to move its emb...
6,unreliable,UN-Backed Police Massacred Haitians With Impun...,UN-Backed Police Massacred Haitians With Impunity
7,unreliable,It should have come as no surprise that the sa...,Black Agenda Report
8,unreliable,"“When the police finally left the campus, arou...",Black Agenda Report
9,unreliable,Zambia Must Clarify Whether It Will Host Israe...,Zambia Must Clarify Whether It Will Host Israe...


In [3]:
df_sample['title'][0]

"Is life an ILLUSION? Researchers prove 'reality doesn't exist if you're not looking at it'"

In [4]:
df_sample['content'][0]

'Life is an illusion, at least on a quantum level, in a theory which has recently been confirmed by a set of researchers.\n\nThey finally have the means to test John Wheeler’s delayed-choice theory and concluded that the physicist was right.\n\nIn 1978, Mr Wheeler’s proposed experiment involved a moving object that was given the choice to act like a wave or a particle – the former acting as a vibration with a frequency that can distinguish it from other waves and the latter having no frequency that you can determine its position in space, unlike a wave – and at what point does it ‘decide’ to act like one or the other.\n\nAt the time, the technology was not available to conduct a strong experiment, but scientists have now been able to carry it out.'

In [5]:
import csv
import sys
# Increase CSV field size limit (critical fix!)
csv.field_size_limit(10_000_000)  # 10 MB per field - adjust higher if still fails
# Chunk size - safe for 32 GB RAM
chunk_size = 150000
# Classes to keep from FakeNewsCorpus
fake_types = ['fake', 'satire']      # label = 1
real_types = ['reliable']            # label = 0

# Temporary files (will be deleted later)
temp_fake_corpus = r'H:\\AIML_project\\temp_fake_corpus.csv'
temp_real_corpus = r'H:\\AIML_project\\temp_real_corpus.csv'

In [6]:
# ────────────────────────────────────────────────
# PART 1: Filter FakeNewsCorpus in chunks
# ────────────────────────────────────────────────

print("Filtering FakeNewsCorpus...")

fake_chunks = []
real_chunks = []

for chunk in tqdm(pd.read_csv(csv_path,
                              chunksize=chunk_size,
                              usecols=['title', 'content', 'type'],
                              
                              on_bad_lines='skip',
                              engine='python')):

    # Combine title + content (same as you did for ISOT/WELFake)
    chunk['text'] = (chunk['title'].fillna('') + " " + chunk['content'].fillna('')).str.strip()
    chunk['text'] = chunk['text'].str[:3500]  # reasonable cap

    # Fake / Satire
    fake = chunk[chunk['type'].str.lower().isin(fake_types)].copy()
    if not fake.empty:
        fake['label'] = 1
        fake_chunks.append(fake[['text', 'label']])

    # Reliable
    real = chunk[chunk['type'].str.lower().isin(real_types)].copy()
    if not real.empty:
        real['label'] = 0
        real_chunks.append(real[['text', 'label']])

# Save filtered parts (to avoid RAM explosion)
pd.concat(fake_chunks, ignore_index=True).to_csv(temp_fake_corpus, index=False)
pd.concat(real_chunks, ignore_index=True).to_csv(temp_real_corpus, index=False)

print(f"FakeNewsCorpus filtered:")
print(f"  Fake/Satire rows: {len(fake_chunks) * chunk_size:,} (approx)")
print(f"  Reliable rows:    {len(real_chunks) * chunk_size:,} (approx)")

Filtering FakeNewsCorpus...


57it [04:50,  5.09s/it]


FakeNewsCorpus filtered:
  Fake/Satire rows: 6,750,000 (approx)
  Reliable rows:    8,100,000 (approx)


In [7]:
# ────────────────────────────────────────────────
# PART 2: Load & Prepare ISOT + WELFake
# ────────────────────────────────────────────────

# ISOT (your labels are already correct)
print("Loading ISOT...")
isot_fake = pd.read_csv('https://media.githubusercontent.com/media/Gyaanendra/SML-Project-cset211/refs/heads/main/raw_data/Fake.csv')
isot_fake['label'] = 1  # fake = 1

isot_true = pd.read_csv('https://media.githubusercontent.com/media/Gyaanendra/SML-Project-cset211/refs/heads/main/raw_data/True.csv')
isot_true['label'] = 0  # real = 0

isot = pd.concat([isot_fake[['title','text','label']], isot_true[['title','text','label']]], ignore_index=True)

# WELFake - FIX labels
print("\nLoading & fixing WELFake...")
wel = pd.read_csv('https://media.githubusercontent.com/media/Gyaanendra/SML-Project-cset211/refs/heads/main/raw_data/WELFake_Dataset.csv')
wel = wel.drop(columns=['Unnamed: 0'], errors='ignore')
wel['label'] = wel['label'].map({0: 0, 1: 1})   # 0→real=0, 1→fake=1 (now matches ISOT convention)

wel['text'] = (wel['title'].fillna('') + " " + wel['text'].fillna('')).str.strip()

# Combine ISOT + WELFake
small_combined = pd.concat([isot[['text','label']], wel[['text','label']]], ignore_index=True)
small_combined = small_combined.drop_duplicates(subset='text').dropna(subset=['text','label'])

small_combined.to_csv(r'H:\\AIML_project\\Datasets\\kaggle\\isot_welfake_correct_labels.csv', index=False)

print("\nSmall combined (ISOT + WELFake):")
print(small_combined.shape)
print(small_combined['label'].value_counts(normalize=True).round(3)*100)

Loading ISOT...

Loading & fixing WELFake...

Small combined (ISOT + WELFake):
(102322, 2)
label
0    54.7
1    45.3
Name: proportion, dtype: float64


In [8]:
# ────────────────────────────────────────────────
# PART 3: Final Merge
# ────────────────────────────────────────────────
import os
print("\nMerging all...")
existing = pd.read_csv("H:\\AIML_project\\Datasets\\kaggle\\isot_welfake_correct_labels.csv")
fnc_fake = pd.read_csv(temp_fake_corpus)
fnc_real = pd.read_csv(temp_real_corpus)
fnc = pd.concat([fnc_fake, fnc_real], ignore_index=True)

final_df = pd.concat([existing, fnc], ignore_index=True)

# Clean up
final_df = final_df.drop_duplicates(subset='text', keep='first')
final_df = final_df.dropna(subset=['text', 'label'])
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

output_path = r'H:\\AIML_project\\Datasets\\meraged\\final_combined_corpus.csv'
final_df.to_csv(output_path, index=False)

print("\nSuccess!")
print(f"Final CSV: {output_path}")
print(f"Total rows: {len(final_df):,}")
print("Label distribution:")
print(final_df['label'].value_counts(normalize=True).round(3) * 100)

# Cleanup temp files (optional)
os.remove(temp_fake_corpus)
os.remove(temp_real_corpus)
print("Temp files removed.")


Merging all...

Success!
Final CSV: H:\\AIML_project\\Datasets\\meraged\\final_combined_corpus.csv
Total rows: 2,930,584
Label distribution:
label
0    65.1
1    34.9
Name: proportion, dtype: float64
Temp files removed.
