In [1]:
import pandas as pd
# Load the datasets
df_fake = pd.read_csv("data/raw/Fake.csv")
df_real = pd.read_csv("data/raw/True.csv")

# Add labels
df_fake['label'] = 'fake'
df_real['label'] = 'real'

# Preview
print("Fake news sample:")
display(df_fake.head())

print("Real news sample:")
display(df_real.head())


Fake news sample:


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",fake


Real news sample:


Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",real
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",real
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",real
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",real
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",real


In [32]:
# Combine datasets
df = pd.concat([df_fake, df_real], ignore_index=True)

# Combine title and text into one column
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')
df['full_text'] = df['title'] + ' ' + df['text']

# Keep only relevant columns
df_final = df[['full_text', 'label']]
df_final.columns = ['text', 'label']

# Add source column
df_final['source'] = 'kaggle'

# Shuffle the data
data_1 = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview
data_1.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['source'] = 'kaggle'


Unnamed: 0,text,label,source
0,Ben Stein Calls Out 9th Circuit Court: Committ...,fake,kaggle
1,Trump drops Steve Bannon from National Securit...,real,kaggle
2,Puerto Rico expects U.S. to lift Jones Act shi...,real,kaggle
3,OOPS: Trump Just Accidentally Confirmed He Le...,fake,kaggle
4,Donald Trump heads for Scotland to reopen a go...,real,kaggle


In [33]:
print(df_real['subject'].unique())
df_fake['subject'].unique()


['politicsNews' 'worldnews']


array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [2]:
# Combine datasets
df = pd.concat([df_fake, df_real], ignore_index=True)

# Combine title and text into one column
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')
df['full_text'] = df['title'] + ' ' + df['text']

# Keep only relevant columns
df_final = df[['full_text', 'subject']]
df_final.columns = ['text', 'subject']

# Add source column
df_final['source'] = 'kaggle'

# Shuffle the data
data_2 = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview
data_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['source'] = 'kaggle'


Unnamed: 0,text,subject,source
0,Ben Stein Calls Out 9th Circuit Court: Committ...,US_News,kaggle
1,Trump drops Steve Bannon from National Securit...,politicsNews,kaggle
2,Puerto Rico expects U.S. to lift Jones Act shi...,politicsNews,kaggle
3,OOPS: Trump Just Accidentally Confirmed He Le...,News,kaggle
4,Donald Trump heads for Scotland to reopen a go...,politicsNews,kaggle


In [34]:
# Save to processed folder
data_1.to_csv("data/processed/kaggle_clean.csv", index=False)
print("Cleaned Kaggle dataset saved successfully.")



Cleaned Kaggle dataset saved successfully.


In [6]:
# Save to processed folder
data_2.to_csv("data/processed/kaggle_clean.csv_2", index=False)
print("Cleaned Kaggle dataset saved successfully.")
data_2.info()

Cleaned Kaggle dataset saved successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     44898 non-null  object
 1   subject  44898 non-null  object
 2   source   44898 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [27]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   label   44898 non-null  object
 2   source  44898 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [28]:
!pip install newspaper3k




In [29]:
!pip install lxml_html_clean




In [30]:
from newspaper import Article


In [1]:
import pandas as pd

# Load first 200 rows for testing (adjust for full later)
df = pd.read_csv("data/raw/gossipcop_real.csv")

# --- Clean malformed or blank URLs ---
def clean_url(url):
    if pd.isna(url) or not isinstance(url, str) or len(url.strip()) < 10:
        return None
    url = url.strip()
    if not url.startswith("http"):
        url = "https://" + url.lstrip(":/")
    return url

df['clean_url'] = df['news_url'].apply(clean_url)
df = df.dropna(subset=['clean_url'])

# --- Filter Out Non-Responsive or Low-Value Domains ---
blocklist = [
    'wikipedia.org', 'quora.com', 'msn.com', 'kisscasper.com', 'dadli.mobi',
    'longroom.com', 'celebrityinsider.org', 'gossipbucket.com', 'newmediasearch.com',
    'statista.com', 'refinery29.com/en-us/2017', 'trueara.com', 'pennews.pencidesign.com',
    'medium.com/@AndreAguirre25111NTZ', 'article.wn.com/view/2017', 'storiesflow.com',
    'thisisinsider.com', 'nickiswift.com', 'longroom.com', 'cmch.tv', 'statista.com',
    'acriticalreviewofthehelp.wordpress.com'
]

# Remove URLs containing any blocked domain
pattern = '|'.join(blocklist)
df = df[~df['clean_url'].str.contains(pattern, na=False)]

# Save for next stage
df.to_csv("data/processed/gossipcop_urls_step1_cleaned.csv", index=False)
print(f"✅ Step 1 done. {len(df)} rows survived → gossipcop_urls_step1_cleaned.csv")


✅ Step 1 done. 15001 rows survived → gossipcop_urls_step1_cleaned.csv


In [2]:
import pandas as pd
from newspaper import Article
from tqdm import tqdm

# Load filtered URLs
df = pd.read_csv("data/processed/gossipcop_urls_step1_cleaned.csv").head(5000)
df = df.copy()

# Extract full article text
def extract_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text.strip()
    except:
        return None

tqdm.pandas(desc="Extracting text")
df['text'] = df['clean_url'].progress_apply(extract_text)

# Filter out rows with very short or empty text
df = df[df['text'].str.len() >= 100]

# Save final file
df.to_csv("data/processed/gossipcop_final_cleaned_real.csv", index=False)
print(f"✅ Step 2 done. {len(df)} rows saved to gossipcop_final_cleaned_real.csv")


Extracting text: 100%|███████████████████████████████████████████████████████████| 5000/5000 [1:47:59<00:00,  1.30s/it]

✅ Step 2 done. 1567 rows saved to gossipcop_final_cleaned_real.csv





In [3]:
import pandas as pd
from newspaper import Article
from tqdm import tqdm

# Load filtered URLs
df = pd.read_csv("data/processed/gossipcop_urls_step1_cleaned.csv").iloc[5000:10000].copy()
df = df.copy()

# Extract full article text
def extract_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text.strip()
    except:
        return None

tqdm.pandas(desc="Extracting text")
df['text'] = df['clean_url'].progress_apply(extract_text)

# Filter out rows with very short or empty text
df = df[df['text'].str.len() >= 100]

# Save final file
df.to_csv("data/processed/gossipcop_final_cleaned_real_1.csv", index=False)
print(f"✅ Step 2 done. {len(df)} rows saved to gossipcop_final_cleaned_real_2.csv")


Extracting text: 100%|███████████████████████████████████████████████████████████| 5000/5000 [4:07:04<00:00,  2.96s/it]


✅ Step 2 done. 3302 rows saved to gossipcop_final_cleaned_real_2.csv


In [5]:
import pandas as pd
from newspaper import Article
from tqdm import tqdm

# Load filtered URLs
df = pd.read_csv("data/processed/gossipcop_urls_step1_cleaned.csv").iloc[10000:15001].copy()
df = df.copy()

# Extract full article text
def extract_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text.strip()
    except:
        return None

tqdm.pandas(desc="Extracting text")
df['text'] = df['clean_url'].progress_apply(extract_text)

# Filter out rows with very short or empty text
df = df[df['text'].str.len() >= 100]

# Save final file
df.to_csv("data/processed/gossipcop_final_cleaned_real_2.csv", index=False)
print(f"✅ Step 2 done. {len(df)} rows saved to gossipcop_final_cleaned_real_2.csv")


Extracting text:  46%|██████████████████████████▍                              | 2324/5001 [2:13:54<2:24:49,  3.25s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Extracting text: 100%|███████████████████████████████████████████████████████████| 5001/5001 [4:24:13<00:00,  3.17s/it]


✅ Step 2 done. 3214 rows saved to gossipcop_final_cleaned_real_2.csv


In [None]:
from newspaper import Article
import pandas as pd
import time

# Load full dataset
df = pd.read_csv("data/raw/gossipcop_fake.csv")

# Step 1: Clean and standardize URLs
def clean_url(url):
    if pd.isna(url) or not isinstance(url, str):
        return None
    url = url.strip()
    if not url.startswith("http"):
        url = "https://" + url.lstrip(":/")
    return url

df['clean_url'] = df['news_url'].apply(clean_url)
blacklist = ['yournewswire.com', 'en.wikipedia.org', 'msn.com']
df = df.dropna(subset=['clean_url'])
df = df[~df['clean_url'].str.contains('|'.join(blacklist), na=False)]

# Step 2: Extract meaningful text
def extract_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.strip()
        if len(text) < 200 or text.lower().startswith(("updated", "click here")):
            return None
        return text
    except:
        return None

df['text'] = df['clean_url'].apply(lambda url: extract_text(url))
df = df.dropna(subset=['text'])
df = df.drop_duplicates(subset='title')

# Step 3: Save final cleaned dataset
df.to_csv("data/processed/fakenewsnet_gossipcop_cleaned.csv", index=False)
print(f"✅ Done. Saved: {len(df)} rows → fakenewsnet_gossipcop_cleaned.csv")


In [3]:
import pandas as pd

df_1 = pd.read_csv("data/processed/gossipcop_final_cleaned_real.csv", encoding='latin1')
df_2 = pd.read_csv("data/processed/gossipcop_final_cleaned_real_1.csv", encoding='latin1')
df_3 = pd.read_csv("data/processed/gossipcop_final_cleaned_real_2.csv", encoding='latin1')
df_0 = pd.read_csv("data/processed/fakenewsnet_gossipcop_cleaned.csv", encoding='latin1')


# Add labels
df_1["label"] = "real"
df_2["label"] = "real"
df_3["label"] = "real"
df_0["label"] = "fake"

# Combine all
df_final = pd.concat([df_1, df_2, df_3, df_0], ignore_index=True)

# Shuffle the rows
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the final file
df_final.to_csv("data/processed/gossipcop_final.csv", index=False)

print(f"✅ Combined and saved. Total rows: {len(df_final)}")



✅ Combined and saved. Total rows: 10741


In [4]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10741 entries, 0 to 10740
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         10741 non-null  object
 1   news_url   10741 non-null  object
 2   title      10740 non-null  object
 3   tweet_ids  10206 non-null  object
 4   clean_url  10738 non-null  object
 5   text       10738 non-null  object
 6   label      10741 non-null  object
dtypes: object(7)
memory usage: 587.5+ KB


In [13]:
import pandas as pd

# Load the file
df = pd.read_csv("data/processed/gossipcop_final.csv", encoding="latin1")

# Drop unnecessary columns
df = df.drop(columns=["id", "news_url", "tweet_ids", "clean_url"])

# Drop rows where 'text' is null
df = df.dropna(subset=["text"])

# Add subject column
df["subject"] = "entertainment"

# Save the cleaned file
df.to_csv("data/processed/gossipcop_final_labeled.csv", index=False)

print(f"✅ Cleaned and saved. Final row count: {len(df)}")


df.info()


✅ Cleaned and saved. Final row count: 10738
<class 'pandas.core.frame.DataFrame'>
Index: 10738 entries, 0 to 10740
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    10738 non-null  object
 1   text     10738 non-null  object
 2   label    10738 non-null  object
 3   subject  10738 non-null  object
dtypes: object(4)
memory usage: 419.5+ KB


In [4]:
import pandas as pd

# Load dataset
data_1 = pd.read_csv("data/processed/clean_useabel_file_ gossipcop_ lebled.csv")

# Full cleanup
data_1 = data_1.dropna()
data_1 = data_1[data_1['text'].str.len() >= 100]
data_1 = data_1.drop_duplicates(subset='text')

# Fix index
data_1 = data_1.reset_index(drop=True)

print(f"✅ Cleaned and saved. Final row count: {len(data_1)}")
data_1.info()


✅ Cleaned and saved. Final row count: 9879
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9879 entries, 0 to 9878
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    9879 non-null   object
 1   text     9879 non-null   object
 2   label    9879 non-null   object
 3   subject  9879 non-null   object
dtypes: object(4)
memory usage: 308.8+ KB


In [37]:
# Combine title and text into one column
data_1["text"] = data_1["title"].astype(str) + ". " + data_1["text"].astype(str)

# Drop the original title and subject columns
data_1.drop(columns=["title", "subject"], inplace=True)

# Add a source column
data_1["source"] = "gossipcop"

# Save the cleaned and formatted version
output_path = "data/processed/gossipcop_formatted_for_model.csv"
data_1.to_csv(output_path, index=False)

# Display final structure and count
data_1.info(), output_path


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9879 entries, 0 to 9878
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9879 non-null   object
 1   label   9879 non-null   object
 2   source  9879 non-null   object
dtypes: object(3)
memory usage: 231.7+ KB


(None, 'data/processed/gossipcop_formatted_for_model.csv')

In [5]:
# Combine title and text into one column
data_1["text"] = data_1["title"].astype(str) + ". " + data_1["text"].astype(str)

# Drop the original title and subject columns
data_1.drop(columns=["title", "label"], inplace=True)

# Add a source column
data_1["source"] = "gossipcop"

# Save the cleaned and formatted version
output_path = "data/processed/gossipcop_formatted_for_model.csv_2"
data_1.to_csv(output_path, index=False)

# Display final structure and count
data_1.info(), output_path

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9879 entries, 0 to 9878
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     9879 non-null   object
 1   subject  9879 non-null   object
 2   source   9879 non-null   object
dtypes: object(3)
memory usage: 231.7+ KB


(None, 'data/processed/gossipcop_formatted_for_model.csv_2')

In [38]:
import pandas as pd

# Load both datasets
data_1 = pd.read_csv("data/processed/gossipcop_formatted_for_model.csv")
data_2 = pd.read_csv("data/processed/kaggle_clean.csv")

# Combine
combined = pd.concat([data_1, data_2], ignore_index=True)

# Optional: shuffle
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
combined.to_csv("data/processed/combined_fake_news_dataset.csv", index=False)

print(f"✅ Combined dataset saved with {len(combined)} rows.")


✅ Combined dataset saved with 54777 rows.


In [39]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54777 entries, 0 to 54776
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    54777 non-null  object
 1   label   54777 non-null  object
 2   source  54777 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [7]:
import pandas as pd

# Load both datasets
data_1 = pd.read_csv("data/processed/gossipcop_formatted_for_model.csv_2")
data_2 = pd.read_csv("data/processed/kaggle_clean.csv_2")

# Combine
combined = pd.concat([data_1, data_2], ignore_index=True)

# Optional: shuffle
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
combined.to_csv("data/processed/combined_news_type_dataset.csv", index=False)

print(f"✅ Combined dataset saved with {len(combined)} rows.")


✅ Combined dataset saved with 54777 rows.


In [8]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54777 entries, 0 to 54776
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     54777 non-null  object
 1   subject  54777 non-null  object
 2   source   54777 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB
