# Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
misinfo_data = pd.read_csv("misinfo_dataset.csv")
misinfo_data.head()

Unnamed: 0,id,news_url,title,tweet_ids,misinformation_type,label
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...,gossipcop,fake
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...,gossipcop,fake
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...,gossipcop,fake
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...,gossipcop,fake
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...,gossipcop,fake


In [3]:
misinfo_data.shape

(23196, 6)

In [4]:
misinfo_data.isnull().sum()

id                        0
news_url                330
title                     0
tweet_ids              1501
misinformation_type       0
label                     0
dtype: int64

Merge the dataset

In [20]:
import os

# Define the image folder path
image_folder = "new_datasets/compressed_deepfakes"  # Change this to your actual folder path

# Define dataset splits
splits = ["train","test", "validation"]

# Create a list to store image paths and labels
image_data = []

# Loop through each dataset split
for split in splits:
    for label in ["Real", "Fake"]:  # Assuming folders are named "Real" and "Fake"
        folder_path = os.path.join(image_folder, split, label)
        if os.path.exists(folder_path):  # Ensure folder exists
            for filename in os.listdir(folder_path):
                image_path = os.path.join(folder_path, filename)
                image_data.append({"image_path": image_path, "label": label.lower()})  # Convert to lowercase for consistency

# Convert the list into a DataFrame
image_df = pd.DataFrame(image_data)

# Load the CSV file
#csv_file = "misinfo_dataset.csv"  # Change this to your actual CSV file
csv_df = pd.read_csv("new_datasets/misinfo_dataset.csv")

# Ensure the label column is lowercase for consistency
csv_df["label"] = csv_df["label"].str.lower()

# Merge on the label column
merged_df = pd.merge(image_df, csv_df, on="label", how="inner")

# Save the merged dataset
merged_df.to_csv("merged_dataset.csv", index=False)

# Display the first few rows
print(merged_df.head())

                                          image_path label                id  \
0  new_datasets/compressed_deepfakes/train/Real/r...  real  gossipcop-882573   
1  new_datasets/compressed_deepfakes/train/Real/r...  real  gossipcop-875924   
2  new_datasets/compressed_deepfakes/train/Real/r...  real  gossipcop-894416   
3  new_datasets/compressed_deepfakes/train/Real/r...  real  gossipcop-857248   
4  new_datasets/compressed_deepfakes/train/Real/r...  real  gossipcop-884684   

                                            news_url  \
0  https://www.brides.com/story/teen-mom-jenelle-...   
1  https://www.dailymail.co.uk/tvshowbiz/article-...   
2        https://en.wikipedia.org/wiki/Quinn_Perkins   
3  https://www.refinery29.com/en-us/2018/03/19192...   
4  https://www.cnn.com/2017/10/04/entertainment/c...   

                                               title  \
0  Teen Mom Star Jenelle Evans' Wedding Dress Is ...   
1  Kylie Jenner refusing to discuss Tyga on Life ...   
2             

In [21]:
merged_df.shape

(25515600, 7)

In [24]:
merged_data = pd.read_csv("merged_dataset.csv")
merged_data.columns

Index(['image_path', 'label', 'id', 'news_url', 'title', 'tweet_ids',
       'misinformation_type'],
      dtype='object')

Data Downsampling

In [25]:
merged_data.shape

(25515600, 7)

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Set the sample size (e.g., 10% of original dataset)
sample_size = int(0.02 * len(merged_data))  # Adjust this fraction as needed

# Perform stratified sampling based on 'misinformation_type'
df_sampled, _ = train_test_split(merged_data, train_size=sample_size, stratify=merged_data['misinformation_type'], random_state=42)

# Save the reduced dataset
#df_sampled.to_csv("downsampled_dataset.csv", index=False)

print(f"Original dataset size: {len(merged_data)}")
print(f"Reduced dataset size: {len(df_sampled)}")


Original dataset size: 25515600
Reduced dataset size: 510312


In [27]:
df_sampled.head()

Unnamed: 0,image_path,label,id,news_url,title,tweet_ids,misinformation_type
3201869,new_datasets/compressed_deepfakes/train/Real/r...,real,gossipcop-903816,https://www.dailymail.co.uk/femail/article-527...,Timberland boots becoming trendy with Gigi Had...,949060933997023232\t949061822312845312\t949061...,gossipcop
8662448,new_datasets/compressed_deepfakes/train/Real/r...,real,gossipcop-922827,https://www.floor8.com/posts/8562-fans-accuse-...,Fans Accuse Jay-Z and Beyoncé of Copying Kim K...,977459151281557504,gossipcop
6253700,new_datasets/compressed_deepfakes/train/Real/r...,real,gossipcop-899993,https://okmagazine.com/photos/wags-la-wags-mia...,'WAGS LA' And 'WAGS Miami' Both Officially Can...,941404950726365184\t941405189889720320\t941405...,gossipcop
9737150,new_datasets/compressed_deepfakes/train/Fake/f...,fake,gossipcop-3271653405,www.bustle.com/p/who-is-hassan-jameel-rihannas...,Who Is Hassan Jameel? Rihanna's Mystery Man Is...,886023658271014912\t886023887804387329\t886025...,gossipcop
22131755,new_datasets/compressed_deepfakes/validation/R...,real,politifact13833,http://www.taxpolicycenter.org/sites/default/f...,Who Benefits from President Trump’s Child Care...,775842098402566144\t836687647473037316\t836719...,politifact


Clean the Dataset

In [28]:
df_sampled.isnull().sum()

image_path                 0
label                      0
id                         0
news_url                7229
title                      0
tweet_ids              32924
misinformation_type        0
dtype: int64

In [29]:
# drop rows with missing values
df_sampled.dropna(inplace=True)

In [30]:
df_sampled.isnull().sum()

image_path             0
label                  0
id                     0
news_url               0
title                  0
tweet_ids              0
misinformation_type    0
dtype: int64

In [31]:
df_sampled.shape

(471042, 7)

# EDA
## Root Cause Analysis for Misinformation

Root Cause Analysis (RCA) helps identify why misinformation is created. Here’s how you can approach it:

**Step 1: Categorize Misinformation**
- Your dataset already has a 'misinformation_type' column.

- Count occurrences of each type:


In [32]:
df_sampled['misinformation_type'].value_counts()


misinformation_type
gossipcop     454400
politifact     16642
Name: count, dtype: int64

The most common misinformation type is gossipcop with 454,400 rows.

**Step 2: Analyze the Sources of Misinformation**
- Check which news URLs appear most often:

In [33]:
df_sampled['news_url'].value_counts().head(10)

news_url
https://www.thewrap.com/this-is-us-everything-we-know-her-rebecca-future-randall-tess-beth-mother/                                     203
https://en.wikipedia.org/wiki/Wedding_of_Prince_Harry_and_Meghan_Markle                                                                185
https://www.usatoday.com/story/life/entertainthis/2018/02/06/kylie-jenner-named-her-baby-stormi-and-twitter-freaking-out/312861002/    179
https://en.wikipedia.org/wiki/Travis_Scott                                                                                             178
https://people.com/tv/why-nikki-bella-hid-her-first-marriage-from-john-cena-and-how-he-reacted-when-he-found-out/                      175
https://www.bbc.com/news/entertainment-arts-41594672                                                                                   159
https://en.wikipedia.org/wiki/2018_MTV_Movie_%26_TV_Awards                                                                             154
www.newidea.com.au


All of these websites are celebrity gossip websites

**Step 3: Keyword Analysis in Titles**

- Extract the most frequent words in misinformation titles.
- Removes stopwords using nltk.stopwords.words('english').
- Removes additional common words that might not add value.
- Finds the most frequent words while filtering out stopwords.


In [34]:
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already available
nltk.download('stopwords')

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Define additional common words to remove (optional)
common_words = {"news", "video", "report", "click", "link", "new", "breaking"}

# Function to extract most frequent words from a title
def extract_frequent_words(title, top_n=3):
    if pd.isna(title):
        return ""
    
    words = re.findall(r'\b\w+\b', title.lower())  # Tokenize words
    filtered_words = [word for word in words if word not in stop_words and word not in common_words]  # Remove stopwords
    word_counts = Counter(filtered_words)  # Count word frequencies
    most_common_words = [word for word, _ in word_counts.most_common(top_n)]  # Get top N frequent words
    
    return ', '.join(most_common_words)  # Convert list to a string

# Apply function to each row and create a new column
df_sampled['frequent_words'] = df_sampled['title'].apply(extract_frequent_words)

# Display the first few rows
print(df_sampled[['title', 'frequent_words']].head())


[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                      title  \
3201869   Timberland boots becoming trendy with Gigi Had...   
8662448   Fans Accuse Jay-Z and Beyoncé of Copying Kim K...   
6253700   'WAGS LA' And 'WAGS Miami' Both Officially Can...   
9737150   Who Is Hassan Jameel? Rihanna's Mystery Man Is...   
22131755  Who Benefits from President Trump’s Child Care...   

                       frequent_words  
3201869   timberland, boots, becoming  
8662448             fans, accuse, jay  
6253700               wags, la, miami  
9737150       mystery, hassan, jameel  
22131755   benefits, president, trump  


**Step 4: Compare Fake vs. Real News Patterns**

- Analyze differences in word usage between real and fake news:

In [35]:
fake_titles = df_sampled[df_sampled['label'] == 'fake']['frequent_words'].dropna()
real_titles = df_sampled[df_sampled['label'] == 'real']['frequent_words'].dropna()

fake_words = ' '.join(fake_titles).lower()
real_words = ' '.join(real_titles).lower()

fake_word_counts = Counter(re.findall(r'\b\w+\b', fake_words))
real_word_counts = Counter(re.findall(r'\b\w+\b', real_words))

print("Top Fake News Words:", fake_word_counts.most_common(20))
print("Top Real News Words:", real_word_counts.most_common(20))


Top Fake News Words: [('jennifer', 6884), ('kardashian', 6465), ('brad', 5542), ('jenner', 5315), ('pitt', 4559), ('kim', 4437), ('justin', 4079), ('angelina', 3881), ('selena', 3617), ('jolie', 3013), ('aniston', 2980), ('meghan', 2936), ('gomez', 2678), ('kylie', 2359), ('blake', 2347), ('trump', 2140), ('gwen', 2105), ('kanye', 2017), ('bieber', 1971), ('markle', 1878)]
Top Real News Words: [('kardashian', 9277), ('kim', 6081), ('says', 5689), ('prince', 5545), ('star', 5470), ('reveals', 5189), ('2018', 5145), ('jenner', 4873), ('meghan', 4863), ('season', 4815), ('jennifer', 4717), ('watch', 4466), ('kate', 4278), ('best', 3853), ('taylor', 3846), ('bachelor', 3417), ('harry', 3388), ('selena', 3359), ('shares', 3230), ('swift', 3220)]


**Step 5: Social Media Influence**
* Finds misinformation tweets.

* Counts how often each tweet ID appears.

* Displays the top 10 tweets spreading misinformation.



In [36]:
df_sampled['tweet_ids'].nunique()


21310

In [37]:
from collections import Counter

# Flatten the tweet_ids column (some tweets may have multiple IDs)
all_tweet_ids = df_sampled[df_sampled['label'] == 'fake']['tweet_ids'].dropna().str.split(',')
all_tweet_ids = [tweet_id.strip() for sublist in all_tweet_ids for tweet_id in sublist]  # Flatten list

# Count occurrences of each tweet ID
tweet_counts = Counter(all_tweet_ids)

# Get the top 10 tweets that spread the most misinformation
top_misinformation_tweets = tweet_counts.most_common(10)

# Display results
print("Top 10 Tweets Spreading the Most Misinformation (Fake News):")
for tweet, count in top_misinformation_tweets:
    print(f"Tweet ID: {tweet}, Count: {count}")


Top 10 Tweets Spreading the Most Misinformation (Fake News):
Tweet ID: 245855269077282817	251405156552425473	253244534341898241	259326121420652545	260841374696472576	260841372876165120	260884603496189954	260885797434183680	261503895623315457	261503899939246080	261503986518065152	261505911665864705	261507545208852480	261510712571092995	261511722932768769	261517416931147776	261519390103715840	261642538040037376	261738071396057089	261768185039175681	261890487043100672	262075386228248577	262645598963003392	262965981092134912	264461413110644736	264528144927694849	268885150471954432	269044189092786176	270633316493975552	270633712872476673	270730856971243520	276009732630786048	276788882249486337	277272378994749440	281110207386513409	281627076505792512	282008076230471680	282361978213761024	282742258489503745	285792120638406657	292729116669640705	294633879044759552	294633884082130944	294633889643761664	294774911879237633	299270877369991168	300427026022670336	300982549361090561	30128443731910656

In [38]:
from collections import Counter
import re

# Clean and tokenize titles
words = ' '.join(df_sampled['title'].dropna()).lower()
words = re.findall(r'\b\w+\b', words)

# Get top words
word_counts = Counter(words)
print(word_counts.most_common(20))  # Top 20 words


[('s', 130521), ('the', 127063), ('and', 101926), ('to', 97131), ('of', 65828), ('in', 64660), ('a', 62677), ('on', 53676), ('with', 51898), ('is', 49234), ('for', 48070), ('her', 47741), ('at', 31727), ('after', 28534), ('kardashian', 22482), ('new', 22062), ('about', 21607), ('she', 20908), ('from', 20666), ('his', 18454)]


Based on the root cause analysis above we can say that people create deepfakes and spread misinformation to gain attention using celebrity gossip for financial gain.

## Hypothesis Testing

### Explanation of the Statistical Test
1. Baseline Data (Before Detection)

- Counts the number of fake and real images.

- Calculates the proportion of fake images in the dataset.

2. Simulated Scenario (After Detection)

- Assumes a 50% reduction in fake images (based on hypothesis).

- Keeps the number of real images unchanged.

3. Two-Sample Proportion Z-Test

- Compares fake image proportions before and after detection.

- Null Hypothesis (𝐻0​): No difference in fake image proportions.

- Alternative Hypothesis (𝐻𝐴): The proportion of fake images decreases significantly.

4. Statistical Significance

- If p-value < 0.05, we reject 𝐻0 (supporting the hypothesis).

- If p-value > 0.05, we fail to reject 𝐻0 (no strong evidence of reduction).

In [39]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

# Load dataset (Assume df_sampled is your DataFrame)
df = df_sampled.copy()

# Count real and fake images before detection
fake_count_before = (df['label'] == 'fake').sum()
real_count_before = (df['label'] == 'real').sum()
total_before = fake_count_before + real_count_before

# Simulate the effect of deepfake detection (reduce fake images by 50%)
fake_count_after = fake_count_before / 2  # Assume a 50% reduction
real_count_after = real_count_before  # Real images remain the same
total_after = fake_count_after + real_count_after

# Perform a Two-Sample Proportion Z-Test
count = np.array([fake_count_before, fake_count_after])  # Fake counts before and after
nobs = np.array([total_before, total_after])  # Total images before and after
z_stat, p_value = proportions_ztest(count, nobs)

# Print results
print(f"📌 Fake Images Before Detection: {fake_count_before}")
print(f"📌 Fake Images After Detection (Simulated): {fake_count_after}")
print(f"📌 Z-Statistic: {z_stat:.4f}")
print(f"📌 P-Value: {p_value:.4f}")

# Interpretation
alpha = 0.05  # 5% significance level
if p_value < alpha:
    print("✅ The reduction in fake images is statistically significant. This supports the hypothesis.")
else:
    print("❌ No significant evidence that deepfake detection reduces fake images.")


📌 Fake Images Before Detection: 116342
📌 Fake Images After Detection (Simulated): 58171.0
📌 Z-Statistic: 125.0190
📌 P-Value: 0.0000
✅ The reduction in fake images is statistically significant. This supports the hypothesis.
