# Data Preparation

In [2]:
import pandas as pd
import numpy as np

In [3]:
misinfo_data = pd.read_csv("misinfo_dataset.csv")
misinfo_data.head()

Unnamed: 0,id,news_url,title,tweet_ids,misinformation_type,label
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...,gossipcop,fake
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...,gossipcop,fake
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...,gossipcop,fake
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...,gossipcop,fake
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...,gossipcop,fake


In [4]:
misinfo_data.shape

(23196, 6)

In [5]:
misinfo_data.isnull().sum()

id                        0
news_url                330
title                     0
tweet_ids              1501
misinformation_type       0
label                     0
dtype: int64

Merge the dataset

In [6]:
import os

# Define the image folder path
image_folder = "compressed_deepfakes"  # Change this to your actual folder path

# Define dataset splits
splits = ["train","test", "validation"]

# Create a list to store image paths and labels
image_data = []

# Loop through each dataset split
for split in splits:
    for label in ["Real", "Fake"]:  # Assuming folders are named "Real" and "Fake"
        folder_path = os.path.join(image_folder, split, label)
        if os.path.exists(folder_path):  # Ensure folder exists
            for filename in os.listdir(folder_path):
                image_path = os.path.join(folder_path, filename)
                image_data.append({"image_path": image_path, "label": label.lower()})  # Convert to lowercase for consistency

# Convert the list into a DataFrame
image_df = pd.DataFrame(image_data)

# Load the CSV file
#csv_file = "misinfo_dataset.csv"  # Change this to your actual CSV file
csv_df = pd.read_csv("misinfo_dataset.csv")

# Ensure the label column is lowercase for consistency
csv_df["label"] = csv_df["label"].str.lower()

# Merge on the label column
merged_df = pd.merge(image_df, csv_df, on="label", how="inner")

# Save the merged dataset
#merged_df.to_csv("merged_dataset.csv", index=False)

# Display the first few rows
print(merged_df.head())

                                       image_path label                id  \
0  compressed_deepfakes/train/Real/real_10042.jpg  real  gossipcop-882573   
1  compressed_deepfakes/train/Real/real_10042.jpg  real  gossipcop-875924   
2  compressed_deepfakes/train/Real/real_10042.jpg  real  gossipcop-894416   
3  compressed_deepfakes/train/Real/real_10042.jpg  real  gossipcop-857248   
4  compressed_deepfakes/train/Real/real_10042.jpg  real  gossipcop-884684   

                                            news_url  \
0  https://www.brides.com/story/teen-mom-jenelle-...   
1  https://www.dailymail.co.uk/tvshowbiz/article-...   
2        https://en.wikipedia.org/wiki/Quinn_Perkins   
3  https://www.refinery29.com/en-us/2018/03/19192...   
4  https://www.cnn.com/2017/10/04/entertainment/c...   

                                               title  \
0  Teen Mom Star Jenelle Evans' Wedding Dress Is ...   
1  Kylie Jenner refusing to discuss Tyga on Life ...   
2                               

In [7]:
merged_df.shape

(25515600, 7)

Data Downsampling

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Set the sample size (e.g., 10% of original dataset)
sample_size = int(0.02 * len(merged_df))  # Adjust this fraction as needed

# Perform stratified sampling based on 'misinformation_type'
df_sampled, _ = train_test_split(merged_df, train_size=sample_size, stratify=merged_df['misinformation_type'], random_state=42)

# Save the reduced dataset
#df_sampled.to_csv("downsampled_dataset.csv", index=False)

print(f"Original dataset size: {len(merged_df)}")
print(f"Reduced dataset size: {len(df_sampled)}")


Original dataset size: 25515600
Reduced dataset size: 510312


In [9]:
df_sampled.head()

Unnamed: 0,image_path,label,id,news_url,title,tweet_ids,misinformation_type
3201869,compressed_deepfakes/train/Real/real_10340.jpg,real,gossipcop-903816,https://www.dailymail.co.uk/femail/article-527...,Timberland boots becoming trendy with Gigi Had...,949060933997023232\t949061822312845312\t949061...,gossipcop
8662448,compressed_deepfakes/train/Real/real_10259.jpg,real,gossipcop-922827,https://www.floor8.com/posts/8562-fans-accuse-...,Fans Accuse Jay-Z and Beyoncé of Copying Kim K...,977459151281557504,gossipcop
6253700,compressed_deepfakes/train/Real/real_10359.jpg,real,gossipcop-899993,https://okmagazine.com/photos/wags-la-wags-mia...,'WAGS LA' And 'WAGS Miami' Both Officially Can...,941404950726365184\t941405189889720320\t941405...,gossipcop
9737150,compressed_deepfakes/train/Fake/fake_10425.jpg,fake,gossipcop-3271653405,www.bustle.com/p/who-is-hassan-jameel-rihannas...,Who Is Hassan Jameel? Rihanna's Mystery Man Is...,886023658271014912\t886023887804387329\t886025...,gossipcop
22131755,compressed_deepfakes/validation/Real/real_1070...,real,politifact13833,http://www.taxpolicycenter.org/sites/default/f...,Who Benefits from President Trump’s Child Care...,775842098402566144\t836687647473037316\t836719...,politifact


Clean the Dataset

In [10]:
df_sampled.isnull().sum()

image_path                 0
label                      0
id                         0
news_url                7229
title                      0
tweet_ids              32924
misinformation_type        0
dtype: int64

In [11]:
# drop rows with missing values
df_sampled.dropna(inplace=True)

In [12]:
df_sampled.isnull().sum()

image_path             0
label                  0
id                     0
news_url               0
title                  0
tweet_ids              0
misinformation_type    0
dtype: int64

In [13]:
df_sampled.shape

(471042, 7)

# EDA
## Root Cause Analysis for Misinformation

Root Cause Analysis (RCA) helps identify why misinformation is created. Here’s how you can approach it:

**Step 1: Categorize Misinformation**
- Your dataset already has a 'misinformation_type' column.

- Count occurrences of each type:


In [14]:
df_sampled['misinformation_type'].value_counts()


misinformation_type
gossipcop     454400
politifact     16642
Name: count, dtype: int64

The most common misinformation type is gossipcop with 454,400 rows.

**Step 2: Analyze the Sources of Misinformation**
- Check which fake news URLs appear most often and how many times it appears:

In [15]:
# Filter rows where the 'label' column indicates "fake"
fake_news_df = df_sampled[df_sampled['label'] == 'fake']

# Count the frequency of each unique URL
fake_news_counts = fake_news_df['news_url'].value_counts().head(10)

# Filter the URLs with value counts greater than 1 (or whatever threshold you want)
fake_news_with_high_count = fake_news_counts[fake_news_counts > 1]

# Display the fake news URLs with high counts
print(fake_news_with_high_count)



news_url
www.newidea.com.au/jennifer-aniston-and-brad-pitt-busted-in-aspen                                                150
www.etonline.com/gwen-stefani-still-smitten-blake-shelton-and-has-never-once-tried-change-him-exclusive-94631    134
www.intouchweekly.com/posts/alex-rodriguez-cheated-on-jennifer-lopez-153623                                      128
hollywoodlife.com/2018/02/12/jamie-foxx-katie-holmes-baby-pregnant-want-to-start-family/                         122
www.etonline.com/biggest-celebrity-feuds-2017-92730                                                              122
en.wikipedia.org/wiki/Julia_Roberts                                                                              121
www.thecut.com/2018/08/brad-pitt-angelina-jolie-divorce-latest-news.html                                         117
hollywoodlife.com/2018/06/06/are-robert-pattinson-kristen-stewart-back-together/                                 113
www.newidea.com.au/meghan-markle-prince-harry-twins-con

All of these website pages are about celebrity gossip 

**Step 3: Keyword Analysis in Titles**

- Extract the most frequent words in misinformation titles.
- Removes stopwords using nltk.stopwords.words('english').
- Removes additional common words that might not add value.
- Finds the most frequent words while filtering out stopwords.


In [16]:
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already available
nltk.download('stopwords')

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Define additional common words to remove (optional)
common_words = {"news", "video", "report", "click", "link", "new", "breaking"}

# Function to extract most frequent words from a title
def extract_frequent_words(title, top_n=3):
    if pd.isna(title):
        return ""
    
    words = re.findall(r'\b\w+\b', title.lower())  # Tokenize words
    filtered_words = [word for word in words if word not in stop_words and word not in common_words]  # Remove stopwords
    word_counts = Counter(filtered_words)  # Count word frequencies
    most_common_words = [word for word, _ in word_counts.most_common(top_n)]  # Get top N frequent words
    
    return ', '.join(most_common_words)  # Convert list to a string

# Apply function to each row and create a new column
df_sampled['frequent_words'] = df_sampled['title'].apply(extract_frequent_words)

# Display the first few rows
print(df_sampled[['title', 'frequent_words']].head())


[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                      title  \
3201869   Timberland boots becoming trendy with Gigi Had...   
8662448   Fans Accuse Jay-Z and Beyoncé of Copying Kim K...   
6253700   'WAGS LA' And 'WAGS Miami' Both Officially Can...   
9737150   Who Is Hassan Jameel? Rihanna's Mystery Man Is...   
22131755  Who Benefits from President Trump’s Child Care...   

                       frequent_words  
3201869   timberland, boots, becoming  
8662448             fans, accuse, jay  
6253700               wags, la, miami  
9737150       mystery, hassan, jameel  
22131755   benefits, president, trump  


**Step 4: Compare Fake vs. Real News Patterns**

- Analyze differences in word usage between real and fake news:

In [17]:
fake_titles = df_sampled[df_sampled['label'] == 'fake']['frequent_words'].dropna()
real_titles = df_sampled[df_sampled['label'] == 'real']['frequent_words'].dropna()

fake_words = ' '.join(fake_titles).lower()
real_words = ' '.join(real_titles).lower()

fake_word_counts = Counter(re.findall(r'\b\w+\b', fake_words))
real_word_counts = Counter(re.findall(r'\b\w+\b', real_words))

print("Top Fake News Words:", fake_word_counts.most_common(20))
print("Top Real News Words:", real_word_counts.most_common(20))


Top Fake News Words: [('jennifer', 6884), ('kardashian', 6465), ('brad', 5542), ('jenner', 5315), ('pitt', 4559), ('kim', 4437), ('justin', 4079), ('angelina', 3881), ('selena', 3617), ('jolie', 3013), ('aniston', 2980), ('meghan', 2936), ('gomez', 2678), ('kylie', 2359), ('blake', 2347), ('trump', 2140), ('gwen', 2105), ('kanye', 2017), ('bieber', 1971), ('markle', 1878)]
Top Real News Words: [('kardashian', 9277), ('kim', 6081), ('says', 5689), ('prince', 5545), ('star', 5470), ('reveals', 5189), ('2018', 5145), ('jenner', 4873), ('meghan', 4863), ('season', 4815), ('jennifer', 4717), ('watch', 4466), ('kate', 4278), ('best', 3853), ('taylor', 3846), ('bachelor', 3417), ('harry', 3388), ('selena', 3359), ('shares', 3230), ('swift', 3220)]


Step 5: Get the financial value of the fake news using the urls

get the domain name of each unique url

In [18]:
fake_news_with_high_count

news_url
www.newidea.com.au/jennifer-aniston-and-brad-pitt-busted-in-aspen                                                150
www.etonline.com/gwen-stefani-still-smitten-blake-shelton-and-has-never-once-tried-change-him-exclusive-94631    134
www.intouchweekly.com/posts/alex-rodriguez-cheated-on-jennifer-lopez-153623                                      128
hollywoodlife.com/2018/02/12/jamie-foxx-katie-holmes-baby-pregnant-want-to-start-family/                         122
www.etonline.com/biggest-celebrity-feuds-2017-92730                                                              122
en.wikipedia.org/wiki/Julia_Roberts                                                                              121
www.thecut.com/2018/08/brad-pitt-angelina-jolie-divorce-latest-news.html                                         117
hollywoodlife.com/2018/06/06/are-robert-pattinson-kristen-stewart-back-together/                                 113
www.newidea.com.au/meghan-markle-prince-harry-twins-con

In [19]:
import pandas as pd
from urllib.parse import urlparse

# Sample data: Creating a DataFrame with a column containing URLs
data = {
    'urls': [
        "https://www.newidea.com.au/jennifer-aniston-and-brad-pitt-busted-in-aspen",
        "https://www.etonline.com/gwen-stefani-still-smitten-blake-shelton-and-has-never-once-tried-change-him-exclusive-94631",
        "https://www.intouchweekly.com/posts/alex-rodriguez-cheated-on-jennifer-lopez-153623",
        "https://hollywoodlife.com/2018/02/12/jamie-foxx-katie-holmes-baby-pregnant-want-to-start-family/",
        "https://www.etonline.com/biggest-celebrity-feuds-2017-92730",
        "https://en.wikipedia.org/wiki/Julia_Roberts",
        "https://www.thecut.com/2018/08/brad-pitt-angelina-jolie-divorce-latest-news.html",
        "https://hollywoodlife.com/2018/06/06/are-robert-pattinson-kristen-stewart-back-together/",
        "https://www.newidea.com.au/meghan-markle-prince-harry-twins-confirmed",
        "https://hollywoodlife.com/2018/08/08/kourtney-kardashian-pregnant-fourth-baby-younes-bendjima-dad-breakup/"
    ]
}

# Create a DataFrame
domain_df = pd.DataFrame(data)

# Function to extract domain name from URL
def get_domain_name(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc

# Apply the function to the 'urls' column and create a new 'domain' column
domain_df['domain'] = domain_df['urls'].apply(get_domain_name)

# Display the DataFrame with the domain names
print(domain_df)



                                                urls                 domain
0  https://www.newidea.com.au/jennifer-aniston-an...     www.newidea.com.au
1  https://www.etonline.com/gwen-stefani-still-sm...       www.etonline.com
2  https://www.intouchweekly.com/posts/alex-rodri...  www.intouchweekly.com
3  https://hollywoodlife.com/2018/02/12/jamie-fox...      hollywoodlife.com
4  https://www.etonline.com/biggest-celebrity-feu...       www.etonline.com
5        https://en.wikipedia.org/wiki/Julia_Roberts       en.wikipedia.org
6  https://www.thecut.com/2018/08/brad-pitt-angel...         www.thecut.com
7  https://hollywoodlife.com/2018/06/06/are-rober...      hollywoodlife.com
8  https://www.newidea.com.au/meghan-markle-princ...     www.newidea.com.au
9  https://hollywoodlife.com/2018/08/08/kourtney-...      hollywoodlife.com


In [20]:
domain_df['domain'].unique()

array(['www.newidea.com.au', 'www.etonline.com', 'www.intouchweekly.com',
       'hollywoodlife.com', 'en.wikipedia.org', 'www.thecut.com'],
      dtype=object)