# Libraries

In [None]:
!pip install scikeras



In [None]:
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

from textblob import TextBlob #to use for sentiment analysis
from wordcloud import WordCloud

# Understanding and Preprocessing

## Mount it to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Specify the path to the directory where your CSV files are stored
main_directory = '/content/drive/My Drive/datasets/Fakenews_dataset'

# Load PolitiFact CSV files
politifact_mf = pd.read_csv(os.path.join(main_directory, 'politifact_MF.csv'))
politifact_hf = pd.read_csv(os.path.join(main_directory, 'politifact_HF.csv'))
politifact_mr = pd.read_csv(os.path.join(main_directory, 'politifact_MR.csv'))
politifact_hr = pd.read_csv(os.path.join(main_directory, 'politifact_HR.csv'))

# Load GossipCop CSV files
gossipcop_mf = pd.read_csv(os.path.join(main_directory, 'gossipcop_MF.csv'))
gossipcop_hf = pd.read_csv(os.path.join(main_directory, 'gossipcop_HF.csv'))
gossipcop_mr = pd.read_csv(os.path.join(main_directory, 'gossipcop_MR.csv'))
gossipcop_hr = pd.read_csv(os.path.join(main_directory, 'gossipcop_HR.csv'))

## Explore the Data (EDA)

In [None]:
def display_heads(datasets, n=5):
    for name, df in datasets.items():
        print(f"First {n} rows of {name}:")
        print(df.head(n))
        print("\n" + "="*50 + "\n")


# Store datasets in a dictionary
datasets = {
    'PolitiFact MF': politifact_mf,
    'PolitiFact HF': politifact_hf,
    'PolitiFact MR': politifact_mr,
    'PolitiFact HR': politifact_hr,
    'GossipCop MF': gossipcop_mf,
    'GossipCop HF': gossipcop_hf,
    'GossipCop MR': gossipcop_mr,
    'GossipCop HR': gossipcop_hr
}

# Display the first few rows of each dataset
display_heads(datasets)

First 5 rows of PolitiFact MF:
                id                                        description  \
0  politifact11773  Republican attacks on transgendered Americans ...   
1  politifact13827  Whoopi Goldberg is in hot water after comments...   
2  politifact13570  Washington, DC — A former Secret Service agent...   
3  politifact14947  Bill Clinton’s hitman has confessed to more th...   
4  politifact14517  Scott&#8217;s prognosis isn&#8217;t good. (via...   

                                                text  \
0  inia State Representative Mark Cole's proposed...   
1  Whoopi Goldberg has found herself in the middl...   
2  A former Secret Service agent has written a ne...   
3  In what appears to be a major blow to the cred...   
4  In a shocking turn of events, Florida Governor...   

                                               title  
0  Virginia Republican Introduces Controversial B...  
1  Whoopi Goldberg Faces Backlash for Disrespectf...  
2  Former Secret Service Age

In [None]:
def display_info(datasets):
    for name, df in datasets.items():
        print(f"Information about {name}:")
        df.info()
        print("\n" + "="*50 + "\n")

# Display information about each dataset
display_info(datasets)

Information about PolitiFact MF:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           97 non-null     object
 1   description  97 non-null     object
 2   text         97 non-null     object
 3   title        97 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB


Information about PolitiFact HF:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           97 non-null     object
 1   description  97 non-null     object
 2   text         97 non-null     object
 3   title        97 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB


Information about PolitiFact MR:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 4 columns):
 #   Column      

In [None]:
def display_descriptions(datasets):
    for name, df in datasets.items():
        print(f"Descriptive statistics for {name}:")
        print(df.describe(include='all'))
        print("\n" + "="*50 + "\n")

# Display descriptive statistics for each dataset
display_descriptions(datasets)

Descriptive statistics for PolitiFact MF:
                     id description  \
count                97          97   
unique               97          96   
top     politifact11773     Tribune   
freq                  1           2   

                                                     text  \
count                                                  97   
unique                                                 97   
top     inia State Representative Mark Cole's proposed...   
freq                                                    1   

                                                    title  
count                                                  97  
unique                                                 97  
top     Virginia Republican Introduces Controversial B...  
freq                                                    1  


Descriptive statistics for PolitiFact HF:
                     id description  \
count                97          97   
unique               97          9

#### Missing values

In [None]:
def display_missing_values(datasets):
    for name, df in datasets.items():
        print(f"Missing values in {name}:")
        missing_values = df.isnull().sum()
        print(missing_values[missing_values > 0])  # Display only columns with missing values
        print("\n" + "="*50 + "\n")

# Display missing values for each dataset
display_missing_values(datasets)

Missing values in PolitiFact MF:
Series([], dtype: int64)


Missing values in PolitiFact HF:
Series([], dtype: int64)


Missing values in PolitiFact MR:
Series([], dtype: int64)


Missing values in PolitiFact HR:
title    1
dtype: int64


Missing values in GossipCop MF:
Series([], dtype: int64)


Missing values in GossipCop HF:
Series([], dtype: int64)


Missing values in GossipCop MR:
Series([], dtype: int64)


Missing values in GossipCop HR:
Series([], dtype: int64)




In [None]:
# Dropping the NA value
politifact_hr.dropna(subset=['title'], inplace=True)

## Combined df

### Feature Engineering

In [None]:
import pandas as pd

# Assuming your datasets are already loaded as DataFrames
# Assign labels and sources based on your description
datasets = {
    'PolitiFact MF': (politifact_mf, 'PolitiFact', 'LLM Generated', 'Fake'),
    'PolitiFact HF': (politifact_hf, 'PolitiFact', 'Human Written', 'Fake'),
    'PolitiFact MR': (politifact_mr, 'PolitiFact', 'LLM Generated', 'Real'),
    'PolitiFact HR': (politifact_hr, 'PolitiFact', 'Human Written', 'Real'),
    'GossipCop MF': (gossipcop_mf, 'GossipCop', 'LLM Generated', 'Fake'),
    'GossipCop HF': (gossipcop_hf, 'GossipCop', 'Human Written', 'Fake'),
    'GossipCop MR': (gossipcop_mr, 'GossipCop', 'LLM Generated', 'Real'),
    'GossipCop HR': (gossipcop_hr, 'GossipCop', 'Human Written', 'Real')
}

# List to hold all dataframes with additional columns
df_list = []

# Iterate over datasets to add the 'platform', 'source', and 'label' columns
for name, (df, platform, source, label) in datasets.items():
    df = df.copy()  # Avoid modifying the original data
    df['platform'] = platform
    df['source'] = source
    df['label'] = label
    df_list.append(df)

# Concatenate all datasets into a single DataFrame
combined_df = pd.concat(df_list, ignore_index=True)

# Display the first few rows of the combined DataFrame to verify
print(combined_df.head())
print(combined_df['platform'].value_counts())
print(combined_df['source'].value_counts())
print(combined_df['label'].value_counts())


                id                                        description  \
0  politifact11773  Republican attacks on transgendered Americans ...   
1  politifact13827  Whoopi Goldberg is in hot water after comments...   
2  politifact13570  Washington, DC — A former Secret Service agent...   
3  politifact14947  Bill Clinton’s hitman has confessed to more th...   
4  politifact14517  Scott&#8217;s prognosis isn&#8217;t good. (via...   

                                                text  \
0  inia State Representative Mark Cole's proposed...   
1  Whoopi Goldberg has found herself in the middl...   
2  A former Secret Service agent has written a ne...   
3  In what appears to be a major blow to the cred...   
4  In a shocking turn of events, Florida Governor...   

                                               title    platform  \
0  Virginia Republican Introduces Controversial B...  PolitiFact   
1  Whoopi Goldberg Faces Backlash for Disrespectf...  PolitiFact   
2  Former Secret Ser

In [None]:
combined_df

Unnamed: 0,id,description,text,title,platform,source,label
0,politifact11773,Republican attacks on transgendered Americans ...,inia State Representative Mark Cole's proposed...,Virginia Republican Introduces Controversial B...,PolitiFact,LLM Generated,Fake
1,politifact13827,Whoopi Goldberg is in hot water after comments...,Whoopi Goldberg has found herself in the middl...,Whoopi Goldberg Faces Backlash for Disrespectf...,PolitiFact,LLM Generated,Fake
2,politifact13570,"Washington, DC — A former Secret Service agent...",A former Secret Service agent has written a ne...,Former Secret Service Agent Exposes Shocking S...,PolitiFact,LLM Generated,Fake
3,politifact14947,Bill Clinton’s hitman has confessed to more th...,In what appears to be a major blow to the cred...,Hannity Issues Retraction on False Story: Bill...,PolitiFact,LLM Generated,Fake
4,politifact14517,Scott&#8217;s prognosis isn&#8217;t good. (via...,"In a shocking turn of events, Florida Governor...",Florida Governor Rick Scott Miraculously Recov...,PolitiFact,LLM Generated,Fake
...,...,...,...,...,...,...,...
21019,gossipcop-875489,From hand-baked clay tiles to LED lights that ...,For free real time breaking news alerts sent s...,The top interior design trends for millennials,GossipCop,Human Written,Real
21020,gossipcop-844263,Gilmore Girls: A Year in the Life made its Net...,Gilmore Girls: A Year in the Life made its Net...,"Gilmore Girls Video: Lauren Graham, Alexis Ble...",GossipCop,Human Written,Real
21021,gossipcop-917467,On Sunday Fox aired “O.J. Simpson: The Lost Co...,Why Is It Airing Now?\n\nAccording to the exec...,"The O.J. Simpson Interview on Fox: Gripping, G...",GossipCop,Human Written,Real
21022,gossipcop-924877,Just when you thought this season of Vanderpum...,Just when you thought this season of Vanderpum...,Kristen Doute and James Kennedy Hooked Up Rumo...,GossipCop,Human Written,Real


#### Combining two columns (for multi class task)

In [None]:
# Create the combined column by concatenating the 'source' and 'label' columns
combined_df['source_label'] = combined_df['source'] + '_' + combined_df['label']

# Display the first few rows to verify the new column
print(combined_df[['source', 'label', 'source_label']].head())

          source label        source_label
0  LLM Generated  Fake  LLM Generated_Fake
1  LLM Generated  Fake  LLM Generated_Fake
2  LLM Generated  Fake  LLM Generated_Fake
3  LLM Generated  Fake  LLM Generated_Fake
4  LLM Generated  Fake  LLM Generated_Fake


In [None]:
combined_df.head()

Unnamed: 0,id,description,text,title,platform,source,label,source_label
0,politifact11773,Republican attacks on transgendered Americans ...,inia State Representative Mark Cole's proposed...,Virginia Republican Introduces Controversial B...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake
1,politifact13827,Whoopi Goldberg is in hot water after comments...,Whoopi Goldberg has found herself in the middl...,Whoopi Goldberg Faces Backlash for Disrespectf...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake
2,politifact13570,"Washington, DC — A former Secret Service agent...",A former Secret Service agent has written a ne...,Former Secret Service Agent Exposes Shocking S...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake
3,politifact14947,Bill Clinton’s hitman has confessed to more th...,In what appears to be a major blow to the cred...,Hannity Issues Retraction on False Story: Bill...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake
4,politifact14517,Scott&#8217;s prognosis isn&#8217;t good. (via...,"In a shocking turn of events, Florida Governor...",Florida Governor Rick Scott Miraculously Recov...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake


#### Text Analysis Before Cleaning

##### Readibility score (before)

In [None]:
pip install textstat



In [None]:
import textstat

# Calculate readability scores before text cleaning
combined_df['description_readability_before'] = combined_df['description'].apply(textstat.flesch_reading_ease)
combined_df['text_readability_before'] = combined_df['text'].apply(textstat.flesch_reading_ease)
combined_df['title_readability_before'] = combined_df['title'].apply(textstat.flesch_reading_ease)

# Display the first few rows to verify the new columns
print(combined_df[['description', 'description_readability_before', 'text', 'text_readability_before', 'title', 'title_readability_before']].head())


                                         description  \
0  Republican attacks on transgendered Americans ...   
1  Whoopi Goldberg is in hot water after comments...   
2  Washington, DC — A former Secret Service agent...   
3  Bill Clinton’s hitman has confessed to more th...   
4  Scott&#8217;s prognosis isn&#8217;t good. (via...   

   description_readability_before  \
0                           15.65   
1                           65.93   
2                           43.74   
3                           68.30   
4                           14.63   

                                                text  text_readability_before  \
0  inia State Representative Mark Cole's proposed...                    36.12   
1  Whoopi Goldberg has found herself in the middl...                    45.19   
2  A former Secret Service agent has written a ne...                    50.57   
3  In what appears to be a major blow to the cred...                    35.95   
4  In a shocking turn of events, Fl

##### Text Length (before)

In [None]:
# Calculate text length before cleaning
combined_df['title_length_before'] = combined_df['title'].apply(len)
combined_df['description_length_before'] = combined_df['description'].apply(len)
combined_df['text_length_before'] = combined_df['text'].apply(len)

## Text Cleaning

In [None]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Ensure you have downloaded the required NLTK data
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

def text_preprocessing(text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Work on a copy of the text to avoid modifying the original
    processed_text = text.lower()

    # Remove URLs
    processed_text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', processed_text)

    # Remove text enclosed in square brackets
    processed_text = re.sub(r'\[.*?\]', '', processed_text)

    # Remove text enclosed in angle brackets
    processed_text = re.sub(r'<.*?>+', '', processed_text)

    # Remove punctuation characters
    processed_text = re.sub(r'[%s]' % re.escape(string.punctuation), '', processed_text)

    # Remove newline characters
    processed_text = re.sub(r'\n', '', processed_text)

    # Remove substrings of digits surrounded by word characters
    processed_text = re.sub(r'\w*\d\w*', '', processed_text)

    # Remove emails
    processed_text = re.sub(r'@\S+', '', processed_text)

    # Remove numbers
    processed_text = re.sub(r'\b\d+\b', '', processed_text)

    # Remove repeating characters
    processed_text = re.sub(r'(.)\1+', r'\1', processed_text)

    # Tokenize, remove stopwords, stem, and lemmatize
    words = word_tokenize(processed_text)
    words = [word for word in words if word not in stop_words and len(word) > 1]
    words = [stemmer.stem(word) for word in words]
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the processed words back into a single string
    processed_text = " ".join(words)

    return processed_text

### Apply the text_preprocessing function on title, description, and text columns

In [None]:
# Apply the cleaning function on title, description, and text columns
combined_df['cleaned_description'] = combined_df['description'].apply(text_preprocessing)
combined_df['cleaned_text'] = combined_df['text'].apply(text_preprocessing)
combined_df['cleaned_title'] = combined_df['title'].apply(text_preprocessing)

## Text Length (after)

In [None]:
# Calculate text length before cleaning
combined_df['title_length_after'] = combined_df['title'].apply(len)
combined_df['description_length_after'] = combined_df['description'].apply(len)
combined_df['text_length_after'] = combined_df['text'].apply(len)

## Sentiment Analysis

### Sentiment (before)

In [None]:
from textblob import TextBlob

# Function to calculate sentiment polarity
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

combined_df['title_sentiment'] = combined_df['title'].apply(get_sentiment)
combined_df['text_sentiment'] = combined_df['text'].apply(get_sentiment)
combined_df['description_sentiment'] = combined_df['description'].apply(get_sentiment)

### Sentiment (After)

In [None]:
# Apply sentiment analysis on cleaned data
combined_df['cleaned_title_sentiment'] = combined_df['cleaned_title'].apply(get_sentiment)
combined_df['cleaned_text_sentiment'] = combined_df['cleaned_text'].apply(get_sentiment)
combined_df['cleaned_description_sentiment'] = combined_df['cleaned_description'].apply(get_sentiment)


## Readability Score (After)

In [None]:
# Calculate readability scores after text cleaning
combined_df['description_readability_after'] = combined_df['cleaned_description'].apply(textstat.flesch_reading_ease)
combined_df['text_readability_after'] = combined_df['cleaned_text'].apply(textstat.flesch_reading_ease)
combined_df['title_readability_after'] = combined_df['cleaned_title'].apply(textstat.flesch_reading_ease)

In [None]:
combined_df.columns

Index(['id', 'description', 'text', 'title', 'platform', 'source', 'label',
       'source_label', 'description_readability_before',
       'text_readability_before', 'title_readability_before',
       'title_length_before', 'description_length_before',
       'text_length_before', 'cleaned_description', 'cleaned_text',
       'cleaned_title', 'title_length_after', 'description_length_after',
       'text_length_after', 'title_sentiment', 'text_sentiment',
       'description_sentiment', 'cleaned_title_sentiment',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'description_readability_after', 'text_readability_after',
       'title_readability_after'],
      dtype='object')

## Model Preparation

#### Scaling (Standardisation)

In [None]:
# Import the StandardScaler class from scikit-learn
from sklearn.preprocessing import StandardScaler

# List of columns in the DataFrame that need to be standardized
columns_to_standardize = [
    'text_length_before',  # Length of text before cleaning
    'description_length_before',  # Length of description before cleaning
    'title_length_before',  # Length of title before cleaning
    'text_length_after',  # Length of text after cleaning
    'description_length_after',  # Length of description after cleaning
    'title_length_after',  # Length of title after cleaning
    'text_readability_before',  # Readability score of text before cleaning
    'description_readability_before',  # Readability score of description before cleaning
    'title_readability_before',  # Readability score of title before cleaning
    'cleaned_text_sentiment',  # Sentiment score of cleaned text
    'cleaned_description_sentiment',  # Sentiment score of cleaned description
    'cleaned_title_sentiment'  # Sentiment score of cleaned title
]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the selected columns and transform them
combined_df[columns_to_standardize] = scaler.fit_transform(combined_df[columns_to_standardize])

# Print the first few rows of the standardized columns to verify the transformation
print(combined_df[columns_to_standardize].head())

   text_length_before  description_length_before  title_length_before  \
0           -0.158926                   2.020107             0.004766   
1           -0.330207                   1.981057            -0.007190   
2           -0.239516                   1.239106            -0.063983   
3           -0.335580                   4.158095             0.034656   
4           -0.269173                  -0.654819             0.013733   

   text_length_after  description_length_after  title_length_after  \
0          -0.158926                  2.020107            0.004766   
1          -0.330207                  1.981057           -0.007190   
2          -0.239516                  1.239106           -0.063983   
3          -0.335580                  4.158095            0.034656   
4          -0.269173                 -0.654819            0.013733   

   text_readability_before  description_readability_before  \
0                -2.340435                       -2.131953   
1               

## Split the daraframe

In [None]:
# Split the combined_df dataframe by platform

# Create a separate dataframe for each platform
gossipcop_df = combined_df[combined_df['platform'] == 'GossipCop']
politifact_df = combined_df[combined_df['platform'] == 'PolitiFact']

# Print the shapes of the new dataframes
print(f"GossipCop dataframe shape: {gossipcop_df.shape}")
print(f"PolitiFact dataframe shape: {politifact_df.shape}")

# Print the shape of the combined dataframe
print(f"Combined dataframe shape: {combined_df.shape}")

GossipCop dataframe shape: (20505, 29)
PolitiFact dataframe shape: (519, 29)
Combined dataframe shape: (21024, 29)


In [None]:
combined_df.columns

Index(['id', 'description', 'text', 'title', 'platform', 'source', 'label',
       'source_label', 'description_readability_before',
       'text_readability_before', 'title_readability_before',
       'title_length_before', 'description_length_before',
       'text_length_before', 'cleaned_description', 'cleaned_text',
       'cleaned_title', 'title_length_after', 'description_length_after',
       'text_length_after', 'title_sentiment', 'text_sentiment',
       'description_sentiment', 'cleaned_title_sentiment',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'description_readability_after', 'text_readability_after',
       'title_readability_after'],
      dtype='object')

### Dropping the columns

In [None]:
# List of columns to keep, which includes the scaled columns and the 'source_label' column
columns_to_keep = columns_to_standardize + ['label', 'source', 'cleaned_text']

# Create a new DataFrame with only the selected columns
combined_df1 = combined_df[columns_to_keep]
gossipcop_df1 = gossipcop_df[columns_to_keep]
politifact_df1 = politifact_df[columns_to_keep]

# Print the first few rows of the new DataFrame to verify
print(combined_df1.head())
print(gossipcop_df1.head())
print(politifact_df1.head())

   text_length_before  description_length_before  title_length_before  \
0           -0.158926                   2.020107             0.004766   
1           -0.330207                   1.981057            -0.007190   
2           -0.239516                   1.239106            -0.063983   
3           -0.335580                   4.158095             0.034656   
4           -0.269173                  -0.654819             0.013733   

   text_length_after  description_length_after  title_length_after  \
0          -0.158926                  2.020107            0.004766   
1          -0.330207                  1.981057           -0.007190   
2          -0.239516                  1.239106           -0.063983   
3          -0.335580                  4.158095            0.034656   
4          -0.269173                 -0.654819            0.013733   

   text_readability_before  description_readability_before  \
0                -2.340435                       -2.131953   
1               

### One hot Encoding

In [None]:
combined_df1.columns

Index(['text_length_before', 'description_length_before',
       'title_length_before', 'text_length_after', 'description_length_after',
       'title_length_after', 'text_readability_before',
       'description_readability_before', 'title_readability_before',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'cleaned_title_sentiment', 'label', 'source', 'cleaned_text'],
      dtype='object')

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse=False, dtype=int)  # Setting dtype=int to ensure integer output

# Fit and transform both 'label' and 'source' columns
encoded_columns = encoder.fit_transform(combined_df1[['label', 'source']])

# Convert the encoded columns into a DataFrame with proper column names
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(['label', 'source']))

# Concatenate the encoded columns back to the original dataframe
combined_df1 = pd.concat([combined_df1, encoded_df], axis=1)

# Optionally, drop the original 'label' and 'source' columns
combined_df1 = combined_df1.drop(['label', 'source'], axis=1)



In [None]:
combined_df1.columns

Index(['text_length_before', 'description_length_before',
       'title_length_before', 'text_length_after', 'description_length_after',
       'title_length_after', 'text_readability_before',
       'description_readability_before', 'title_readability_before',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'cleaned_title_sentiment', 'cleaned_text', 'label_Fake', 'label_Real',
       'source_Human Written', 'source_LLM Generated'],
      dtype='object')

In [None]:
# Dropping the columns related to the target
combined_df1 = combined_df1.drop(['label_Real','source_Human Written'], axis=1)

In [None]:
combined_df1.columns

Index(['text_length_before', 'description_length_before',
       'title_length_before', 'text_length_after', 'description_length_after',
       'title_length_after', 'text_readability_before',
       'description_readability_before', 'title_readability_before',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'cleaned_title_sentiment', 'cleaned_text', 'label_Fake',
       'source_LLM Generated'],
      dtype='object')

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # Adjust max_features as needed

# Apply TF-IDF on the 'cleaned_text' column
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df1['cleaned_text'])

# Convert the TF-IDF matrix to a DataFrame for easier handling
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine the TF-IDF features with the original DataFrame (excluding 'cleaned_text')
combined_df1_tfidf = pd.concat([combined_df1.drop('cleaned_text', axis=1).reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Display the first few rows of the updated DataFrame
print(combined_df1_tfidf.head())

   text_length_before  description_length_before  title_length_before  \
0           -0.158926                   2.020107             0.004766   
1           -0.330207                   1.981057            -0.007190   
2           -0.239516                   1.239106            -0.063983   
3           -0.335580                   4.158095             0.034656   
4           -0.269173                  -0.654819             0.013733   

   text_length_after  description_length_after  title_length_after  \
0          -0.158926                  2.020107            0.004766   
1          -0.330207                  1.981057           -0.007190   
2          -0.239516                  1.239106           -0.063983   
3          -0.335580                  4.158095            0.034656   
4          -0.269173                 -0.654819            0.013733   

   text_readability_before  description_readability_before  \
0                -2.340435                       -2.131953   
1               

In [None]:
combined_df1_tfidf.shape

(21024, 1014)

In [None]:
combined_df1_tfidf.columns

Index(['text_length_before', 'description_length_before',
       'title_length_before', 'text_length_after', 'description_length_after',
       'title_length_after', 'text_readability_before',
       'description_readability_before', 'title_readability_before',
       'cleaned_text_sentiment',
       ...
       'world', 'worth', 'write', 'wrong', 'wrote', 'ye', 'year', 'york',
       'young', 'youtub'],
      dtype='object', length=1014)

##### Distribution of the target column

In [None]:
class_distribution = combined_df1['label_Fake'].value_counts()
print(class_distribution)

label_Fake
0    12662
1     8362
Name: count, dtype: int64


In [None]:
# Check if 'source_LLM Generated' exists in combined_df1_tfidf
if 'source_LLM Generated' in combined_df1_tfidf.columns:
    print("'source_LLM Generated' exists in combined_df1_tfidf")
else:
    print("'source_LLM Generated' does not exist in combined_df1_tfidf")


'source_LLM Generated' exists in combined_df1_tfidf


## Dataset Splitting

In [None]:
from sklearn.model_selection import train_test_split

df = combined_df1_tfidf

# Define X (features) and y (target)
X = df.drop('label_Fake', axis=1)  # Features (all columns except the target)
y = df['label_Fake']  # Target column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Print the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (14716, 1013)
X_test shape: (6308, 1013)
y_train shape: (14716,)
y_test shape: (6308,)


In [None]:
# Check if 'source_LLM Generated' exists in X_train
if 'source_LLM Generated' in X_train.columns:
    print("'source_LLM Generated' exists in X_train")
else:
    print("'source_LLM Generated' does not exist in X_train")

# Check if 'source_LLM Generated' exists in X_test
if 'source_LLM Generated' in X_test.columns:
    print("'source_LLM Generated' exists in X_test")
else:
    print("'source_LLM Generated' does not exist in X_test")


'source_LLM Generated' exists in X_train
'source_LLM Generated' exists in X_test


### Balancing the training set

In [None]:
from collections import Counter

# Check the distribution of classes in y_train
class_distribution = Counter(y_train)
print(f"Original class distribution in y_train: {class_distribution}")

Original class distribution in y_train: Counter({0: 8863, 1: 5853})


In [None]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the new class distribution
from collections import Counter
print(f"Resampled target distribution: {Counter(y_train_resampled)}")

Resampled target distribution: Counter({0: 8863, 1: 8863})


In [None]:
# Check if 'source_LLM Generated' exists in X_train_resampled
if 'source_LLM Generated' in X_train_resampled.columns:
    print("'source_LLM Generated' exists in X_train_resampled")
else:
    print("'source_LLM Generated' does not exist in X_train_resampled")


'source_LLM Generated' exists in X_train_resampled


In [None]:
# Check if 'source_LLM Generated' exists in X_test
if 'source_LLM Generated' in X_test.columns:
    print("'source_LLM Generated' exists in X_test")
else:
    print("'source_LLM Generated' does not exist in X_test")


'source_LLM Generated' exists in X_test


## Neural Netwroks

### Train and Evaluate the Initial NN Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Define the Neural Network model
model = Sequential()

# Input layer and the first hidden layer with ReLU activation
model.add(Dense(128, input_dim=X_train_resampled.shape[1], activation='relu'))

# Second hidden layer
model.add(Dense(64, activation='relu'))

# Output layer with sigmoid for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model
# - Binary crossentropy is used for binary classification
# - Adam optimizer with a learning rate of 0.001
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# - Use validation_data as the test set to monitor performance
model.fit(X_train_resampled, y_train_resampled, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32").ravel()  # Convert probabilities to class labels (0 or 1)

# Evaluate the Neural Network model on the test set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Neural Network Model - Accuracy: {accuracy:.4f}")
print(f"Neural Network Model - Precision: {precision:.4f}")
print(f"Neural Network Model - Recall: {recall:.4f}")
print(f"Neural Network Model - F1-Score: {f1:.4f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Model - Accuracy: 0.8570
Neural Network Model - Precision: 0.8572
Neural Network Model - Recall: 0.8570
Neural Network Model - F1-Score: 0.8571
Confusion Matrix:
[[3339  460]
 [ 442 2067]]


### Tune and Evaluate the NN Model

In [None]:
from tensorflow.keras.layers import Dropout

# Define a new Neural Network model with Dropout
model = Sequential()

# Input layer and first hidden layer with ReLU activation
model.add(Dense(128, input_dim=X_train_resampled.shape[1], activation='relu'))
model.add(Dropout(0.3))  # Add Dropout to prevent overfitting

# Second hidden layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

# Output layer with sigmoid for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model with Adam optimizer and a lower learning rate
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), metrics=['accuracy'])

# Train the model
model.fit(X_train_resampled, y_train_resampled, epochs=20, batch_size=64, validation_data=(X_test, y_test))

# Make predictions on the test set
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32").ravel()

# Evaluate the tuned Neural Network model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Tuned Neural Network Model - Accuracy: {accuracy:.4f}")
print(f"Tuned Neural Network Model - Precision: {precision:.4f}")
print(f"Tuned Neural Network Model - Recall: {recall:.4f}")
print(f"Tuned Neural Network Model - F1-Score: {f1:.4f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Tuned Neural Network Model - Accuracy: 0.8573
Tuned Neural Network Model - Precision: 0.8571
Tuned Neural Network Model - Recall: 0.8573
Tuned Neural Network Model - F1-Score: 0.8572
Confusion Matrix:
[[3365  434]
 [ 466 2043]]


### Evaluate the Fairness of the NN Model

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, confusion_matrix

# Assuming `sensitive_attr` holds the sensitive attribute 'source_LLM Generated' (0s and 1s)
sensitive_attr = X_test['source_LLM Generated']  # Ensure this is the correct column name

# Get predictions from the tuned model (already computed as `y_pred`)
# y_pred already exists from previous tuned model evaluation code

# Split predictions and true labels based on the sensitive attribute
y_true_0 = y_test[sensitive_attr == 0]
y_pred_0 = y_pred[sensitive_attr == 0]

y_true_1 = y_test[sensitive_attr == 1]
y_pred_1 = y_pred[sensitive_attr == 1]

# 1. Demographic Parity
# Proportion of positive predictions for both groups
demographic_parity_0 = np.mean(y_pred_0)
demographic_parity_1 = np.mean(y_pred_1)
demographic_parity_diff = demographic_parity_1 - demographic_parity_0

print(f"Demographic Parity - Group 0: {demographic_parity_0:.4f}, Group 1: {demographic_parity_1:.4f}")
print(f"Difference in Demographic Parity: {demographic_parity_diff:.4f}")

# 2. Equal Opportunity (True Positive Rate comparison)
# TPR for both groups
true_positives_0 = np.sum((y_true_0 == 1) & (y_pred_0 == 1))
true_positives_1 = np.sum((y_true_1 == 1) & (y_pred_1 == 1))

tpr_0 = true_positives_0 / np.sum(y_true_0 == 1)
tpr_1 = true_positives_1 / np.sum(y_true_1 == 1)
equal_opportunity_diff = tpr_1 - tpr_0

print(f"Equal Opportunity - TPR for Group 0: {tpr_0:.4f}, Group 1: {tpr_1:.4f}")
print(f"Difference in Equal Opportunity (TPR): {equal_opportunity_diff:.4f}")

# 3. Equalized Odds (TPR and FPR comparison)
# FPR for both groups
false_positives_0 = np.sum((y_true_0 == 0) & (y_pred_0 == 1))
false_positives_1 = np.sum((y_true_1 == 0) & (y_pred_1 == 1))

fpr_0 = false_positives_0 / np.sum(y_true_0 == 0)
fpr_1 = false_positives_1 / np.sum(y_true_1 == 0)
equalized_odds_tpr_diff = tpr_1 - tpr_0
equalized_odds_fpr_diff = fpr_0 - fpr_1

print(f"Equalized Odds - TPR for Group 0: {tpr_0:.4f}, Group 1: {tpr_1:.4f}")
print(f"Equalized Odds - FPR for Group 0: {fpr_0:.4f}, Group 1: {fpr_1:.4f}")
print(f"Difference in TPR (Equalized Odds): {equalized_odds_tpr_diff:.4f}")
print(f"Difference in FPR (Equalized Odds): {equalized_odds_fpr_diff:.4f}")

# 4. Predictive Parity (Precision comparison)
precision_0 = precision_score(y_true_0, y_pred_0)
precision_1 = precision_score(y_true_1, y_pred_1)
predictive_parity_diff = precision_1 - precision_0

print(f"Predictive Parity - Precision for Group 0: {precision_0:.4f}, Group 1: {precision_1:.4f}")
print(f"Difference in Predictive Parity (Precision): {predictive_parity_diff:.4f}")

# Summary of Differences:
overall_summary = {
    "Demographic Parity Difference": demographic_parity_diff,
    "Equal Opportunity (TPR) Difference": equal_opportunity_diff,
    "Equalized Odds TPR Difference": equalized_odds_tpr_diff,
    "Equalized Odds FPR Difference": equalized_odds_fpr_diff,
    "Predictive Parity Difference": predictive_parity_diff
}

print("\n=== Overall Fairness Summary ===")
for metric, diff in overall_summary.items():
    print(f"{metric}: {diff:.4f}")

Demographic Parity - Group 0: 0.3182, Group 1: 0.5044
Difference in Demographic Parity: 0.1862
Equal Opportunity - TPR for Group 0: 0.6818, Group 1: 0.9421
Difference in Equal Opportunity (TPR): 0.2602
Equalized Odds - TPR for Group 0: 0.6818, Group 1: 0.9421
Equalized Odds - FPR for Group 0: 0.1426, Group 1: 0.0561
Difference in TPR (Equalized Odds): 0.2602
Difference in FPR (Equalized Odds): 0.0865
Predictive Parity - Precision for Group 0: 0.6977, Group 1: 0.9450
Difference in Predictive Parity (Precision): 0.2473

=== Overall Fairness Summary ===
Demographic Parity Difference: 0.1862
Equal Opportunity (TPR) Difference: 0.2602
Equalized Odds TPR Difference: 0.2602
Equalized Odds FPR Difference: 0.0865
Predictive Parity Difference: 0.2473
