# Libraries

In [1]:
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

from textblob import TextBlob #to use for sentiment analysis
from wordcloud import WordCloud

# Understanding and Preprocessing

## Mount it to Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Specify the path to the directory where your CSV files are stored
main_directory = '/content/drive/My Drive/datasets/Fakenews_dataset'

# Load PolitiFact CSV files
politifact_mf = pd.read_csv(os.path.join(main_directory, 'politifact_MF.csv'))
politifact_hf = pd.read_csv(os.path.join(main_directory, 'politifact_HF.csv'))
politifact_mr = pd.read_csv(os.path.join(main_directory, 'politifact_MR.csv'))
politifact_hr = pd.read_csv(os.path.join(main_directory, 'politifact_HR.csv'))

# Load GossipCop CSV files
gossipcop_mf = pd.read_csv(os.path.join(main_directory, 'gossipcop_MF.csv'))
gossipcop_hf = pd.read_csv(os.path.join(main_directory, 'gossipcop_HF.csv'))
gossipcop_mr = pd.read_csv(os.path.join(main_directory, 'gossipcop_MR.csv'))
gossipcop_hr = pd.read_csv(os.path.join(main_directory, 'gossipcop_HR.csv'))

## Explore the Data (EDA)

In [4]:
def display_heads(datasets, n=5):
    for name, df in datasets.items():
        print(f"First {n} rows of {name}:")
        print(df.head(n))
        print("\n" + "="*50 + "\n")


# Store datasets in a dictionary
datasets = {
    'PolitiFact MF': politifact_mf,
    'PolitiFact HF': politifact_hf,
    'PolitiFact MR': politifact_mr,
    'PolitiFact HR': politifact_hr,
    'GossipCop MF': gossipcop_mf,
    'GossipCop HF': gossipcop_hf,
    'GossipCop MR': gossipcop_mr,
    'GossipCop HR': gossipcop_hr
}

# Display the first few rows of each dataset
display_heads(datasets)

First 5 rows of PolitiFact MF:
                id                                        description  \
0  politifact11773  Republican attacks on transgendered Americans ...   
1  politifact13827  Whoopi Goldberg is in hot water after comments...   
2  politifact13570  Washington, DC — A former Secret Service agent...   
3  politifact14947  Bill Clinton’s hitman has confessed to more th...   
4  politifact14517  Scott&#8217;s prognosis isn&#8217;t good. (via...   

                                                text  \
0  inia State Representative Mark Cole's proposed...   
1  Whoopi Goldberg has found herself in the middl...   
2  A former Secret Service agent has written a ne...   
3  In what appears to be a major blow to the cred...   
4  In a shocking turn of events, Florida Governor...   

                                               title  
0  Virginia Republican Introduces Controversial B...  
1  Whoopi Goldberg Faces Backlash for Disrespectf...  
2  Former Secret Service Age

In [5]:
def display_info(datasets):
    for name, df in datasets.items():
        print(f"Information about {name}:")
        df.info()
        print("\n" + "="*50 + "\n")

# Display information about each dataset
display_info(datasets)

Information about PolitiFact MF:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           97 non-null     object
 1   description  97 non-null     object
 2   text         97 non-null     object
 3   title        97 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB


Information about PolitiFact HF:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           97 non-null     object
 1   description  97 non-null     object
 2   text         97 non-null     object
 3   title        97 non-null     object
dtypes: object(4)
memory usage: 3.2+ KB


Information about PolitiFact MR:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 4 columns):
 #   Column      

In [6]:
def display_descriptions(datasets):
    for name, df in datasets.items():
        print(f"Descriptive statistics for {name}:")
        print(df.describe(include='all'))
        print("\n" + "="*50 + "\n")

# Display descriptive statistics for each dataset
display_descriptions(datasets)

Descriptive statistics for PolitiFact MF:
                     id description  \
count                97          97   
unique               97          96   
top     politifact11773     Tribune   
freq                  1           2   

                                                     text  \
count                                                  97   
unique                                                 97   
top     inia State Representative Mark Cole's proposed...   
freq                                                    1   

                                                    title  
count                                                  97  
unique                                                 97  
top     Virginia Republican Introduces Controversial B...  
freq                                                    1  


Descriptive statistics for PolitiFact HF:
                     id description  \
count                97          97   
unique               97          9

#### Missing values

In [7]:
def display_missing_values(datasets):
    for name, df in datasets.items():
        print(f"Missing values in {name}:")
        missing_values = df.isnull().sum()
        print(missing_values[missing_values > 0])  # Display only columns with missing values
        print("\n" + "="*50 + "\n")

# Display missing values for each dataset
display_missing_values(datasets)

Missing values in PolitiFact MF:
Series([], dtype: int64)


Missing values in PolitiFact HF:
Series([], dtype: int64)


Missing values in PolitiFact MR:
Series([], dtype: int64)


Missing values in PolitiFact HR:
title    1
dtype: int64


Missing values in GossipCop MF:
Series([], dtype: int64)


Missing values in GossipCop HF:
Series([], dtype: int64)


Missing values in GossipCop MR:
Series([], dtype: int64)


Missing values in GossipCop HR:
Series([], dtype: int64)




In [8]:
# Dropping the NA value
politifact_hr.dropna(subset=['title'], inplace=True)

## Combined df

### Feature Engineering

In [9]:
import pandas as pd

# Assuming your datasets are already loaded as DataFrames
# Assign labels and sources based on your description
datasets = {
    'PolitiFact MF': (politifact_mf, 'PolitiFact', 'LLM Generated', 'Fake'),
    'PolitiFact HF': (politifact_hf, 'PolitiFact', 'Human Written', 'Fake'),
    'PolitiFact MR': (politifact_mr, 'PolitiFact', 'LLM Generated', 'Real'),
    'PolitiFact HR': (politifact_hr, 'PolitiFact', 'Human Written', 'Real'),
    'GossipCop MF': (gossipcop_mf, 'GossipCop', 'LLM Generated', 'Fake'),
    'GossipCop HF': (gossipcop_hf, 'GossipCop', 'Human Written', 'Fake'),
    'GossipCop MR': (gossipcop_mr, 'GossipCop', 'LLM Generated', 'Real'),
    'GossipCop HR': (gossipcop_hr, 'GossipCop', 'Human Written', 'Real')
}

# List to hold all dataframes with additional columns
df_list = []

# Iterate over datasets to add the 'platform', 'source', and 'label' columns
for name, (df, platform, source, label) in datasets.items():
    df = df.copy()  # Avoid modifying the original data
    df['platform'] = platform
    df['source'] = source
    df['label'] = label
    df_list.append(df)

# Concatenate all datasets into a single DataFrame
combined_df = pd.concat(df_list, ignore_index=True)

# Display the first few rows of the combined DataFrame to verify
print(combined_df.head())
print(combined_df['platform'].value_counts())
print(combined_df['source'].value_counts())
print(combined_df['label'].value_counts())


                id                                        description  \
0  politifact11773  Republican attacks on transgendered Americans ...   
1  politifact13827  Whoopi Goldberg is in hot water after comments...   
2  politifact13570  Washington, DC — A former Secret Service agent...   
3  politifact14947  Bill Clinton’s hitman has confessed to more th...   
4  politifact14517  Scott&#8217;s prognosis isn&#8217;t good. (via...   

                                                text  \
0  inia State Representative Mark Cole's proposed...   
1  Whoopi Goldberg has found herself in the middl...   
2  A former Secret Service agent has written a ne...   
3  In what appears to be a major blow to the cred...   
4  In a shocking turn of events, Florida Governor...   

                                               title    platform  \
0  Virginia Republican Introduces Controversial B...  PolitiFact   
1  Whoopi Goldberg Faces Backlash for Disrespectf...  PolitiFact   
2  Former Secret Ser

In [10]:
combined_df

Unnamed: 0,id,description,text,title,platform,source,label
0,politifact11773,Republican attacks on transgendered Americans ...,inia State Representative Mark Cole's proposed...,Virginia Republican Introduces Controversial B...,PolitiFact,LLM Generated,Fake
1,politifact13827,Whoopi Goldberg is in hot water after comments...,Whoopi Goldberg has found herself in the middl...,Whoopi Goldberg Faces Backlash for Disrespectf...,PolitiFact,LLM Generated,Fake
2,politifact13570,"Washington, DC — A former Secret Service agent...",A former Secret Service agent has written a ne...,Former Secret Service Agent Exposes Shocking S...,PolitiFact,LLM Generated,Fake
3,politifact14947,Bill Clinton’s hitman has confessed to more th...,In what appears to be a major blow to the cred...,Hannity Issues Retraction on False Story: Bill...,PolitiFact,LLM Generated,Fake
4,politifact14517,Scott&#8217;s prognosis isn&#8217;t good. (via...,"In a shocking turn of events, Florida Governor...",Florida Governor Rick Scott Miraculously Recov...,PolitiFact,LLM Generated,Fake
...,...,...,...,...,...,...,...
21019,gossipcop-875489,From hand-baked clay tiles to LED lights that ...,For free real time breaking news alerts sent s...,The top interior design trends for millennials,GossipCop,Human Written,Real
21020,gossipcop-844263,Gilmore Girls: A Year in the Life made its Net...,Gilmore Girls: A Year in the Life made its Net...,"Gilmore Girls Video: Lauren Graham, Alexis Ble...",GossipCop,Human Written,Real
21021,gossipcop-917467,On Sunday Fox aired “O.J. Simpson: The Lost Co...,Why Is It Airing Now?\n\nAccording to the exec...,"The O.J. Simpson Interview on Fox: Gripping, G...",GossipCop,Human Written,Real
21022,gossipcop-924877,Just when you thought this season of Vanderpum...,Just when you thought this season of Vanderpum...,Kristen Doute and James Kennedy Hooked Up Rumo...,GossipCop,Human Written,Real


#### Combining two columns (for multi class task)

In [11]:
# Create the combined column by concatenating the 'source' and 'label' columns
combined_df['source_label'] = combined_df['source'] + '_' + combined_df['label']

# Display the first few rows to verify the new column
print(combined_df[['source', 'label', 'source_label']].head())

          source label        source_label
0  LLM Generated  Fake  LLM Generated_Fake
1  LLM Generated  Fake  LLM Generated_Fake
2  LLM Generated  Fake  LLM Generated_Fake
3  LLM Generated  Fake  LLM Generated_Fake
4  LLM Generated  Fake  LLM Generated_Fake


In [12]:
combined_df.head()

Unnamed: 0,id,description,text,title,platform,source,label,source_label
0,politifact11773,Republican attacks on transgendered Americans ...,inia State Representative Mark Cole's proposed...,Virginia Republican Introduces Controversial B...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake
1,politifact13827,Whoopi Goldberg is in hot water after comments...,Whoopi Goldberg has found herself in the middl...,Whoopi Goldberg Faces Backlash for Disrespectf...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake
2,politifact13570,"Washington, DC — A former Secret Service agent...",A former Secret Service agent has written a ne...,Former Secret Service Agent Exposes Shocking S...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake
3,politifact14947,Bill Clinton’s hitman has confessed to more th...,In what appears to be a major blow to the cred...,Hannity Issues Retraction on False Story: Bill...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake
4,politifact14517,Scott&#8217;s prognosis isn&#8217;t good. (via...,"In a shocking turn of events, Florida Governor...",Florida Governor Rick Scott Miraculously Recov...,PolitiFact,LLM Generated,Fake,LLM Generated_Fake


#### Text Analysis Before Cleaning

##### Readibility score (before)

In [13]:
pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.4


In [14]:
import textstat

# Calculate readability scores before text cleaning
combined_df['description_readability_before'] = combined_df['description'].apply(textstat.flesch_reading_ease)
combined_df['text_readability_before'] = combined_df['text'].apply(textstat.flesch_reading_ease)
combined_df['title_readability_before'] = combined_df['title'].apply(textstat.flesch_reading_ease)

# Display the first few rows to verify the new columns
print(combined_df[['description', 'description_readability_before', 'text', 'text_readability_before', 'title', 'title_readability_before']].head())


                                         description  \
0  Republican attacks on transgendered Americans ...   
1  Whoopi Goldberg is in hot water after comments...   
2  Washington, DC — A former Secret Service agent...   
3  Bill Clinton’s hitman has confessed to more th...   
4  Scott&#8217;s prognosis isn&#8217;t good. (via...   

   description_readability_before  \
0                           15.65   
1                           65.93   
2                           43.74   
3                           68.30   
4                           14.63   

                                                text  text_readability_before  \
0  inia State Representative Mark Cole's proposed...                    36.12   
1  Whoopi Goldberg has found herself in the middl...                    45.19   
2  A former Secret Service agent has written a ne...                    50.57   
3  In what appears to be a major blow to the cred...                    35.95   
4  In a shocking turn of events, Fl

##### Text Length (before)

In [15]:
# Calculate text length before cleaning
combined_df['title_length_before'] = combined_df['title'].apply(len)
combined_df['description_length_before'] = combined_df['description'].apply(len)
combined_df['text_length_before'] = combined_df['text'].apply(len)

## Text Cleaning

In [16]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Ensure you have downloaded the required NLTK data
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [17]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

def text_preprocessing(text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Work on a copy of the text to avoid modifying the original
    processed_text = text.lower()

    # Remove URLs
    processed_text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', processed_text)

    # Remove text enclosed in square brackets
    processed_text = re.sub(r'\[.*?\]', '', processed_text)

    # Remove text enclosed in angle brackets
    processed_text = re.sub(r'<.*?>+', '', processed_text)

    # Remove punctuation characters
    processed_text = re.sub(r'[%s]' % re.escape(string.punctuation), '', processed_text)

    # Remove newline characters
    processed_text = re.sub(r'\n', '', processed_text)

    # Remove substrings of digits surrounded by word characters
    processed_text = re.sub(r'\w*\d\w*', '', processed_text)

    # Remove emails
    processed_text = re.sub(r'@\S+', '', processed_text)

    # Remove numbers
    processed_text = re.sub(r'\b\d+\b', '', processed_text)

    # Remove repeating characters
    processed_text = re.sub(r'(.)\1+', r'\1', processed_text)

    # Tokenize, remove stopwords, stem, and lemmatize
    words = word_tokenize(processed_text)
    words = [word for word in words if word not in stop_words and len(word) > 1]
    words = [stemmer.stem(word) for word in words]
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the processed words back into a single string
    processed_text = " ".join(words)

    return processed_text

### Apply the text_preprocessing function on title, description, and text columns

In [18]:
# Apply the cleaning function on title, description, and text columns
combined_df['cleaned_description'] = combined_df['description'].apply(text_preprocessing)
combined_df['cleaned_text'] = combined_df['text'].apply(text_preprocessing)
combined_df['cleaned_title'] = combined_df['title'].apply(text_preprocessing)

## Text Length (after)

In [19]:
# Calculate text length before cleaning
combined_df['title_length_after'] = combined_df['title'].apply(len)
combined_df['description_length_after'] = combined_df['description'].apply(len)
combined_df['text_length_after'] = combined_df['text'].apply(len)

## Sentiment Analysis

### Sentiment (before)

In [20]:
from textblob import TextBlob

# Function to calculate sentiment polarity
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

combined_df['title_sentiment'] = combined_df['title'].apply(get_sentiment)
combined_df['text_sentiment'] = combined_df['text'].apply(get_sentiment)
combined_df['description_sentiment'] = combined_df['description'].apply(get_sentiment)

### Sentiment (After)

In [21]:
# Apply sentiment analysis on cleaned data
combined_df['cleaned_title_sentiment'] = combined_df['cleaned_title'].apply(get_sentiment)
combined_df['cleaned_text_sentiment'] = combined_df['cleaned_text'].apply(get_sentiment)
combined_df['cleaned_description_sentiment'] = combined_df['cleaned_description'].apply(get_sentiment)


## Readability Score (After)

In [22]:
# Calculate readability scores after text cleaning
combined_df['description_readability_after'] = combined_df['cleaned_description'].apply(textstat.flesch_reading_ease)
combined_df['text_readability_after'] = combined_df['cleaned_text'].apply(textstat.flesch_reading_ease)
combined_df['title_readability_after'] = combined_df['cleaned_title'].apply(textstat.flesch_reading_ease)

In [23]:
combined_df.columns

Index(['id', 'description', 'text', 'title', 'platform', 'source', 'label',
       'source_label', 'description_readability_before',
       'text_readability_before', 'title_readability_before',
       'title_length_before', 'description_length_before',
       'text_length_before', 'cleaned_description', 'cleaned_text',
       'cleaned_title', 'title_length_after', 'description_length_after',
       'text_length_after', 'title_sentiment', 'text_sentiment',
       'description_sentiment', 'cleaned_title_sentiment',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'description_readability_after', 'text_readability_after',
       'title_readability_after'],
      dtype='object')

## Model Preparation

#### Scaling (Standardisation)

In [24]:
# Import the StandardScaler class from scikit-learn
from sklearn.preprocessing import StandardScaler

# List of columns in the DataFrame that need to be standardized
columns_to_standardize = [
    'text_length_before',  # Length of text before cleaning
    'description_length_before',  # Length of description before cleaning
    'title_length_before',  # Length of title before cleaning
    'text_length_after',  # Length of text after cleaning
    'description_length_after',  # Length of description after cleaning
    'title_length_after',  # Length of title after cleaning
    'text_readability_before',  # Readability score of text before cleaning
    'description_readability_before',  # Readability score of description before cleaning
    'title_readability_before',  # Readability score of title before cleaning
    'cleaned_text_sentiment',  # Sentiment score of cleaned text
    'cleaned_description_sentiment',  # Sentiment score of cleaned description
    'cleaned_title_sentiment'  # Sentiment score of cleaned title
]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the selected columns and transform them
combined_df[columns_to_standardize] = scaler.fit_transform(combined_df[columns_to_standardize])

# Print the first few rows of the standardized columns to verify the transformation
print(combined_df[columns_to_standardize].head())

   text_length_before  description_length_before  title_length_before  \
0           -0.158926                   2.020107             0.004766   
1           -0.330207                   1.981057            -0.007190   
2           -0.239516                   1.239106            -0.063983   
3           -0.335580                   4.158095             0.034656   
4           -0.269173                  -0.654819             0.013733   

   text_length_after  description_length_after  title_length_after  \
0          -0.158926                  2.020107            0.004766   
1          -0.330207                  1.981057           -0.007190   
2          -0.239516                  1.239106           -0.063983   
3          -0.335580                  4.158095            0.034656   
4          -0.269173                 -0.654819            0.013733   

   text_readability_before  description_readability_before  \
0                -2.340435                       -2.131953   
1               

## Split the daraframe

In [25]:
# Split the combined_df dataframe by platform
# Create a separate dataframe for each platform
gossipcop_df = combined_df[combined_df['platform'] == 'GossipCop']
politifact_df = combined_df[combined_df['platform'] == 'PolitiFact']

# Print the shapes of the new dataframes
print(f"GossipCop dataframe shape: {gossipcop_df.shape}")
print(f"PolitiFact dataframe shape: {politifact_df.shape}")

# Print the shape of the combined dataframe
print(f"Combined dataframe shape: {combined_df.shape}")

GossipCop dataframe shape: (20505, 29)
PolitiFact dataframe shape: (519, 29)
Combined dataframe shape: (21024, 29)


In [26]:
combined_df.columns

Index(['id', 'description', 'text', 'title', 'platform', 'source', 'label',
       'source_label', 'description_readability_before',
       'text_readability_before', 'title_readability_before',
       'title_length_before', 'description_length_before',
       'text_length_before', 'cleaned_description', 'cleaned_text',
       'cleaned_title', 'title_length_after', 'description_length_after',
       'text_length_after', 'title_sentiment', 'text_sentiment',
       'description_sentiment', 'cleaned_title_sentiment',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'description_readability_after', 'text_readability_after',
       'title_readability_after'],
      dtype='object')

### Dropping the columns

In [27]:
# List of columns to keep, which includes the scaled columns and the 'source_label' column
columns_to_keep = columns_to_standardize + ['label', 'source', 'cleaned_text']

# Create a new DataFrame with only the selected columns
combined_df1 = combined_df[columns_to_keep]
gossipcop_df1 = gossipcop_df[columns_to_keep]
politifact_df1 = politifact_df[columns_to_keep]

# Print the first few rows of the new DataFrame to verify
print(combined_df1.head())
print(gossipcop_df1.head())
print(politifact_df1.head())

   text_length_before  description_length_before  title_length_before  \
0           -0.158926                   2.020107             0.004766   
1           -0.330207                   1.981057            -0.007190   
2           -0.239516                   1.239106            -0.063983   
3           -0.335580                   4.158095             0.034656   
4           -0.269173                  -0.654819             0.013733   

   text_length_after  description_length_after  title_length_after  \
0          -0.158926                  2.020107            0.004766   
1          -0.330207                  1.981057           -0.007190   
2          -0.239516                  1.239106           -0.063983   
3          -0.335580                  4.158095            0.034656   
4          -0.269173                 -0.654819            0.013733   

   text_readability_before  description_readability_before  \
0                -2.340435                       -2.131953   
1               

### One hot Encoding

In [28]:
combined_df1.columns

Index(['text_length_before', 'description_length_before',
       'title_length_before', 'text_length_after', 'description_length_after',
       'title_length_after', 'text_readability_before',
       'description_readability_before', 'title_readability_before',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'cleaned_title_sentiment', 'label', 'source', 'cleaned_text'],
      dtype='object')

In [29]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse=False, dtype=int)  # Setting dtype=int to ensure integer output

# Fit and transform both 'label' and 'source' columns
encoded_columns = encoder.fit_transform(combined_df1[['label', 'source']])

# Convert the encoded columns into a DataFrame with proper column names
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(['label', 'source']))

# Concatenate the encoded columns back to the original dataframe
combined_df1 = pd.concat([combined_df1, encoded_df], axis=1)

# Optionally, drop the original 'label' and 'source' columns
combined_df1 = combined_df1.drop(['label', 'source'], axis=1)



In [30]:
combined_df1.columns

Index(['text_length_before', 'description_length_before',
       'title_length_before', 'text_length_after', 'description_length_after',
       'title_length_after', 'text_readability_before',
       'description_readability_before', 'title_readability_before',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'cleaned_title_sentiment', 'cleaned_text', 'label_Fake', 'label_Real',
       'source_Human Written', 'source_LLM Generated'],
      dtype='object')

In [31]:
# Dropping the columns related to the target
combined_df1 = combined_df1.drop(['label_Real','source_Human Written'], axis=1)

In [32]:
combined_df1.columns

Index(['text_length_before', 'description_length_before',
       'title_length_before', 'text_length_after', 'description_length_after',
       'title_length_after', 'text_readability_before',
       'description_readability_before', 'title_readability_before',
       'cleaned_text_sentiment', 'cleaned_description_sentiment',
       'cleaned_title_sentiment', 'cleaned_text', 'label_Fake',
       'source_LLM Generated'],
      dtype='object')

### TF-IDF

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # Adjust max_features as needed

# Apply TF-IDF on the 'cleaned_text' column
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_df1['cleaned_text'])

# Convert the TF-IDF matrix to a DataFrame for easier handling
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine the TF-IDF features with the original DataFrame (excluding 'cleaned_text')
combined_df1_tfidf = pd.concat([combined_df1.drop('cleaned_text', axis=1).reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Display the first few rows of the updated DataFrame
print(combined_df1_tfidf.head())

   text_length_before  description_length_before  title_length_before  \
0           -0.158926                   2.020107             0.004766   
1           -0.330207                   1.981057            -0.007190   
2           -0.239516                   1.239106            -0.063983   
3           -0.335580                   4.158095             0.034656   
4           -0.269173                  -0.654819             0.013733   

   text_length_after  description_length_after  title_length_after  \
0          -0.158926                  2.020107            0.004766   
1          -0.330207                  1.981057           -0.007190   
2          -0.239516                  1.239106           -0.063983   
3          -0.335580                  4.158095            0.034656   
4          -0.269173                 -0.654819            0.013733   

   text_readability_before  description_readability_before  \
0                -2.340435                       -2.131953   
1               

In [34]:
combined_df1_tfidf.shape

(21024, 1014)

In [35]:
combined_df1_tfidf.columns

Index(['text_length_before', 'description_length_before',
       'title_length_before', 'text_length_after', 'description_length_after',
       'title_length_after', 'text_readability_before',
       'description_readability_before', 'title_readability_before',
       'cleaned_text_sentiment',
       ...
       'world', 'worth', 'write', 'wrong', 'wrote', 'ye', 'year', 'york',
       'young', 'youtub'],
      dtype='object', length=1014)

##### Distribution of the target column

In [36]:
class_distribution = combined_df1['label_Fake'].value_counts()
print(class_distribution)

label_Fake
0    12662
1     8362
Name: count, dtype: int64


## Dataset Splitting

In [37]:
from sklearn.model_selection import train_test_split

df = combined_df1_tfidf

# Define X (features) and y (target)
X = df.drop('label_Fake', axis=1)  # Features (all columns except the target)
y = df['label_Fake']  # Target column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Print the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (14716, 1013)
X_test shape: (6308, 1013)
y_train shape: (14716,)
y_test shape: (6308,)


### Balancing the training set

In [38]:
from collections import Counter

# Check the distribution of classes in y_train
class_distribution = Counter(y_train)
print(f"Original class distribution in y_train: {class_distribution}")

Original class distribution in y_train: Counter({0: 8863, 1: 5853})


In [39]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the new class distribution
from collections import Counter
print(f"Resampled target distribution: {Counter(y_train_resampled)}")

Resampled target distribution: Counter({0: 8863, 1: 8863})


# The Machine Learning Models

## Train the SVM Model

In [40]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Initialize the Support Vector Classifier
svm_model = SVC(random_state=42)

# Fit the model to the resampled training data
svm_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model with metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.855580215599239
Precision: 0.8573345259391771
Recall: 0.7640494220805102
F1 Score: 0.8080084299262381


## Tuning the SVM Model using GridSearchCV

In [41]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for tuning
param_grid = {
    'C': [1, 0.1],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the grid search
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters
print(f"Best parameters: {grid_search.best_params_}")

# Get the best cross-validation score
best_cv_score = grid_search.best_score_
print(f"Best cross-validation F1 score: {best_cv_score}")

# Make predictions using the best model
best_svm_model = grid_search.best_estimator_
y_pred_best = best_svm_model.predict(X_test)

# Re-evaluate the model with the best parameters
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)

print(f"Best Accuracy: {accuracy_best}")
print(f"Best Precision: {precision_best}")
print(f"Best Recall: {recall_best}")
print(f"Best F1 Score: {f1_best}")

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best cross-validation F1 score: 0.8439820516384691
Best Accuracy: 0.855580215599239
Best Precision: 0.8573345259391771
Best Recall: 0.7640494220805102
Best F1 Score: 0.8080084299262381


## Evaluate Fairness Metrics

In [42]:
pip install fairlearn

Collecting fairlearn
  Downloading fairlearn-0.10.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.10.0-py3-none-any.whl (234 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/234.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m234.1/234.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fairlearn
Successfully installed fairlearn-0.10.0


In [43]:
from fairlearn.metrics import demographic_parity_difference

# Assuming source_LLM_Generated is in your test set
sensitive_attribute = X_test['source_LLM Generated']

dpd = demographic_parity_difference(y_test, y_pred_best, sensitive_features=sensitive_attribute)
print(f"Demographic Parity Difference: {dpd}")

Demographic Parity Difference: 0.250526028351923


In [44]:
from fairlearn.metrics import equalized_odds_difference

eod = equalized_odds_difference(y_test, y_pred_best, sensitive_features=sensitive_attribute)
print(f"Equalized Odds Difference: {eod}")

Equalized Odds Difference: 0.33539507673219493


In [48]:
from fairlearn.metrics import true_positive_rate_difference

# Assuming source_LLM_Generated is in your test set
sensitive_attribute = X_test['source_LLM Generated']

# Calculate Equal Opportunity Difference (True Positive Rate Difference)
eop_diff = true_positive_rate_difference(y_test, y_pred_best, sensitive_features=sensitive_attribute)
print(f"Equal Opportunity Difference: {eop_diff}")

Equal Opportunity Difference: 0.33539507673219493


In [45]:
import pandas as pd

# Create a dataframe for easier group-based evaluation
results_df = pd.DataFrame({'y_true': y_test, 'y_pred': y_pred_best, 'sensitive_attr': sensitive_attribute})

# Calculate the metrics for each group
group_metrics = {}
for group in results_df['sensitive_attr'].unique():
    group_data = results_df[results_df['sensitive_attr'] == group]
    accuracy_group = accuracy_score(group_data['y_true'], group_data['y_pred'])
    precision_group = precision_score(group_data['y_true'], group_data['y_pred'])
    recall_group = recall_score(group_data['y_true'], group_data['y_pred'])
    f1_group = f1_score(group_data['y_true'], group_data['y_pred'])

    group_metrics[group] = {
        'Accuracy': accuracy_group,
        'Precision': precision_group,
        'Recall': recall_group,
        'F1': f1_group
    }

# Calculate group differences
accuracy_group_diff = max([v['Accuracy'] for v in group_metrics.values()]) - min([v['Accuracy'] for v in group_metrics.values()])
precision_group_diff = max([v['Precision'] for v in group_metrics.values()]) - min([v['Precision'] for v in group_metrics.values()])
recall_group_diff = max([v['Recall'] for v in group_metrics.values()]) - min([v['Recall'] for v in group_metrics.values()])
f1_group_diff = max([v['F1'] for v in group_metrics.values()]) - min([v['F1'] for v in group_metrics.values()])

print(f"Accuracy Group Difference: {accuracy_group_diff}")
print(f"Precision Group Difference: {precision_group_diff}")
print(f"Recall Group Difference: {recall_group_diff}")
print(f"F1 Group Difference: {f1_group_diff}")


Accuracy Group Difference: 0.12252691288367401
Precision Group Difference: 0.1710509567652425
Recall Group Difference: 0.33539507673219493
F1 Group Difference: 0.2634686309942621


## Apply Adversarial Debiasing

In [46]:
pip install aif360

Collecting aif360
  Downloading aif360-0.6.1-py3-none-any.whl.metadata (5.0 kB)
Downloading aif360-0.6.1-py3-none-any.whl (259 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/259.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.7/259.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aif360
Successfully installed aif360-0.6.1


In [47]:
from aif360.algorithms.inprocessing import AdversarialDebiasing
from aif360.datasets import BinaryLabelDataset
import tensorflow as tf
from sklearn.metrics import accuracy_score
from fairlearn.metrics import (demographic_parity_difference, equalized_odds_difference,
                               true_positive_rate_difference, false_positive_rate_difference,
                               false_negative_rate_difference)

# Convert your data into the AIF360 dataset format
dataset_train = BinaryLabelDataset(df=pd.concat([X_train_resampled, y_train_resampled], axis=1),
                                   label_names=['label_Fake'],
                                   protected_attribute_names=['source_LLM Generated'])

# Apply adversarial debiasing
# Enable TensorFlow 1.x compatibility mode
tf.compat.v1.disable_eager_execution()

# Create a TensorFlow session using the compatible method
sess = tf.compat.v1.Session()

# Note: No 'protected_attribute_names' argument is needed
adversarial_model = AdversarialDebiasing(privileged_groups=[{'source_LLM Generated': 1}],
                                         unprivileged_groups=[{'source_LLM Generated': 0}],
                                         scope_name='debiasing_classifier',
                                         debias=True,
                                         sess=sess)

# Fit the model
adversarial_model.fit(dataset_train)

# Make predictions and evaluate the debiased model
dataset_test = BinaryLabelDataset(df=pd.concat([X_test, y_test], axis=1),
                                  label_names=['label_Fake'],
                                  protected_attribute_names=['source_LLM Generated'])

debiased_predictions = adversarial_model.predict(dataset_test)

# Extract the debiased predictions and true labels
y_pred_debiased = debiased_predictions.labels.ravel()  # Predictions from the debiased model
y_true = dataset_test.labels.ravel()  # True labels

# Sensitive feature
sensitive_features = X_test['source_LLM Generated']

# Overall accuracy
overall_accuracy = accuracy_score(y_true, y_pred_debiased)
print(f"Overall Accuracy: {overall_accuracy:.4f}")

# Fairness metrics
dp_diff = demographic_parity_difference(y_true, y_pred_debiased, sensitive_features=sensitive_features)
print("Demographic Parity Difference:", dp_diff)

eo_diff = equalized_odds_difference(y_true, y_pred_debiased, sensitive_features=sensitive_features)
print("Equalized Odds Difference:", eo_diff)

tpr_diff = true_positive_rate_difference(y_true, y_pred_debiased, sensitive_features=sensitive_features)
print("Equal Opportunity Difference (TPR):", tpr_diff)

fpr_diff = false_positive_rate_difference(y_true, y_pred_debiased, sensitive_features=sensitive_features)
print("False Positive Rate Difference:", fpr_diff)

fnr_diff = false_negative_rate_difference(y_true, y_pred_debiased, sensitive_features=sensitive_features)
print("False Negative Rate Difference:", fnr_diff)

pip install 'aif360[inFairness]'
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


epoch 0; iter: 0; batch classifier loss: 0.717024; batch adversarial loss: 0.678507
epoch 1; iter: 0; batch classifier loss: 0.318547; batch adversarial loss: 0.667179
epoch 2; iter: 0; batch classifier loss: 0.284559; batch adversarial loss: 0.642345
epoch 3; iter: 0; batch classifier loss: 0.231801; batch adversarial loss: 0.646956
epoch 4; iter: 0; batch classifier loss: 0.298731; batch adversarial loss: 0.622940
epoch 5; iter: 0; batch classifier loss: 0.241356; batch adversarial loss: 0.642276
epoch 6; iter: 0; batch classifier loss: 0.255046; batch adversarial loss: 0.627487
epoch 7; iter: 0; batch classifier loss: 0.176921; batch adversarial loss: 0.633664
epoch 8; iter: 0; batch classifier loss: 0.232339; batch adversarial loss: 0.657970
epoch 9; iter: 0; batch classifier loss: 0.236841; batch adversarial loss: 0.626328
epoch 10; iter: 0; batch classifier loss: 0.192540; batch adversarial loss: 0.604317
epoch 11; iter: 0; batch classifier loss: 0.144220; batch adversarial loss:

In [50]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Overall Accuracy
accuracy_debiased = accuracy_score(y_true, y_pred_debiased)
print(f"Accuracy: {accuracy_debiased:.4f}")

# Overall Precision (weighted to handle class imbalance)
precision_debiased = precision_score(y_true, y_pred_debiased, average='weighted')
print(f"Precision: {precision_debiased:.4f}")

# Overall Recall (weighted to handle class imbalance)
recall_debiased = recall_score(y_true, y_pred_debiased, average='weighted')
print(f"Recall: {recall_debiased:.4f}")

# Overall F1-Score (weighted to handle class imbalance)
f1_debiased = f1_score(y_true, y_pred_debiased, average='weighted')
print(f"F1-Score: {f1_debiased:.4f}")

Accuracy: 0.8464
Precision: 0.8472
Recall: 0.8464
F1-Score: 0.8467


In [49]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from fairlearn.metrics import MetricFrame

# Define metrics for evaluation
metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score
}

# Create a MetricFrame to evaluate the debiased model's performance by group
metric_frame = MetricFrame(
    metrics=metrics,
    y_true=y_true,
    y_pred=y_pred_debiased,
    sensitive_features=sensitive_features
)

# Display overall group-wise metrics
print("Fairness Metrics by Group for Debiased Model:")
print(metric_frame.by_group)

# Calculate differences between groups
accuracy_diff = metric_frame.difference(method='between_groups')['accuracy']
precision_diff = metric_frame.difference(method='between_groups')['precision']
recall_diff = metric_frame.difference(method='between_groups')['recall']
f1_diff = metric_frame.difference(method='between_groups')['f1']

# Print the differences for each metric
print(f"Accuracy Difference: {accuracy_diff:.4f}")
print(f"Precision Difference: {precision_diff:.4f}")
print(f"Recall Difference: {recall_diff:.4f}")
print(f"F1-Score Difference: {f1_diff:.4f}")

Fairness Metrics by Group for Debiased Model:
                      accuracy  precision    recall        f1
source_LLM Generated                                         
0                     0.783562   0.656796  0.702110  0.678698
1                     0.940571   0.947577  0.934221  0.940852
Accuracy Difference: 0.1570
Precision Difference: 0.2908
Recall Difference: 0.2321
F1-Score Difference: 0.2622
