Data Ingestion and Aggregation

This notebook pulls together data from multiple sources and combines them into the final datasets used in the project for training and testing our models. The final datasets can be found in the following Google Drive folder: https://drive.google.com/drive/folders/13H72zfO7sIyxJa0QOCxKjza3dZhdyBhn

In [398]:
import csv
import json
import os
import re
import string
import unicodedata

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [16]:
# Root folder that holds the raw dataset sub-folders and the 'Combined' output folder.
# Set the ST456_DATA_DIR environment variable to run the notebook on another machine;
# the original author's location is kept as the fallback so existing runs are unchanged.
file_path = os.environ.get(
    'ST456_DATA_DIR',
    '/Users/alex_fassone/Documents/MSc Statistics/ST456/Coursework/Project/Data',
)

LIAR


Information - https://datasets.activeloop.ai/docs/ml/datasets/liar-dataset/
Data Download - https://www.cs.ucsb.edu/~william/data/liar_dataset.zip

In [20]:
# Folder containing the LIAR split files
file_path_liar = f"{file_path}/Liar"

In [21]:
# Load one LIAR split (tab-separated, no header row) into a pandas DataFrame
def load_liar_split(filename, base_dir=None):
    """Read one LIAR split file into a DataFrame with named columns.

    Parameters
    ----------
    filename : str
        File name of the split, e.g. "train.tsv".
    base_dir : str, optional
        Folder containing the file. Defaults to the module-level ``file_path_liar``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file; the 14 column names below are assigned
        by position because the file is read without a header row.
    """
    if base_dir is None:
        base_dir = file_path_liar
    path = os.path.join(base_dir, filename)
    columns = [
        "id", "label", "statement", "subject", "speaker", "job_title",
        "state_info", "party_affiliation", "barely_true_counts",
        "false_counts", "half_true_counts", "mostly_true_counts",
        "pants_on_fire_counts", "context"
    ]
    # Treat quote characters as ordinary data. With the default quoting, a field that
    # opens with '"' and is never closed swallows the following tabs and newlines, so
    # several lines are silently merged into one row and the split comes out short.
    df = pd.read_csv(path, sep="\t", names=columns, quoting=csv.QUOTE_NONE)
    return df

In [22]:
# Read the three LIAR splits (train, test, validation) in one pass
liar_train_df, liar_test_df, liar_val_df = (
    load_liar_split(split_file) for split_file in ("train.tsv", "test.tsv", "valid.tsv")
)

In [123]:
len(liar_val_df)/(len(liar_test_df) + len(liar_train_df) + len(liar_val_df))

0.1003830818544289

In [23]:
# Stack the train, test and validation rows into one frame with a fresh 0..n-1 index
liar_splits = [liar_train_df, liar_test_df, liar_val_df]
liar_combined_df = pd.concat(liar_splits, ignore_index=True)

# Preview the first rows
print(liar_combined_df.head())

           id        label                                          statement  \
0   2635.json        false  Says the Annies List political group supports ...   
1  10540.json    half-true  When did the decline of coal start? It started...   
2    324.json  mostly-true  Hillary Clinton agrees with John McCain "by vo...   
3   1123.json        false  Health care reform legislation is likely to ma...   
4   9028.json    half-true  The economic turnaround started at the end of ...   

                              subject         speaker             job_title  \
0                            abortion    dwayne-bohac  State representative   
1  energy,history,job-accomplishments  scott-surovell        State delegate   
2                      foreign-policy    barack-obama             President   
3                         health-care    blog-posting                   NaN   
4                        economy,jobs   charlie-crist                   NaN   

  state_info party_affiliation  barely

In [24]:
# Collapse the LIAR truthfulness ratings into a binary target
def binary_label(label):
    """Return 1 for "true", "mostly-true" or "half-true"; 0 for any other value."""
    truthful_ratings = ("true", "mostly-true", "half-true")
    return 1 if label in truthful_ratings else 0

# Derive the binary target from the original rating, element by element
liar_combined_df["binary_label"] = liar_combined_df["label"].map(binary_label)

# Compare the original rating with the derived target on the first rows
print(liar_combined_df[["label", "binary_label"]].head())

         label  binary_label
0        false             0
1    half-true             1
2  mostly-true             1
3        false             0
4    half-true             1


In [25]:
# Tag every row with its source dataset and expose the statement text as 'body'
liar_combined_df = (
    liar_combined_df
    .assign(dataset="LIAR")
    .rename(columns={"statement": "body"})
)

# Reduce to the columns shared with the other datasets
liar_simple_columns = ["dataset", "body", "binary_label"]
liar_simple_df = liar_combined_df[liar_simple_columns]

# Preview and row count
print(liar_simple_df.head())
print(len(liar_simple_df))

  dataset                                               body  binary_label
0    LIAR  Says the Annies List political group supports ...             0
1    LIAR  When did the decline of coal start? It started...             1
2    LIAR  Hillary Clinton agrees with John McCain "by vo...             1
3    LIAR  Health care reform legislation is likely to ma...             0
4    LIAR  The economic turnaround started at the end of ...             1
12791


LIAR 2 

Information - https://paperswithcode.com/dataset/liar2

In [26]:
# Folder containing the LIAR 2 split files
file_path_liar_2 = f"{file_path}/Liar 2"

In [27]:
def load_liar_2_split(filename):
    """Read one LIAR 2 split (comma-separated) from ``file_path_liar_2``.

    The file's own header row is discarded (``header=0``) and replaced by the
    16 column names listed below, assigned by position.
    """
    liar_2_columns = [
        "id", "label", "statement", "date", "subject", "speaker", "speaker_description",
        "state_info", "true_counts", "mostly_true_counts", "half_true_counts",
        "mostly_false_counts", "false_counts", "pants_on_fire_counts", "context", "justification"
    ]
    split_path = os.path.join(file_path_liar_2, filename)
    # Default comma separator
    return pd.read_csv(split_path, names=liar_2_columns, header=0)

In [28]:
# Read the three LIAR 2 splits (train, test, validation)
liar_2_train_df, liar_2_test_df, liar_2_val_df = (
    load_liar_2_split(split_file) for split_file in ("train.csv", "test.csv", "valid.csv")
)

# Stack them into one frame with a fresh 0..n-1 index
liar_2_splits = [liar_2_train_df, liar_2_test_df, liar_2_val_df]
liar_2_combined_df = pd.concat(liar_2_splits, ignore_index=True)

# Preview the first rows
print(liar_2_combined_df.head())

      id  label                                          statement  \
0  13847      5  90 percent of Americans "support universal bac...   
1  13411      1  Last year was one of the deadliest years ever ...   
2  10882      0  Bernie Sanders's plan is "to raise your taxes ...   
3  20697      4  Voter ID is supported by an overwhelming major...   
4   6095      2  Says Barack Obama "robbed Medicare (of) $716 b...   

               date                                            subject  \
0   October 2, 2017  government regulation;polls and public opinion...   
1      May 19, 2017  after the fact;congress;criminal justice;histo...   
2  October 28, 2015                                              taxes   
3  December 8, 2021                                      voter id laws   
4   August 12, 2012         federal budget;history;medicare;retirement   

          speaker                                speaker_description  \
0     chris abele  Chris Abele is Milwaukee County Executive, 

In [30]:
# Collapse the numeric LIAR 2 rating into a binary target
def binary_label_liar_2(label):
    """Return 0 when ``label`` equals 0, 1 or 2; return 1 for every other value.

    Note that "every other value" includes missing values (NaN / None), which
    therefore map to 1.
    """
    low_ratings = (0, 1, 2)
    return 0 if label in low_ratings else 1

# Derive the binary target from the numeric rating, element by element
liar_2_combined_df["binary_label"] = liar_2_combined_df["label"].map(binary_label_liar_2)

# Compare the original rating with the derived target on the first rows
print(liar_2_combined_df[["label", "binary_label"]].head())

   label  binary_label
0      5             1
1      1             0
2      0             0
3      4             1
4      2             0


In [290]:
# Tag every row with its source dataset ('LIAR 2') and expose the statement text as 'body'
liar_2_combined_df = (
    liar_2_combined_df
    .assign(dataset="LIAR 2")
    .rename(columns={"statement": "body"})
)

# Keep the shared columns plus the original multi-class 'label'
liar_2_simple_columns = ["dataset", "body", "binary_label", "label"]
liar_2_simple_df = liar_2_combined_df[liar_2_simple_columns]

# Preview and row count
print(liar_2_simple_df.head())
print(len(liar_2_simple_df))

  dataset                                               body  binary_label  \
0  LIAR 2  90 percent of Americans "support universal bac...             1   
1  LIAR 2  Last year was one of the deadliest years ever ...             0   
2  LIAR 2  Bernie Sanders's plan is "to raise your taxes ...             0   
3  LIAR 2  Voter ID is supported by an overwhelming major...             1   
4  LIAR 2  Says Barack Obama "robbed Medicare (of) $716 b...             0   

   label  
0      5  
1      1  
2      0  
3      4  
4      2  
22962


Fakeddit

Information - download here https://drive.google.com/drive/folders/1qYgeupmblRZDsUNaasJEtIJ6Sv-PEOlF

In [66]:
# Folder containing the Fakeddit TSV files
file_path_fakeddit = f"{file_path}/Fakeddit"

In [72]:
def load_fakeddit_split(filename):
    """Read one tab-separated Fakeddit file from ``file_path_fakeddit``, using its own header row."""
    return pd.read_csv(os.path.join(file_path_fakeddit, filename), sep="\t")

In [87]:
# Only the training split is loaded; the test and validation loads are left disabled.
fakeddit_train_df = load_fakeddit_split("all_train.tsv")
#fakeddit_test_df = load_fakeddit_split("all_test_public.tsv")
#fakeddit_val_df = load_fakeddit_split("all_validate.tsv")

# Concatenate the loaded split(s) with a fresh 0..n-1 index
# (add the other frames to this list if they are re-enabled above)
fakeddit_combined_df = pd.concat([fakeddit_train_df], ignore_index=True)

# Keep only the columns used downstream. .copy() makes this an independent frame so
# that later cells can add or rename columns without a SettingWithCopyWarning.
fakeddit_combined_df = fakeddit_combined_df[["id", "clean_title", "2_way_label"]].copy()

# Show the first few rows
print(fakeddit_combined_df.head())

        id                                        clean_title  2_way_label
0   awxhir  my walgreens offbrand mucinex was engraved wit...            1
1  cvm5uy4                                                NaN            0
2   98pbid                this concerned sink with a tiny hat            0
3   6f2cy5      hackers leak emails from uae ambassador to us            1
4  cc5cbon                                                NaN            0


In [89]:
# Tag every row with its source dataset ('Fakeddit') and align the column names with
# the other datasets: 'clean_title' -> 'title', '2_way_label' -> 'binary_label'.
# assign/rename return new frames, so nothing is written into a slice of another frame.
fakeddit_combined_df = (
    fakeddit_combined_df
    .assign(dataset="Fakeddit")
    .rename(columns={"clean_title": "title", "2_way_label": "binary_label"})
)

# Create a new DataFrame with only the requested columns
fakeddit_simple_df = fakeddit_combined_df[["dataset", "title", "binary_label"]]

# Rows without a title carry no text for this dataset, so drop them
fakeddit_simple_df = fakeddit_simple_df.dropna(subset=['title'])

# Show the result
print(fakeddit_simple_df.head())
print(len(fakeddit_simple_df))

    dataset                                              title  binary_label
0  Fakeddit  my walgreens offbrand mucinex was engraved wit...             1
2  Fakeddit                this concerned sink with a tiny hat             0
3  Fakeddit      hackers leak emails from uae ambassador to us             1
5  Fakeddit                     this flower in my neighborhood             1
6  Fakeddit                           puppy taking in the view             1
802789


Kaggle 1 - Fake News Dataset

Information - https://www.kaggle.com/datasets/abaghyangor/fake-news-dataset/data

In [35]:
# Folder containing True.csv and Fake.csv
file_path_kaggle_1_fake_news = f"{file_path}/Kaggle-1-Fake-News-Dataset"

In [36]:
def load_kaggle_1_fake_news_split(filename):
    """Read one comma-separated file from ``file_path_kaggle_1_fake_news``.

    The file's header row is discarded (``header=0``) and the columns are named
    "title", "text", "subject", "date" by position.
    """
    csv_path = os.path.join(file_path_kaggle_1_fake_news, filename)
    # Default comma separator
    return pd.read_csv(csv_path, names=["title", "text", "subject", "date"], header=0)
    

In [37]:
# Real articles come from True.csv, fake ones from Fake.csv
kaggle_1_fake_news_true_df, kaggle_1_fake_news_false_df = (
    load_kaggle_1_fake_news_split(csv_name) for csv_name in ("True.csv", "Fake.csv")
)

In [38]:
# The label is implied by the source file: 1 for True.csv rows, 0 for Fake.csv rows
for frame, truth_value in (
    (kaggle_1_fake_news_true_df, 1),
    (kaggle_1_fake_news_false_df, 0),
):
    frame["binary_label"] = truth_value


In [39]:
# Stack the real and fake articles, expose the article text as 'body' and tag the source
kaggle_1_fake_news_combined_df = (
    pd.concat(
        [kaggle_1_fake_news_true_df, kaggle_1_fake_news_false_df],
        ignore_index=True,
    )
    .rename(columns={"text": "body"})
    .assign(dataset='Kaggle 1 - Fake News')
)

# Reduce to the columns shared with the other datasets
kaggle_1_simple_columns = ["dataset", "title", "body", "binary_label"]
kaggle_1_fake_news_simple_df = kaggle_1_fake_news_combined_df[kaggle_1_simple_columns]

# Preview and row count
print(kaggle_1_fake_news_simple_df.head())
print(len(kaggle_1_fake_news_simple_df))

                dataset                                              title  \
0  Kaggle 1 - Fake News  As U.S. budget fight looms, Republicans flip t...   
1  Kaggle 1 - Fake News  U.S. military to accept transgender recruits o...   
2  Kaggle 1 - Fake News  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  Kaggle 1 - Fake News  FBI Russia probe helped by Australian diplomat...   
4  Kaggle 1 - Fake News  Trump wants Postal Service to charge 'much mor...   

                                                body  binary_label  
0  WASHINGTON (Reuters) - The head of a conservat...             1  
1  WASHINGTON (Reuters) - Transgender people will...             1  
2  WASHINGTON (Reuters) - The special counsel inv...             1  
3  WASHINGTON (Reuters) - Trump campaign adviser ...             1  
4  SEATTLE/WASHINGTON (Reuters) - President Donal...             1  
44898


Kaggle 2 - News Project

Information - https://www.kaggle.com/datasets/antonioskokiantonis/newscsv/data

In [40]:
# Folder containing news.csv
file_path_kaggle_2_news_project = f"{file_path}/Kaggle-2-News-Project"

In [41]:
def load_kaggle_2_news_project_split(filename):
    """Read one comma-separated file from ``file_path_kaggle_2_news_project``.

    The file's header row is discarded (``header=0``) and the columns are named
    "id", "title", "text", "label" by position.
    """
    csv_path = os.path.join(file_path_kaggle_2_news_project, filename)
    # Default comma separator
    return pd.read_csv(csv_path, names=["id", "title", "text", "label"], header=0)

In [42]:
# The whole dataset ships as a single file
kaggle_2_news_project_df = load_kaggle_2_news_project_split(filename="news.csv")

In [43]:
# Map the textual label onto the binary target
def binary_label_kaggle_2_news_project(label):
    """Return 1 when ``label`` is "REAL"; return 0 for every other value."""
    return 1 if label in ("REAL",) else 0

# Derive the binary target from the textual label, element by element
kaggle_2_news_project_df["binary_label"] = (
    kaggle_2_news_project_df["label"].map(binary_label_kaggle_2_news_project)
)


In [44]:
# Tag the source dataset and expose the article text as 'body'
kaggle_2_news_project_df = (
    kaggle_2_news_project_df
    .assign(dataset='Kaggle 2 - News Project')
    .rename(columns={"text": "body"})
)

# Reduce to the columns shared with the other datasets
kaggle_2_simple_columns = ["dataset", "title", "body", "binary_label"]
kaggle_2_news_project_simple_df = kaggle_2_news_project_df[kaggle_2_simple_columns]

# Preview and row count
print(kaggle_2_news_project_simple_df.head())
print(len(kaggle_2_news_project_simple_df))

                   dataset                                              title  \
0  Kaggle 2 - News Project                       You Can Smell Hillary’s Fear   
1  Kaggle 2 - News Project  Watch The Exact Moment Paul Ryan Committed Pol...   
2  Kaggle 2 - News Project        Kerry to go to Paris in gesture of sympathy   
3  Kaggle 2 - News Project  Bernie supporters on Twitter erupt in anger ag...   
4  Kaggle 2 - News Project   The Battle of New York: Why This Primary Matters   

                                                body  binary_label  
0  Daniel Greenfield, a Shillman Journalism Fello...             0  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...             0  
2  U.S. Secretary of State John F. Kerry said Mon...             1  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...             0  
4  It's primary day in New York and front-runners...             1  
6335


Kaggle 3 - Fake News Detection 

Information - https://www.kaggle.com/datasets/jruvika/fake-news-detection/data

In [45]:
# Folder containing data.csv
file_path_kaggle_3_fake_news_detection = f"{file_path}/Kaggle-3-Fake-News-Detection"

In [46]:
def load_kaggle_3_fake_news_detection_split(filename):
    """Read one comma-separated file from ``file_path_kaggle_3_fake_news_detection``.

    The file's header row is discarded (``header=0``) and the columns are named
    "url", "title", "body", "binary_label" by position.
    """
    csv_path = os.path.join(file_path_kaggle_3_fake_news_detection, filename)
    # Default comma separator
    return pd.read_csv(csv_path, names=["url", "title", "body", "binary_label"], header=0)

In [48]:
# The whole dataset ships as a single file
kaggle_3_fake_news_detection_df = load_kaggle_3_fake_news_detection_split(filename='data.csv')

In [51]:
# Tag every row with its source dataset
kaggle_3_fake_news_detection_df = kaggle_3_fake_news_detection_df.assign(
    dataset='Kaggle 3 - Fake News Detection'
)

In [52]:
# Reduce to the columns shared with the other datasets (the 'url' column is not kept)
kaggle_3_simple_columns = ["dataset", "title", "body", "binary_label"]
kaggle_3_fake_news_detection_simple_df = kaggle_3_fake_news_detection_df[kaggle_3_simple_columns]

Combine all datasets

In [291]:
# Frames that make up the combined dataset. liar_simple_df (LIAR v1) is not in this list.
# Columns missing from a frame (e.g. 'title' for LIAR 2, 'label' for the others)
# are filled with NaN by concat.
frames_to_combine = [
    liar_2_simple_df,
    fakeddit_simple_df,
    kaggle_1_fake_news_simple_df,
    kaggle_2_news_project_simple_df,
    kaggle_3_fake_news_detection_simple_df,
]
combined_df = pd.concat(frames_to_combine, ignore_index=True)

In [134]:
def clean_text(text):
    """Normalise a string for modelling; missing values become the empty string.

    Steps, in order: decompose unicode (NFKD) and drop every non-ASCII character,
    lowercase, delete ASCII punctuation (no space is inserted, so "30-minute"
    becomes "30minute"), collapse whitespace runs to one space and trim the ends.
    """
    if pd.isnull(text):
        return ""

    # Accented letters lose their accents; other non-ASCII characters are removed
    ascii_only = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

    punctuation_remover = str.maketrans('', '', string.punctuation)
    without_punctuation = ascii_only.lower().translate(punctuation_remover)

    return re.sub(r'\s+', ' ', without_punctuation).strip()

In [292]:
# Title-level rows: keep rows that have a title of more than one whitespace-separated word
has_title = combined_df['title'].notna()
title_df = combined_df[has_title].reset_index(drop=True)
is_multi_word = title_df['title'].str.split().str.len() > 1
title_df = title_df[is_multi_word].reset_index(drop=True)

# 'text' keeps the raw title and 'text_clean' its normalised form; the 'title' column
# is then reused as a boolean flag marking title rows, and each title is one chunk.
title_df = title_df.assign(
    text=title_df['title'],
    text_clean=title_df['title'].apply(clean_text),
    title=True,
    chunk_id=1,
)

In [293]:
# Fix the column order of the title rows
title_columns = ['dataset', 'text', 'text_clean', 'chunk_id', 'title', 'binary_label', 'label']
title_df = title_df.loc[:, title_columns]

In [294]:
title_df

Unnamed: 0,dataset,text,text_clean,chunk_id,title,binary_label,label
0,Fakeddit,my walgreens offbrand mucinex was engraved wit...,my walgreens offbrand mucinex was engraved wit...,1,True,1,
1,Fakeddit,this concerned sink with a tiny hat,this concerned sink with a tiny hat,1,True,0,
2,Fakeddit,hackers leak emails from uae ambassador to us,hackers leak emails from uae ambassador to us,1,True,1,
3,Fakeddit,this flower in my neighborhood,this flower in my neighborhood,1,True,1,
4,Fakeddit,puppy taking in the view,puppy taking in the view,1,True,1,
...,...,...,...,...,...,...,...
819511,Kaggle 3 - Fake News Detection,Trends to Watch,trends to watch,1,True,0,
819512,Kaggle 3 - Fake News Detection,Trump Jr. Is Soon To Give A 30-Minute Speech F...,trump jr is soon to give a 30minute speech for...,1,True,0,
819513,Kaggle 3 - Fake News Detection,"Ron Paul on Trump, Anarchism & the AltRight",ron paul on trump anarchism the altright,1,True,0,
819514,Kaggle 3 - Fake News Detection,China to accept overseas trial data in bid to ...,china to accept overseas trial data in bid to ...,1,True,1,


In [295]:
def _title_word_counts(texts):
    """Number of whitespace-separated tokens in each string of ``texts``."""
    return texts.str.split().str.len()


# Per-dataset statistics of the cleaned titles: row count, share of one-word titles,
# share longer than 64 / 128 words, mean word count and share of title rows.
title_summary_df = title_df.groupby('dataset').agg(
    num_samples=('text_clean', 'count'),
    proportion_equals_1=('text_clean', lambda x: (_title_word_counts(x) == 1).mean()),
    proportion_over_64=('text_clean', lambda x: (_title_word_counts(x) > 64).mean()),
    proportion_over_128=('text_clean', lambda x: (_title_word_counts(x) > 128).mean()),
    avg_string_length=('text_clean', lambda x: _title_word_counts(x).mean()),
    proportion_title_present=('title', lambda x: x.mean()),
).reset_index()

title_summary_df

Unnamed: 0,dataset,num_samples,proportion_equals_1,proportion_over_64,proportion_over_128,avg_string_length,proportion_title_present
0,Fakeddit,764299,0.0,3.1e-05,4e-06,8.448797,1.0
1,Kaggle 1 - Fake News,44888,0.0,0.0,0.0,12.424612,1.0
2,Kaggle 2 - News Project,6329,0.000158,0.0,0.0,10.383631,1.0
3,Kaggle 3 - Fake News Detection,4000,0.0,0.0005,0.0,9.6845,1.0


In [298]:
# Rows that have body text, re-indexed 0..n-1
body_df = combined_df.dropna(subset=['body']).reset_index(drop=True)

In [483]:
body_df.value_counts('dataset')

dataset
Kaggle 1 - Fake News              44898
LIAR 2                            22962
Kaggle 2 - News Project            6335
Kaggle 3 - Fake News Detection     3988
Name: count, dtype: int64

In [299]:
def split_text_into_chunks(text, chunk_size=128):
    """Split text into consecutive chunks of at most ``chunk_size`` tokens.

    Parameters
    ----------
    text : str or None
        Text to split on whitespace. ``None`` / NaN / ``pd.NA`` yields no chunks.
    chunk_size : int
        Maximum number of tokens per chunk; must be at least 1.

    Returns
    -------
    list of str
        Chunks in document order, tokens re-joined with single spaces. Empty or
        whitespace-only text gives an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would silently produce no chunks at all; fail loudly instead
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return []
    tokens = text.split()  # simple whitespace tokenization
    return [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]

# Create a new DataFrame with one chunk per row
def expand_chunks(df, text_column='body', chunk_size=128):
    """Explode each row of ``df`` into one row per text chunk.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; ``df[text_column]`` holds the text to split.
    text_column : str
        Column whose text is replaced by the individual chunk.
    chunk_size : int
        Maximum number of whitespace-separated tokens per chunk.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` plus ``chunk_id`` (1-based position of the chunk within
        its source row). Each output row keeps the index label of its source row, so
        labels repeat for multi-chunk texts. Rows whose text is missing or contains
        no tokens produce no output rows. An empty result still carries the columns.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    columns = list(df.columns)
    texts = df[text_column]
    records = []
    labels = []
    # Plain tuples and dicts avoid building one pandas Series per output chunk
    for label, values, text in zip(df.index, df.itertuples(index=False, name=None), texts):
        base = dict(zip(columns, values))
        for position, chunk in enumerate(split_text_into_chunks(text, chunk_size), start=1):
            record = dict(base)
            record[text_column] = chunk
            record['chunk_id'] = position
            records.append(record)
            labels.append(label)
    out_columns = columns if 'chunk_id' in columns else columns + ['chunk_id']
    return pd.DataFrame(records, columns=out_columns, index=labels)

In [300]:
# Split every body into chunks of at most 64 whitespace-separated tokens, one row per chunk
chunked_df = expand_chunks(df=body_df, text_column='body', chunk_size=64)

In [301]:
# 'text' is the raw chunk and 'text_clean' its normalised form; the 'title' column
# is reused as a boolean flag, False for body chunks.
chunked_df = chunked_df.assign(
    text=lambda frame: frame['body'],
    text_clean=lambda frame: frame['text'].apply(clean_text),
    title=False,
)

In [302]:
# Same column order as the title rows so the two frames stack cleanly
chunk_columns = ['dataset', 'text', 'text_clean', 'chunk_id', 'title', 'binary_label', 'label']
chunked_df = chunked_df.loc[:, chunk_columns]

In [303]:
chunked_df

Unnamed: 0,dataset,text,text_clean,chunk_id,title,binary_label,label
0,LIAR 2,"90 percent of Americans ""support universal bac...",90 percent of americans support universal back...,1,False,1,5.0
1,LIAR 2,Last year was one of the deadliest years ever ...,last year was one of the deadliest years ever ...,1,False,0,1.0
2,LIAR 2,"Bernie Sanders's plan is ""to raise your taxes ...",bernie sanderss plan is to raise your taxes to...,1,False,0,0.0
3,LIAR 2,Voter ID is supported by an overwhelming major...,voter id is supported by an overwhelming major...,1,False,1,4.0
4,LIAR 2,"Says Barack Obama ""robbed Medicare (of) $716 b...",says barack obama robbed medicare of 716 billi...,1,False,0,2.0
...,...,...,...,...,...,...,...
78182,Kaggle 3 - Fake News Detection,the game with the second lady. The game is spe...,the game with the second lady the game is spec...,6,False,0,
78182,Kaggle 3 - Fake News Detection,their racial grievances and hatred of Presiden...,their racial grievances and hatred of presiden...,7,False,0,
78182,Kaggle 3 - Fake News Detection,expressed the following via Twitter: 52m Vice ...,expressed the following via twitter 52m vice p...,8,False,0,
78182,Kaggle 3 - Fake News Detection,"our National Anthem. 11:08 AM – Oct 8, 2017 Pe...",our national anthem 1108 am oct 8 2017 pence w...,9,False,0,


In [311]:
# Title rows first, then body chunks, re-indexed 0..n-1
reshape_parts = [title_df, chunked_df]
combined_reshape_df = pd.concat(reshape_parts, ignore_index=True)

In [312]:
len(combined_reshape_df)

1261393

In [313]:
# Drop rows that are identical in every column, keeping the first occurrence
combined_reshape_df = combined_reshape_df.drop_duplicates(keep='first')

In [314]:
len(combined_reshape_df)

1145518

In [315]:
# Build a row identifier "<dataset>_<n>", where n counts rows within each dataset from 0
combined_reshape_df = combined_reshape_df.assign(
    row_number=lambda frame: frame.groupby('dataset').cumcount(),
    id=lambda frame: frame['dataset'] + '_' + frame['row_number'].astype(str),
)

In [317]:
# Final column order; the helper column 'row_number' is not kept
final_columns = ['id', 'dataset', 'text', 'text_clean', 'chunk_id', 'title', 'binary_label', 'label']
combined_reshape_df = combined_reshape_df.loc[:, final_columns]

In [318]:
combined_reshape_df

Unnamed: 0,id,dataset,text,text_clean,chunk_id,title,binary_label,label
0,Fakeddit_0,Fakeddit,my walgreens offbrand mucinex was engraved wit...,my walgreens offbrand mucinex was engraved wit...,1,True,1,
1,Fakeddit_1,Fakeddit,this concerned sink with a tiny hat,this concerned sink with a tiny hat,1,True,0,
2,Fakeddit_2,Fakeddit,hackers leak emails from uae ambassador to us,hackers leak emails from uae ambassador to us,1,True,1,
3,Fakeddit_3,Fakeddit,this flower in my neighborhood,this flower in my neighborhood,1,True,1,
4,Fakeddit_4,Fakeddit,puppy taking in the view,puppy taking in the view,1,True,1,
...,...,...,...,...,...,...,...,...
1261388,Kaggle 3 - Fake News Detection_28758,Kaggle 3 - Fake News Detection,the game with the second lady. The game is spe...,the game with the second lady the game is spec...,6,False,0,
1261389,Kaggle 3 - Fake News Detection_28759,Kaggle 3 - Fake News Detection,their racial grievances and hatred of Presiden...,their racial grievances and hatred of presiden...,7,False,0,
1261390,Kaggle 3 - Fake News Detection_28760,Kaggle 3 - Fake News Detection,expressed the following via Twitter: 52m Vice ...,expressed the following via twitter 52m vice p...,8,False,0,
1261391,Kaggle 3 - Fake News Detection_28761,Kaggle 3 - Fake News Detection,"our National Anthem. 11:08 AM – Oct 8, 2017 Pe...",our national anthem 1108 am oct 8 2017 pence w...,9,False,0,


In [319]:
def _row_word_counts(texts):
    """Number of whitespace-separated tokens in each string of ``texts``."""
    return texts.str.split().str.len()


# Per-dataset statistics of the reshaped data: row count, share of rows with
# binary_label == 1, share longer than 64 / 128 words, mean word count, share of title rows.
summary_df = combined_reshape_df.groupby('dataset').agg(
    num_samples=('text_clean', 'count'),
    proportion_true=('binary_label', lambda x: (x == 1).mean()),
    proportion_over_64=('text_clean', lambda x: (_row_word_counts(x) > 64).mean()),
    proportion_over_128=('text_clean', lambda x: (_row_word_counts(x) > 128).mean()),
    avg_string_length=('text_clean', lambda x: _row_word_counts(x).mean()),
    proportion_title=('title', lambda x: x.mean()),
).reset_index()

summary_df

Unnamed: 0,dataset,num_samples,proportion_true,proportion_over_64,proportion_over_128,avg_string_length,proportion_title
0,Fakeddit,707590,0.538449,3.4e-05,4e-06,8.721854,1.0
1,Kaggle 1 - Fake News,300913,0.527079,3e-06,0.0,53.060778,0.128685
2,Kaggle 2 - News Project,85316,0.558336,0.000141,0.0,57.413897,0.073257
3,Kaggle 3 - Fake News Detection,28763,0.621528,0.000104,0.0,55.305636,0.098251
4,LIAR 2,22936,0.423396,0.0,0.0,17.608825,0.0


In [320]:
# Number of rows drawn from each dataset; every dataset must have at least this many rows
sample_size = 20000

# Draw the same number of rows from every dataset. Each group is sampled with the same
# seed, as before, and groups are concatenated in sorted dataset order.
# Iterating over the groups instead of using groupby.apply keeps the 'dataset' column
# in the result and avoids the pandas deprecation warning about apply operating on
# the grouping columns.
sampled_reshape_df = pd.concat(
    [
        group.sample(n=sample_size, random_state=42)
        for _, group in combined_reshape_df.groupby('dataset')
    ],
    ignore_index=True,
)

  sampled_reshape_df = combined_reshape_df.groupby('dataset').apply(lambda x: x.sample(n=sample_size, random_state=42)).reset_index(drop=True)


In [321]:
# Fraction of the sampled rows held out as the test set (used as test_size in the split below)
test_fraction = 0.2

In [322]:
# Hold out test rows from the balanced sample, stratified on the (dataset, binary_label) pair
stratify_keys = sampled_reshape_df[['dataset', 'binary_label']]
sample_train_df, test_df = train_test_split(
    sampled_reshape_df,
    test_size=test_fraction,
    random_state=42,
    stratify=stratify_keys,
)

In [323]:
test_df['dataset'].value_counts()

dataset
LIAR 2                            4000
Kaggle 2 - News Project           4000
Kaggle 3 - Fake News Detection    4000
Fakeddit                          4000
Kaggle 1 - Fake News              4000
Name: count, dtype: int64

In [324]:
test_df['binary_label'].value_counts()

binary_label
1    10667
0     9333
Name: count, dtype: int64

In [325]:
# Full training set: every row of the combined data whose id is not in the test set.
# NOTE(review): ids identify individual rows (a title or a single chunk), so other
# chunks or the title of the same source article as a test row can remain in this
# training set. Confirm whether that overlap is acceptable.
in_test_set = combined_reshape_df['id'].isin(test_df['id'])
combined_train_df = combined_reshape_df[~in_test_set]


In [326]:
len(combined_reshape_df) - len(combined_train_df)

20000

In [327]:
combined_train_df

Unnamed: 0,id,dataset,text,text_clean,chunk_id,title,binary_label,label
0,Fakeddit_0,Fakeddit,my walgreens offbrand mucinex was engraved wit...,my walgreens offbrand mucinex was engraved wit...,1,True,1,
1,Fakeddit_1,Fakeddit,this concerned sink with a tiny hat,this concerned sink with a tiny hat,1,True,0,
2,Fakeddit_2,Fakeddit,hackers leak emails from uae ambassador to us,hackers leak emails from uae ambassador to us,1,True,1,
3,Fakeddit_3,Fakeddit,this flower in my neighborhood,this flower in my neighborhood,1,True,1,
4,Fakeddit_4,Fakeddit,puppy taking in the view,puppy taking in the view,1,True,1,
...,...,...,...,...,...,...,...,...
1261387,Kaggle 3 - Fake News Detection_28757,Kaggle 3 - Fake News Detection,around our Flag and everything that unites us....,around our flag and everything that unites us ...,5,False,0,
1261388,Kaggle 3 - Fake News Detection_28758,Kaggle 3 - Fake News Detection,the game with the second lady. The game is spe...,the game with the second lady the game is spec...,6,False,0,
1261389,Kaggle 3 - Fake News Detection_28759,Kaggle 3 - Fake News Detection,their racial grievances and hatred of Presiden...,their racial grievances and hatred of presiden...,7,False,0,
1261390,Kaggle 3 - Fake News Detection_28760,Kaggle 3 - Fake News Detection,expressed the following via Twitter: 52m Vice ...,expressed the following via twitter 52m vice p...,8,False,0,


In [328]:
# Make sure the output folder exists; to_csv does not create missing directories
os.makedirs(file_path + "/Combined", exist_ok=True)

# Write the full training set, the balanced training sample and the test set
combined_train_df.to_csv(file_path + "/Combined/combined_train_df_2.csv", index=False)
sample_train_df.to_csv(file_path + "/Combined/sample_train_df_2.csv", index=False)
test_df.to_csv(file_path + "/Combined/test_df_2.csv", index=False)

LLM Generated Datasets

LLM Generated News Dataset

Information: https://github.com/navid-aub/News-Dataset/tree/main

In [329]:
# Folder containing one CSV per generating model
file_path_llm_news_dataset = f"{file_path}/LLM Generated News Dataset"

In [330]:
def load_llm_news_dataset_split(filename):
    """Read one CSV file from the LLM news dataset folder.

    Parameters
    ----------
    filename : str
        File name joined onto ``file_path_llm_news_dataset``.

    Returns
    -------
    pandas.DataFrame
        The parsed file; the first row supplies the column names and the
        default comma separator is used.
    """
    csv_path = os.path.join(file_path_llm_news_dataset, filename)
    return pd.read_csv(csv_path, header=0)

In [337]:
# Load the four per-model CSV files, one frame per generating model.
(
    llm_news_dataset_chatgpt_df,
    llm_news_dataset_llama2_7b_df,
    llm_news_dataset_llama2_13b_df,
    llm_news_dataset_mistral_7b_df,
) = (
    load_llm_news_dataset_split(csv_name)
    for csv_name in ('chatgpt.csv', 'llama2_7b.csv', 'llama2_13b.csv', 'mistral_7b.csv')
)

In [338]:
# Record, on every row, which model's file the row was read from.
for model_frame, model_name in (
    (llm_news_dataset_chatgpt_df, 'GPT3.5'),
    (llm_news_dataset_llama2_7b_df, 'Llama2 7b'),
    (llm_news_dataset_llama2_13b_df, 'Llama2 13b'),
    (llm_news_dataset_mistral_7b_df, 'Mistral 7b'),
):
    model_frame['model'] = model_name

In [339]:
# Stack the four per-model frames into a single table with a fresh 0..n-1 index.
llm_news_frames = [
    llm_news_dataset_chatgpt_df,
    llm_news_dataset_llama2_7b_df,
    llm_news_dataset_llama2_13b_df,
    llm_news_dataset_mistral_7b_df,
]
llm_news_df = pd.concat(llm_news_frames, ignore_index=True)

In [343]:
llm_news_df

Unnamed: 0,agency,title,main,words,topic,rephrase,expanded,summary,summary_expanded,model
0,cbsnews.com,"Trump holds ""celebration of American flag"" in ...",Instead of holding a Super Bowl champion cerem...,368,sports,President Trump changed plans for the Super Bo...,"event, presumably coming from disappointed Eag...",President Trump rescinded the Philadelphia Eag...,"**""President Trump Hosts 'Celebration of Ameri...",GPT3.5
1,apnews.com,Pope observes usual Ash Wednesday customs in t...,VATICAN CITY (AP)  Pope Francis celebrated th...,688,"['history', 'religion']",VATICAN CITY (AP) – Pope Francis observed the ...,crisis that has affected many parts of the wor...,Headline: Pope Francis Leads Traditional Ash W...,**Pope Francis Leads Traditional Ash Wednesday...,GPT3.5
2,theguardian.com,Jim Molan likely to face challenge by moderate...,Conservative Jim Molans expected return to th...,380,"['politics', 'government']",Conservative Senator Jim Molan is anticipated ...,"Sorry, but I can't continue with the specific ...",The anticipated return of Conservative Jim Mol...,**Title: Battle Brews as Moderates Challenge C...,GPT3.5
3,theguardian.com,Scott Morrison sworn in as Australia's 30th pr...,"23 Aug 2018 17.52 EDT OK, this is significant....",1000,"['politics', 'government']","News Article Rephrased:\n\n**August 23, 2018 |...","Certainly, here's an extended and completed ve...","Date: August 23, 2018\n\nHeadlines:\n1. Libera...",**Ongoing Turmoil in Liberal Party: Leadership...,GPT3.5
4,reuters.com,"French clergy sexually abused over 200,000 chi...","Summary Investigation finds estimated 216,000 ...",857,"['history', 'religion']",Investigation Reveals Shocking Scale of Child ...,to investigate allegations of abuse within the...,A recent investigation into the French Catholi...,**Title: French Catholic Church Report Reveals...,GPT3.5
...,...,...,...,...,...,...,...,...,...,...
11995,aljazeera.com,Australia prison abuse could violate UN tortur...,The use of restraints and tear gas on juvenile...,423,"['social culture', 'civil rights']",Footage of Australian aboriginal children in ...,"""a very serious situation"" and that the use o...","A UN official, Juan Mendez, has expressed conc...",Title: UN Official Expressed Concerns over All...,Mistral 7b
11996,cbsnews.com,"Adylkuzz hack, called larger than WannaCry, sl...",Many computers and servers around the world wh...,756,"['science', 'information technology']","A security firm, Proofpoint, has revealed that...",other illicit goods. The surge in demand for ...,"A security company, Proofpoint, revealed that ...",Title: WannaCry Ransomware Attack Surprisingly...,Mistral 7b
11997,cbsnews.com,Brad Pitt and Angelina Jolie to wed this weekend?,"Brad Pitt, left, and Angelina Jolie arrive at ...",362,celebrity,(CBS News) speculation swirls around Brad Pit...,that wedding preparations have been in full s...,Reports have surfaced that Brad Pitt and Angel...,Title: Brad Pitt and Angelina Jolie's Upcoming...,Mistral 7b
11998,theguardian.com,Rag'n'Bone Man wins 2017 Brits critics' choice...,Sussex hip-hop bluesman RagnBone Man has bee...,375,celebrity,"Rag'n'Bone Man, the Sussex-based hip-hop blues...",has spent years honing his craft as a perform...,"Rory Graham, known as Rag'n'Bone Man, was name...",Title: Rag'n'Bone Man Wins Brits Critics Choic...,Mistral 7b


In [367]:
# Reshape to long format: one row per (source row, text field).
# 'model' is the only identifier carried along; columns that are neither id_vars nor
# value_vars (agency, words, topic) are dropped by melt.
# NOTE(review): 'main' and 'title' are melted once per model file. If the four files cover
# the same source articles, those texts are repeated here — confirm whether they should be
# de-duplicated before splitting into train/test.
llm_news_long_df = llm_news_df.melt(
    id_vars=['model'],  # generating model, kept so the LLM variants can be told apart later
    value_vars=['main', 'title', 'rephrase', 'expanded', 'summary', 'summary_expanded'],
    var_name='type', 
    value_name='text'
)

In [368]:
# Keep only rows whose text is a real string. NaN (a float) and None both fail the
# isinstance check, so a separate notna() pass is not needed.
# .copy() makes the result an independent frame: the following cells add columns to it,
# which on a boolean-mask slice would raise SettingWithCopyWarning.
llm_news_long_df = llm_news_long_df[
    llm_news_long_df['text'].apply(lambda x: isinstance(x, str))
].copy()

In [369]:
llm_news_long_df['type'].value_counts()

type
main                12000
title               12000
summary             11996
summary_expanded    11954
rephrase            11924
expanded            11838
Name: count, dtype: int64

In [376]:
# Rows that come from the article's own 'title' / 'main' fields (as opposed to the
# rephrase / expanded / summary variants).
mask = llm_news_long_df['type'].isin(['title', 'main'])

# binary_label: 1 for 'title' / 'main' rows, 0 for every other type
llm_news_long_df['binary_label'] = mask.astype('int64')

# title: True only for headline rows
llm_news_long_df['title'] = llm_news_long_df['type'].eq('title')

# Only set model to NaN where type is 'title' or 'main'
llm_news_long_df.loc[mask, 'model'] = np.nan

llm_news_long_df['dataset'] = 'LLM News Dataset'

In [377]:
llm_news_long_df

Unnamed: 0,model,type,text,binary_label,title,dataset
0,,main,Instead of holding a Super Bowl champion cerem...,1,False,LLM News Dataset
1,,main,VATICAN CITY (AP)  Pope Francis celebrated th...,1,False,LLM News Dataset
2,,main,Conservative Jim Molans expected return to th...,1,False,LLM News Dataset
3,,main,"23 Aug 2018 17.52 EDT OK, this is significant....",1,False,LLM News Dataset
4,,main,"Summary Investigation finds estimated 216,000 ...",1,False,LLM News Dataset
...,...,...,...,...,...,...
71995,Mistral 7b,summary_expanded,Title: UN Official Expressed Concerns over All...,0,False,LLM News Dataset
71996,Mistral 7b,summary_expanded,Title: WannaCry Ransomware Attack Surprisingly...,0,False,LLM News Dataset
71997,Mistral 7b,summary_expanded,Title: Brad Pitt and Angelina Jolie's Upcoming...,0,False,LLM News Dataset
71998,Mistral 7b,summary_expanded,Title: Rag'n'Bone Man Wins Brits Critics Choic...,0,False,LLM News Dataset


In [385]:
# Break each text into chunks with expand_chunks (defined elsewhere in this notebook).
# NOTE(review): chunk_size=64 is presumably a per-chunk word limit — the summary cell further
# down reports almost no rows above 64 words; confirm against the helper's definition.
llm_news_long_chunked_df = expand_chunks(llm_news_long_df, text_column='text', chunk_size=64)

In [386]:
# Store a cleaned version of every chunk next to the raw text.
# NOTE(review): judging by the displayed rows, clean_text lower-cases and strips punctuation —
# confirm against its definition elsewhere in the notebook.
llm_news_long_chunked_df['text_clean'] = llm_news_long_chunked_df['text'].apply(clean_text)

In [387]:
# Running position of each chunk within its dataset, used to build a unique "<dataset>_<n>" id.
llm_chunk_positions = llm_news_long_chunked_df.groupby('dataset').cumcount()
llm_news_long_chunked_df['row_number'] = llm_chunk_positions
llm_news_long_chunked_df['id'] = (
    llm_news_long_chunked_df['dataset'] + '_' + llm_chunk_positions.astype(str)
)

In [388]:
# Final column layout; helper columns such as row_number are dropped here.
llm_output_columns = [
    'id', 'dataset', 'model', 'type', 'text',
    'text_clean', 'chunk_id', 'title', 'binary_label',
]
llm_news_long_chunked_df = llm_news_long_chunked_df[llm_output_columns]

In [389]:
llm_news_long_chunked_df

Unnamed: 0,id,dataset,model,type,text,text_clean,chunk_id,title,binary_label
0,LLM News Dataset_0,LLM News Dataset,,main,Instead of holding a Super Bowl champion cerem...,instead of holding a super bowl champion cerem...,1,False,1
0,LLM News Dataset_1,LLM News Dataset,,main,at the event. He disinvited the Eagles from th...,at the event he disinvited the eagles from the...,2,False,1
0,LLM News Dataset_2,LLM News Dataset,,main,was unclear if those jeers were directed at Mr...,was unclear if those jeers were directed at mr...,3,False,1
0,LLM News Dataset_3,LLM News Dataset,,main,"love our home, and our country has never done ...",love our home and our country has never done b...,4,False,1
0,LLM News Dataset_4,LLM News Dataset,,main,tells CBS News that fewer than 10 players comm...,tells cbs news that fewer than 10 players comm...,5,False,1
...,...,...,...,...,...,...,...,...,...
71999,LLM News Dataset_477804,LLM News Dataset,Mistral 7b,summary_expanded,pixie cut and began sporting edgier clothes an...,pixie cut and began sporting edgier clothes an...,3,False,0
71999,LLM News Dataset_477805,LLM News Dataset,Mistral 7b,summary_expanded,"been compared to rapper styles, has left some ...",been compared to rapper styles has left some f...,4,False,0
71999,LLM News Dataset_477806,LLM News Dataset,Mistral 7b,summary_expanded,"have been circulating the internet, have spark...",have been circulating the internet have sparke...,5,False,0
71999,LLM News Dataset_477807,LLM News Dataset,Mistral 7b,summary_expanded,way of her creativity and self-expression. Cyr...,way of her creativity and selfexpression cyrus...,6,False,0


In [390]:
llm_news_long_chunked_df['model'].value_counts()

model
GPT3.5        89109
Mistral 7b    84859
Llama2 7b     81687
Llama2 13b    81078
Name: count, dtype: int64

In [391]:
llm_news_long_chunked_df['type'].value_counts()

type
main                129076
expanded            108312
summary_expanded    101938
rephrase             80531
summary              45952
title                12000
Name: count, dtype: int64

In [485]:
# One summary row per dataset for the chunked LLM news data.
# Word counts are whitespace-token counts of the cleaned text.
llm_news_summary_df = (
    llm_news_long_chunked_df
    .groupby('dataset')
    .agg(
        num_samples=('text_clean', 'count'),
        # share of rows with binary_label == 1
        proportion_true=('binary_label', lambda labels: labels.eq(1).mean()),
        proportion_over_64=('text_clean', lambda texts: texts.str.split().str.len().gt(64).mean()),
        proportion_over_128=('text_clean', lambda texts: texts.str.split().str.len().gt(128).mean()),
        # despite the name, this is the mean number of words per row
        avg_string_length=('text_clean', lambda texts: texts.str.split().str.len().mean()),
        proportion_title=('title', lambda flags: flags.mean()),
    )
    .reset_index()
)

llm_news_summary_df

Unnamed: 0,dataset,num_samples,proportion_true,proportion_over_64,proportion_over_128,avg_string_length,proportion_title
0,LLM News Dataset,477809,0.295256,4e-06,0.0,58.551557,0.025115


In [393]:
# Hold out 20% of the chunk rows, stratified on (dataset, type) so every text type keeps
# its share in both splits; the fixed random_state makes the split repeatable.
# NOTE(review): the split is per chunk row, so chunks of one source text can end up on both
# sides — confirm this is acceptable or split on the source row instead.
test_fraction = 0.2
llm_strata = llm_news_long_chunked_df[['dataset', 'type']]
llm_train_df, llm_test_df = train_test_split(
    llm_news_long_chunked_df,
    test_size=test_fraction,
    random_state=42,
    stratify=llm_strata,
)

In [394]:
llm_test_df['model'].value_counts()

model
GPT3.5        17752
Mistral 7b    17062
Llama2 7b     16363
Llama2 13b    16170
Name: count, dtype: int64

In [395]:
llm_test_df['type'].value_counts()

type
main                25815
expanded            21663
summary_expanded    20388
rephrase            16106
summary              9190
title                2400
Name: count, dtype: int64

In [396]:
llm_test_df.groupby(['model', 'type']).size().reset_index(name='count')

Unnamed: 0,model,type,count
0,GPT3.5,expanded,5026
1,GPT3.5,rephrase,4896
2,GPT3.5,summary,2762
3,GPT3.5,summary_expanded,5068
4,Llama2 13b,expanded,5793
5,Llama2 13b,rephrase,3512
6,Llama2 13b,summary,1873
7,Llama2 13b,summary_expanded,4992
8,Llama2 7b,expanded,5555
9,Llama2 7b,rephrase,3392


In [397]:
# Write the LLM news train/test splits to the "Combined" folder as CSV, without the row index.
llm_output_dir = file_path + "/Combined"
for llm_frame, llm_csv_name in (
    (llm_train_df, "llm_train_df.csv"),
    (llm_test_df, "llm_test_df.csv"),
):
    llm_frame.to_csv(llm_output_dir + "/" + llm_csv_name, index=False)

MegaFake LLM Generated Dataset 

Information: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=5095309

In [399]:
# Folder containing the MegaFake JSON files.
file_path_megafake = f"{file_path}/MegaFake"

In [400]:
def load_megafake_dataset_split(filename):
    """Load one MegaFake JSON file into a DataFrame.

    The file is parsed as a JSON object whose values are per-example records:
    each top-level key becomes a row, and the key itself is exposed in an
    ``example_id`` column.

    Parameters
    ----------
    filename : str
        File name joined onto ``file_path_megafake``.

    Returns
    -------
    pandas.DataFrame
        One row per top-level key, with ``example_id`` as the first column.
    """
    json_path = os.path.join(file_path_megafake, filename)
    with open(json_path, 'r', encoding='utf-8') as handle:
        records_by_key = json.load(handle)

    frame = pd.DataFrame.from_dict(records_by_key, orient='index')
    # reset_index() moves the keys into a column named 'index'; give it a descriptive name.
    return frame.reset_index().rename(columns={'index': 'example_id'})

In [430]:
# Load the six MegaFake JSON files used in this notebook, one frame per file.
(
    megafake_style_based_fake,
    megafake_content_based_fake,
    megafake_integration_based_fake,
    megafake_story_based_fake,
    megafake_style_based_legitimate,
    megafake_integration_based_legitimate,
) = (
    load_megafake_dataset_split(json_name)
    for json_name in (
        'megafake-1_style_based_fake.json',
        'megafake-2_content_based_fake.json',
        'megafake-3_integration_based_fake_tn200.json',
        'megafake-4_story_based_fake.json',
        'megafake-5_style_based_legitimate.json',
        'megafake-7_integration_based_legitimate_tn300.json',
    )
)

In [459]:
megafake_integration_based_legitimate

Unnamed: 0,example_id,topic_id,topic_words,doc_1_id,doc_1_label,doc_1_text,doc_2_id,doc_2_label,doc_2_text,generated_label,generated_text_t01
0,megafake-916405_megafake-905417,0,"[work, movie, Awards, like, say, act, mother, ...",megafake-916405,legitimate,Tonya Harding says the Olympics have changed s...,megafake-905417,legitimate,Story highlights Margulies talks about life af...,legitimate,"Tonya Harding, the first American woman to suc..."
1,megafake-908189_megafake-905948,0,"[work, movie, Awards, like, say, act, mother, ...",megafake-908189,legitimate,Will Perry Wright live on? Alexander Skarsgar...,megafake-905948,legitimate,Diane Kruger talked about her career-defining ...,legitimate,In the first season of the hit HBO drama Big L...
2,megafake-902348_megafake-905156,0,"[work, movie, Awards, like, say, act, mother, ...",megafake-902348,legitimate,"Jane Fonda celebrated a big milestone that, th...",megafake-905156,legitimate,Tonya Harding threatened to walk out of an int...,legitimate,"In a recent interview with People, Academy Awa..."
3,megafake-910383_megafake-892738,0,"[work, movie, Awards, like, say, act, mother, ...",megafake-910383,legitimate,Alexander Skarsgård’s Big Little Lies characte...,megafake-892738,legitimate,If you spent a few years wondering when Michel...,legitimate,"Alexander Skarsgård, who played the menacing P..."
4,megafake-934497_megafake-907416,0,"[work, movie, Awards, like, say, act, mother, ...",megafake-934497,legitimate,Jane Fonda is busier than she’s ever been at 8...,megafake-907416,legitimate,Jane Fonda and Lily Tomlin appear on Bravo’s W...,legitimate,"In a recent interview with Ellen DeGeneres, Ja..."
...,...,...,...,...,...,...,...,...,...,...,...
5921,megafake-881465_megafake-908331,297,"[Reply, Thread, Link, , Parent, Richie, lik...",megafake-881465,legitimate,American model and media personality (born 199...,megafake-908331,legitimate,"OMG 20 YEARS? Fuck me. :[ I love this movie, e...",legitimate,Sofia Richie is an American social media perso...
5922,megafake-914648_megafake-845722,297,"[Reply, Thread, Link, , Parent, Richie, lik...",megafake-914648,legitimate,American model and media personality (born 199...,megafake-845722,legitimate,CBS’ Two-part true-crime series “The Case Of: ...,legitimate,Sofia Richie is an American social media perso...
5923,megafake-869136_megafake-861559,297,"[Reply, Thread, Link, , Parent, Richie, lik...",megafake-869136,legitimate,A photo of Nicolas Cage is sweeping the intern...,megafake-861559,legitimate,Here are 4 times humans actually competed agai...,legitimate,"Nicholas Cage, the Oscar-winning actor, has ca..."
5924,megafake-868810_megafake-929852,297,"[Reply, Thread, Link, , Parent, Richie, lik...",megafake-868810,legitimate,Here are 4 times humans actually competed agai...,megafake-929852,legitimate,Fleetwood Mac Drags Lindsey Buckingham After O...,legitimate,"In a highly anticipated event, Discovery Chann..."


In [468]:
def megafake_raw_data_reshape(df, generation_technique, fake, text_column = "generated_text"):
    """Reduce a raw MegaFake frame to the shared five-column layout.

    Only ``text_column`` is taken from ``df``; every other raw column is ignored,
    so a pre-existing ``text`` column in the raw data can neither be picked up by
    mistake nor produce a duplicated ``text`` column in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame, e.g. as returned by ``load_megafake_dataset_split``. Not modified.
    generation_technique : str
        Value stored in the ``generation_technique`` column of every row.
    fake : bool
        Truthy gives ``binary_label`` 0, falsy gives ``binary_label`` 1.
    text_column : str, default "generated_text"
        Column of ``df`` holding the text to keep; it is exposed as ``text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``dataset``, ``generation_technique``, ``text``, ``title``,
        ``binary_label`` (in that order), with the same index as ``df``.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    """
    # Fail loudly: a silent no-op rename would otherwise hide a wrong column name.
    if text_column not in df.columns:
        raise KeyError(f"text column {text_column!r} not found in MegaFake frame")

    output_df = (
        df[[text_column]]
        .rename(columns={text_column: 'text'})
        .assign(
            dataset='MegaFake',
            generation_technique=generation_technique,
            title=False,  # all rows are flagged as non-title
            binary_label=0 if fake else 1,
        )
    )

    return output_df[['dataset', 'generation_technique', 'text', 'title', 'binary_label']]

In [469]:
# Bring each raw MegaFake frame into the shared layout. The generated-text column is
# named differently from file to file, so it is passed explicitly every time.
megafake_style_based_fake_reshape = megafake_raw_data_reshape(
    megafake_style_based_fake,
    generation_technique='style based', fake=True, text_column="generated_text",
)
megafake_content_based_fake_reshape = megafake_raw_data_reshape(
    megafake_content_based_fake,
    generation_technique='content based', fake=True, text_column="generated_text_glm4",
)
megafake_integration_based_fake_reshape = megafake_raw_data_reshape(
    megafake_integration_based_fake,
    generation_technique='integration based', fake=True, text_column="generated_text",
)
megafake_story_based_fake_reshape = megafake_raw_data_reshape(
    megafake_story_based_fake,
    generation_technique='story based', fake=True, text_column="generated_text",
)
megafake_style_based_legitimate_reshape = megafake_raw_data_reshape(
    megafake_style_based_legitimate,
    generation_technique='style based', fake=False, text_column="generated_text_t015",
)
megafake_integration_based_legitimate_reshape = megafake_raw_data_reshape(
    megafake_integration_based_legitimate,
    generation_technique='integration based', fake=False, text_column="generated_text_t01",
)

In [484]:
# Total number of rows across the four frames reshaped with fake=True.
sum(
    len(fake_frame)
    for fake_frame in (
        megafake_style_based_fake_reshape,
        megafake_content_based_fake_reshape,
        megafake_integration_based_fake_reshape,
        megafake_story_based_fake_reshape,
    )
)

45788

In [470]:
megafake_integration_based_legitimate_reshape

Unnamed: 0,dataset,generation_technique,text,title,binary_label
0,MegaFake,integration based,"Tonya Harding, the first American woman to suc...",False,1
1,MegaFake,integration based,In the first season of the hit HBO drama Big L...,False,1
2,MegaFake,integration based,"In a recent interview with People, Academy Awa...",False,1
3,MegaFake,integration based,"Alexander Skarsgård, who played the menacing P...",False,1
4,MegaFake,integration based,"In a recent interview with Ellen DeGeneres, Ja...",False,1
...,...,...,...,...,...
5921,MegaFake,integration based,Sofia Richie is an American social media perso...,False,1
5922,MegaFake,integration based,Sofia Richie is an American social media perso...,False,1
5923,MegaFake,integration based,"Nicholas Cage, the Oscar-winning actor, has ca...",False,1
5924,MegaFake,integration based,"In a highly anticipated event, Discovery Chann...",False,1


In [487]:
# Stack the reshaped MegaFake frames into one table with a fresh 0..n-1 index.
# The two "legitimate" frames are commented out, so as written megafake_df holds only
# binary_label == 0 rows (the summary cell below reports proportion_true = 0.0).
# NOTE(review): the technique/label counts, the train/test split and the CSV export further
# down carry earlier execution counts and show binary_label == 1 rows, i.e. they were
# produced with the legitimate frames included — decide which variant is intended and
# re-run those cells so the saved files match the code.
megafake_df = pd.concat(
    [
        megafake_style_based_fake_reshape,
        megafake_content_based_fake_reshape, 
        megafake_integration_based_fake_reshape, 
        megafake_story_based_fake_reshape, 
        #megafake_style_based_legitimate_reshape,
        #megafake_integration_based_legitimate_reshape
    ], 
    ignore_index=True)

In [488]:
# Drop rows whose text is missing or is not a string.
megafake_text_is_str = megafake_df['text'].apply(lambda value: isinstance(value, str))
megafake_df = megafake_df[megafake_df['text'].notna() & megafake_text_is_str]

# Chunk the texts, then keep a cleaned copy of every chunk next to the raw one.
megafake_df = expand_chunks(megafake_df, text_column='text', chunk_size=64)
megafake_df['text_clean'] = megafake_df['text'].apply(clean_text)

# Running position within the dataset, used to build a unique "<dataset>_<n>" id.
megafake_chunk_positions = megafake_df.groupby('dataset').cumcount()
megafake_df['row_number'] = megafake_chunk_positions
megafake_df['id'] = megafake_df['dataset'] + '_' + megafake_chunk_positions.astype(str)

# Final column layout; helper columns such as row_number are dropped here.
megafake_output_columns = [
    'id', 'dataset', 'generation_technique', 'text',
    'text_clean', 'chunk_id', 'title', 'binary_label',
]
megafake_df = megafake_df[megafake_output_columns]

In [489]:
megafake_df

Unnamed: 0,id,dataset,generation_technique,text,text_clean,chunk_id,title,binary_label
0,MegaFake_0,MegaFake,style based,"According to recent reports, Miley Cyrus and L...",according to recent reports miley cyrus and li...,1,False,0
0,MegaFake_1,MegaFake,style based,"the property. The source of the report, who re...",the property the source of the report who requ...,2,False,0
0,MegaFake_2,MegaFake,style based,"at the ceremony, along with siblings Noah, Tra...",at the ceremony along with siblings noah trace...,3,False,0
0,MegaFake_3,MegaFake,style based,to have a baby.,to have a baby,4,False,0
1,MegaFake_4,MegaFake,style based,Paris Jackson and Cara Delevingne were seen to...,paris jackson and cara delevingne were seen to...,1,False,0
...,...,...,...,...,...,...,...,...
45786,MegaFake_203525,MegaFake,story based,help and that she should be surrounded by peop...,help and that she should be surrounded by peop...,6,False,0
45787,MegaFake_203526,MegaFake,story based,"Kylie Jenner, the 24-year-old reality TV star ...",kylie jenner the 24yearold reality tv star and...,1,False,0
45787,MegaFake_203527,MegaFake,story based,butterflies has been a source of concern for h...,butterflies has been a source of concern for h...,2,False,0
45787,MegaFake_203528,MegaFake,story based,and confidence. She has become one of the most...,and confidence she has become one of the most ...,3,False,0


In [474]:
megafake_df.value_counts('generation_technique')

generation_technique
style based          111414
content based         66979
story based           55173
integration based     36527
Name: count, dtype: int64

In [475]:
megafake_df.groupby(['generation_technique', 'binary_label']).size().reset_index(name='count')

Unnamed: 0,generation_technique,binary_label,count
0,content based,0,66979
1,integration based,0,12550
2,integration based,1,23977
3,story based,0,55173
4,style based,0,68828
5,style based,1,42586


In [490]:
# One summary row per dataset for the chunked MegaFake data.
# Word counts are whitespace-token counts of the cleaned text.
megafake_summary_df = (
    megafake_df
    .groupby('dataset')
    .agg(
        num_samples=('text_clean', 'count'),
        # share of rows with binary_label == 1
        proportion_true=('binary_label', lambda labels: labels.eq(1).mean()),
        proportion_over_64=('text_clean', lambda texts: texts.str.split().str.len().gt(64).mean()),
        proportion_over_128=('text_clean', lambda texts: texts.str.split().str.len().gt(128).mean()),
        # despite the name, this is the mean number of words per row
        avg_string_length=('text_clean', lambda texts: texts.str.split().str.len().mean()),
        proportion_title=('title', lambda flags: flags.mean()),
    )
    .reset_index()
)

megafake_summary_df

Unnamed: 0,dataset,num_samples,proportion_true,proportion_over_64,proportion_over_128,avg_string_length,proportion_title
0,MegaFake,203530,0.0,5e-06,0.0,56.744721,0.0


In [476]:
# Hold out 20% of the chunk rows, stratified on (generation_technique, binary_label);
# the fixed random_state makes the split repeatable.
# NOTE(review): the split is per chunk row, so chunks of one source text can end up on both
# sides — confirm this is acceptable or split on the source row instead.
test_fraction = 0.2
megafake_strata = megafake_df[['generation_technique', 'binary_label']]
megafake_train_df, megafake_test_df = train_test_split(
    megafake_df,
    test_size=test_fraction,
    random_state=42,
    stratify=megafake_strata,
)

In [480]:
megafake_test_df.value_counts('generation_technique')

generation_technique
style based          22283
content based        13396
story based          11035
integration based     7305
Name: count, dtype: int64

In [481]:
megafake_test_df.groupby(['generation_technique', 'binary_label']).size().reset_index(name='count')

Unnamed: 0,generation_technique,binary_label,count
0,content based,0,13396
1,integration based,0,2510
2,integration based,1,4795
3,story based,0,11035
4,style based,0,13766
5,style based,1,8517


In [482]:
# Write the MegaFake test/train splits to the "Combined" folder as CSV, without the row index.
megafake_output_dir = file_path + "/Combined"
for megafake_frame, megafake_csv_name in (
    (megafake_test_df, "megafake_test_df.csv"),
    (megafake_train_df, "megafake_train_df.csv"),
):
    megafake_frame.to_csv(megafake_output_dir + "/" + megafake_csv_name, index=False)