# TruthSeeker Misinformation EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from pathlib import Path

# Notebook-wide defaults: show every column when a frame is rendered, apply
# the seaborn theme, then set the default figure size. The figure size is set
# after the theme call, as in the original ordering.
pd.options.display.max_columns = None
sns.set_theme(palette="crest", style="whitegrid")
plt.rc('figure', figsize=(8, 5))

In [None]:
# Update this path if the dataset is stored elsewhere
DATA_PATH = Path('data/truthseeker.csv')

# Load the CSV when it is present; otherwise stop immediately with a message
# that names the path that was tried.
if DATA_PATH.exists():
    raw_df = pd.read_csv(DATA_PATH)
else:
    raise FileNotFoundError(
        f"Could not find the TruthSeeker dataset at {DATA_PATH}. "
        "Update `DATA_PATH` to point to the CSV file provided by the research team."
    )

print(f"Loaded {DATA_PATH} with shape: {raw_df.shape}")
# Preview the first rows as the cell output.
raw_df.head()

## Initial Overview

In [None]:
# Work on a copy so `raw_df` keeps the data exactly as it was loaded.
df = raw_df.copy(deep=True)

# Print a banner followed by the frame's structural summary.
print("--- Dataset Info ---")
df.info()

In [None]:
# Summary statistics restricted to the numeric columns
# ('number' is the string alias pandas accepts for np.number).
df.describe(include=['number'])

In [None]:
# Per-column count of missing values, largest first, keeping only the columns
# that actually have gaps.
missing = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)

# Draw the bar chart only when at least one column has missing values.
if len(missing) > 0:
    ax = missing.to_frame('missing_count').plot.bar()
    ax.set(
        title='Missing Values per Column',
        ylabel='Count of Missing Values',
        xlabel='Feature',
    )
    plt.tight_layout()
    plt.show()

# Echo the counts as the cell output.
missing

In [None]:
# Number of rows flagged by `DataFrame.duplicated()` with its default settings.
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

## Label Distributions

In [None]:
# Column names under which each label may appear, in order of preference.
target_candidates = ['BinaryNumTarget', 'is_fake', 'label', 'target']
bot_candidates = ['BotScoreBinary', 'is_bot', 'bot_label']

# Map each label role to the first candidate that exists in `df`
# (None when none of the candidates is present).
label_cols = {
    role: next((name for name in candidates if name in df.columns), None)
    for role, candidates in {
        'target': target_candidates,
        'bot': bot_candidates,
    }.items()
}
label_cols

In [None]:
# Resolve the misinformation and bot label columns: the first candidate name
# present in `df`, or None when no candidate exists.
target_col = next((c for c in ['BinaryNumTarget', 'is_fake', 'label', 'target'] if c in df.columns), None)
bot_col = next((c for c in ['BotScoreBinary', 'is_bot', 'bot_label'] if c in df.columns), None)

# Class balance of the misinformation label.
if target_col:
    display(df[target_col].value_counts().rename('count'))
    sns.countplot(x=target_col, data=df)
    plt.title('Distribution of Misinformation Labels')
    plt.tight_layout()
    plt.show()

# Class balance of the bot label.
if bot_col:
    display(df[bot_col].value_counts().rename('count'))
    sns.countplot(x=bot_col, data=df)
    plt.title('Distribution of Bot vs Human Accounts')
    plt.tight_layout()
    plt.show()

# Joint distribution of the two labels, as raw counts and as row percentages.
if target_col and bot_col:
    cross = pd.crosstab(df[target_col], df[bot_col])
    display(cross)
    sns.heatmap(cross, annot=True, fmt='d', cmap='flare')
    plt.title('Truth vs Bot Crosstab (Counts)')
    plt.tight_layout()
    plt.show()

    # `cross_norm` holds row fractions in [0, 1]; `cross_pct` is the same table
    # scaled to percentages.
    cross_norm = pd.crosstab(df[target_col], df[bot_col], normalize='index')
    cross_pct = (cross_norm * 100).round(2)
    display(cross_pct)
    # Plot the percentage table so the annotations match the "Row %" title
    # and the table displayed above (previously the heatmap showed fractions).
    sns.heatmap(cross_pct, annot=True, fmt='.2f', cmap='crest')
    plt.title('Truth vs Bot Crosstab (Row %)')
    plt.tight_layout()
    plt.show()

## Feature Distributions

In [None]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# The label columns are typically numeric themselves. Leaving them in the
# feature list selects the label twice (duplicate column names, which can make
# `melt` fail) and facets the label against itself, so only the remaining
# numeric columns are plotted.
feature_cols = [c for c in numeric_cols if c not in (target_col, bot_col)]

# One KDE facet grid per available label; each facet is one numeric feature
# with one density curve per label value.
for label_col, label_name in ((target_col, 'Misinformation'), (bot_col, 'Bot')):
    if not (label_col and feature_cols):
        continue
    melted = df[feature_cols + [label_col]].melt(id_vars=label_col, value_vars=feature_cols)
    g = sns.FacetGrid(melted, col='variable', col_wrap=3, sharex=False, sharey=False, hue=label_col)
    # common_norm=False normalises every label group separately so the curves
    # stay comparable when the classes are imbalanced.
    g.map_dataframe(sns.kdeplot, x='value', fill=True, common_norm=False, alpha=0.5)
    g.add_legend(title=label_col)
    # Leave head-room above the facets for the figure-level title.
    g.fig.subplots_adjust(top=0.9)
    g.fig.suptitle(f'Numeric Feature Distributions by {label_name} Label')
    plt.show()

## Text-Based Signals

In [None]:
# First text-like column present in `df`, or None when there is none.
text_col = next((c for c in ['tweet', 'text', 'content'] if c in df.columns), None)
if text_col:
    # Replace missing tweets with empty strings before casting: astype(str)
    # alone turns NaN into the literal 'nan', which would be counted as
    # 1 word / 3 characters instead of 0.
    tweet_text = df[text_col].fillna('').astype(str)
    df['tweet_word_count'] = tweet_text.str.split().str.len()
    df['tweet_char_count'] = tweet_text.str.len()

    # Word-count box plots: left panel split by bot label, right panel by
    # truth label; each falls back to a single ungrouped box when its label
    # column is absent.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    if bot_col:
        sns.boxplot(data=df, x=bot_col, y='tweet_word_count', ax=axes[0])
    else:
        sns.boxplot(data=df, y='tweet_word_count', ax=axes[0])
    axes[0].set_title('Word Count by Bot Label' if bot_col else 'Tweet Word Count')
    axes[0].set_xlabel(bot_col if bot_col else '')

    if target_col:
        sns.boxplot(data=df, x=target_col, y='tweet_word_count', ax=axes[1])
    else:
        sns.boxplot(data=df, y='tweet_word_count', ax=axes[1])
    axes[1].set_title('Word Count by Truth Label' if target_col else 'Tweet Word Count')
    axes[1].set_xlabel(target_col if target_col else '')
    plt.tight_layout()
    plt.show()

    # Character-count densities with the same left/right layout.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    if bot_col:
        sns.kdeplot(data=df, x='tweet_char_count', hue=bot_col, fill=True, alpha=0.4, ax=axes[0])
    else:
        sns.kdeplot(df['tweet_char_count'], fill=True, ax=axes[0])
    axes[0].set_title('Character Count Distribution by Bot Label' if bot_col else 'Character Count Distribution')

    if target_col:
        sns.kdeplot(data=df, x='tweet_char_count', hue=target_col, fill=True, alpha=0.4, ax=axes[1])
    else:
        sns.kdeplot(df['tweet_char_count'], fill=True, ax=axes[1])
    axes[1].set_title('Character Count Distribution by Truth Label' if target_col else 'Character Count Distribution')
    plt.tight_layout()
    plt.show()

In [None]:
if text_col:
    def extract_items(pattern):
        """Return the 15 most frequent matches of `pattern` in the text column.

        The text is lower-cased before matching, so the returned index holds
        lower-case tokens and the values are their occurrence counts.
        """
        lowered = df[text_col].astype(str).str.lower()
        matches_per_row = lowered.str.findall(pattern)
        # explode() gives one row per match; value_counts() tallies them.
        return matches_per_row.explode().value_counts().head(15)

    hashtags = extract_items(r'#\w+')
    mentions = extract_items(r'@\w+')

    # One horizontal bar chart per token type, skipped when nothing matched.
    for counts, title in ((hashtags, 'Top Hashtags'), (mentions, 'Top Mentions')):
        if counts.empty:
            continue
        sns.barplot(x=counts.values, y=counts.index)
        plt.title(title)
        plt.xlabel('Frequency')
        plt.tight_layout()
        plt.show()

## Bot vs Human Behaviour

In [None]:
if bot_col:
    # Aggregate numeric features only. mean/median/std are undefined for text
    # columns, and pandas >= 2.0 raises TypeError for them instead of silently
    # dropping them as earlier versions did. The grouping key is excluded.
    stat_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != bot_col]
    if stat_cols:
        group_stats = df.groupby(bot_col)[stat_cols].agg(['mean', 'median', 'std'])
        display(group_stats)

    # Share of each truth label within every bot class (rows sum to 100).
    if target_col:
        bot_target = pd.crosstab(df[bot_col], df[target_col], normalize='index') * 100
        display(bot_target.round(2))

    # 'tweet_word_count' is created by the text-signals cell when a text
    # column is available.
    if text_col:
        sns.violinplot(data=df, x=bot_col, y='tweet_word_count')
        plt.title('Tweet Word Count Distribution by Bot Label')
        plt.tight_layout()
        plt.show()

## Temporal Dynamics

In [None]:
# First timestamp-like column present in `df`, or None when there is none.
date_col = next((c for c in ['created_at', 'timestamp', 'date'] if c in df.columns), None)
if date_col:
    # Unparseable values become NaT and are excluded from the timeline below.
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    timeline = df.dropna(subset=[date_col]).copy()
    if not timeline.empty:
        # Truncate to midnight but stay in datetime dtype, so the grouped
        # result has a DatetimeIndex that can be reindexed to a daily frequency.
        timeline['day'] = timeline[date_col].dt.normalize()

        # asfreq('D', fill_value=0) inserts days without posts as explicit
        # zeros instead of letting the line skip over the gap.
        daily_counts = timeline.groupby('day').size().asfreq('D', fill_value=0)
        daily_counts.plot()
        plt.title('Volume of Posts Over Time')
        plt.ylabel('Number of Posts')
        plt.tight_layout()
        plt.show()

        # With one row per calendar day, a 7-row rolling window is a true
        # 7-day window; without the reindexing, gaps in the data would stretch
        # the window across more than a week.
        if target_col:
            daily_target = (
                timeline.groupby(['day', target_col]).size()
                .unstack(fill_value=0)
                .asfreq('D', fill_value=0)
            )
            daily_target.rolling(window=7, min_periods=1).mean().plot()
            plt.title('7-Day Rolling Average of Labels Over Time')
            plt.ylabel('Posts (7-day avg)')
            plt.tight_layout()
            plt.show()

        if bot_col:
            daily_bot = (
                timeline.groupby(['day', bot_col]).size()
                .unstack(fill_value=0)
                .asfreq('D', fill_value=0)
            )
            daily_bot.rolling(window=7, min_periods=1).mean().plot()
            plt.title('7-Day Rolling Average of Bot Activity')
            plt.ylabel('Posts (7-day avg)')
            plt.tight_layout()
            plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlations need at least two numeric columns.
numeric_df = df.select_dtypes(include=[np.number])
if numeric_df.shape[1] > 1:
    corr = numeric_df.corr()
    sns.heatmap(corr, cmap='vlag', center=0, annot=False)
    plt.title('Correlation Heatmap of Numeric Features')
    plt.tight_layout()
    plt.show()
    # A bare `corr` nested inside the `if` is never rendered: the notebook only
    # echoes a cell's final top-level expression. Display the matrix explicitly.
    display(corr)

## Save Processed Data (Optional)

In [None]:
# Destination for the enriched frame (original columns plus the features
# added in earlier cells).
OUTPUT_PATH = Path('data/truthseeker_processed.csv')
# Create the output folder when it is missing; to_csv does not create parent
# directories and would fail otherwise.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)
print(f"Processed data saved to {OUTPUT_PATH.resolve()}")