# EDA Visuals — Fraud Detection Dataset

Load the first 10,000 rows of the dataset and produce visual summaries; every figure is saved as a PNG under `/mnt/data/eda_figs`. Comments are kept minimal.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime

# Make sure the figure output directory exists before any cell saves into it.
os.makedirs('/mnt/data/eda_figs', exist_ok=True)

# Primary location is the Kaggle input mount; a local copy is the fallback.
csv_path = '/kaggle/input/fraud-detection/fraudTest.csv'
try:
    df = pd.read_csv(csv_path, nrows=10000)
except OSError as exc:
    # Only a missing/unreadable file triggers the fallback, so that genuine
    # problems with the primary file's contents are not silently masked.
    print(f'could not read {csv_path} ({exc}); falling back to /mnt/data/fraudTest.csv')
    df = pd.read_csv('/mnt/data/fraudTest.csv', nrows=10000)

# Untouched copy of the raw sample, taken before derived columns are added.
df_original = df.copy()
df.shape


In [None]:

# Locate and parse the transaction timestamp column.
# Names containing 'trans_date', or containing both 'date' and 'trans'
# (`and` binds tighter than `or`; the parentheses only make that explicit).
datetime_cols = [c for c in df.columns if 'trans_date' in c or ('date' in c and 'trans' in c)]

# The first column whose name contains a known marker wins.
# 'trans_date_trans_time' is already covered by the 'trans_date' substring.
dt_col = next(
    (c for c in df.columns if 'trans_date' in c or 'transaction_date' in c),
    None,
)

# Last resort: a column literally named 'date'. (Exact matches for the other
# known names would already have been found by the substring search above.)
if dt_col is None and 'date' in df.columns:
    dt_col = 'date'

# is_datetime64_any_dtype also copes with tz-aware and other pandas extension
# dtypes, on which np.issubdtype raises TypeError.
if dt_col is not None and not pd.api.types.is_datetime64_any_dtype(df[dt_col]):
    try:
        df[dt_col] = pd.to_datetime(df[dt_col])
    except (ValueError, TypeError, OverflowError) as exc:
        # Best effort: leave the column as-is, but say why the time-based
        # features (and the plots that depend on them) will be missing.
        print(f'could not parse {dt_col!r} as datetime: {exc}')

# Derive the time features only when the column really is datetime-typed.
if dt_col is not None and pd.api.types.is_datetime64_any_dtype(df[dt_col]):
    df['trans_dt'] = df[dt_col]
    df['trans_hour'] = df['trans_dt'].dt.hour
    df['trans_date'] = df['trans_dt'].dt.date

# Pick the first recognised amount column (in priority order) and expose it
# as a numeric 'amt' column; values that cannot be converted become NaN.
amt_col = next(
    (
        name
        for name in ('amt', 'amount', 'trans_amount', 'transaction_amount')
        if name in df.columns
    ),
    None,
)

if not amt_col:
    # No amount column present: keep the schema stable with an all-NaN column.
    df['amt'] = np.nan
else:
    df['amt'] = pd.to_numeric(df[amt_col], errors='coerce')

# Pick the first recognised label column (in priority order) and expose it
# as an integer 'is_fraud' column.
label_col = next(
    (
        name
        for name in ('is_fraud', 'isFraud', 'fraud', 'label')
        if name in df.columns
    ),
    None,
)

if not label_col:
    # No label column present: every row is marked as non-fraud.
    df['is_fraud'] = 0
else:
    # Unparseable / missing labels are coerced to NaN and then counted as 0.
    df['is_fraud'] = (
        pd.to_numeric(df[label_col], errors='coerce')
        .fillna(0)
        .astype(int)
    )

def has(col):
    """Return True when `col` is a column label of the global `df`.

    `df` is looked up when the function is called, so columns added after
    this definition are seen as well.
    """
    return col in df.columns


# Short report of what the preprocessing steps above detected.
for _caption, _value in (
    ('rows, cols:', df.shape),
    ('datetime col:', dt_col),
    ('amount col used:', amt_col),
    ('label col used:', label_col),
):
    print(_caption, _value)


In [None]:

# Missing-value matrix: 1 where a cell is null, 0 otherwise. Transposed for
# display so each dataframe column becomes one horizontal band.
missing = df.isna().astype(int)

fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(missing.T, aspect='auto', interpolation='nearest')
ax.set(xlabel='Row index (sample)', title='Missing values (1 = missing)')

# One y tick per dataframe column, labelled with the column name.
ax.set_yticks(range(len(missing.columns)))
ax.set_yticklabels(missing.columns)

fig.tight_layout()
fig.savefig('/mnt/data/eda_figs/missing_matrix.png', dpi=150)
plt.close(fig)
'/mnt/data/eda_figs/missing_matrix.png'


In [None]:

# Class balance: number of rows per fraud label (0 and 1).
counts = df['is_fraud'].value_counts().sort_index()

# A label that never occurs is shown as a zero-height bar.
bar_heights = [counts.get(label, 0) for label in (0, 1)]

fig, ax = plt.subplots(figsize=(5, 4))
ax.bar(['Non-Fraud', 'Fraud'], bar_heights)
ax.set(ylabel='Count', title='Fraud vs Non-Fraud Count')

# Write the exact count just above each bar.
for xpos, height in enumerate(bar_heights):
    ax.text(xpos, height * 1.01, str(int(height)), ha='center')

fig.tight_layout()
fig.savefig('/mnt/data/eda_figs/class_balance.png', dpi=150)
plt.close(fig)
'/mnt/data/eda_figs/class_balance.png'


In [None]:

# Amount distribution on a linear and on a log1p scale.
# Skipped entirely when there is no usable amount value.
amounts = df['amt'].dropna()
if len(amounts) > 0:
    # Linear scale.
    fig, ax = plt.subplots(figsize=(7, 4))
    amounts.plot.hist(bins=50, ax=ax)
    ax.set_title('Transaction Amount Distribution (linear)')
    ax.set_xlabel('Amount')
    fig.tight_layout()
    fig.savefig('/mnt/data/eda_figs/amt_hist_linear.png', dpi=150)
    plt.close(fig)

    # Log scale; negative amounts are clipped to 0 before log1p is applied.
    fig, ax = plt.subplots(figsize=(7, 4))
    log_amounts = np.log1p(amounts.clip(lower=0))
    ax.hist(log_amounts, bins=50)
    ax.set_title('Transaction Amount Distribution (log1p)')
    ax.set_xlabel('log1p(Amount)')
    fig.tight_layout()
    fig.savefig('/mnt/data/eda_figs/amt_hist_log.png', dpi=150)
    plt.close(fig)

'/mnt/data/eda_figs/amt_hist_linear.png'


In [None]:

# Box plot of the amount per fraud label (outliers hidden).
# Skipped when there is no usable amount value.
if not df['amt'].isna().all():
    fig, ax = plt.subplots(figsize=(6,4))
    groups = [df.loc[df['is_fraud']==lab,'amt'].dropna() for lab in [0,1]]
    ax.boxplot(groups, showfliers=False)
    # Name the boxes through the axis instead of boxplot's `labels=` keyword:
    # that keyword was renamed to `tick_labels` in Matplotlib 3.9 and the old
    # spelling is deprecated, whereas this form works on every version.
    # Boxes are drawn at x = 1..N by default.
    ax.set_xticks([1, 2])
    ax.set_xticklabels(['Non-Fraud','Fraud'])
    ax.set_ylabel('Amount')
    ax.set_title('Amount by Fraud Label (no outliers shown)')
    plt.tight_layout()
    plt.savefig('/mnt/data/eda_figs/amt_by_label_box.png', dpi=150)
    plt.close(fig)
'/mnt/data/eda_figs/amt_by_label_box.png'


In [None]:

# Kernel density estimate of the amount per fraud label.
# Skipped when there is no usable amount value.
if not df['amt'].isna().all():
    fig, ax = plt.subplots(figsize=(7,4))
    for lab in [0,1]:
        s = df.loc[df['is_fraud']==lab,'amt'].dropna()
        if len(s) == 0:
            continue
        # Cap each class at 2000 points (fixed seed) to keep the KDE cheap.
        s = s.sample(min(len(s),2000), random_state=1)
        # A KDE cannot be estimated from a single point or from identical
        # values; skip such a class instead of aborting the whole cell.
        if s.nunique() < 2:
            print(f'skipping KDE for label={lab}: need at least 2 distinct amounts')
            continue
        s.plot(kind='kde', ax=ax, label=f'label={lab}')
    ax.set_title('Amount Density by Label (sampled)')
    # Only draw a legend when at least one curve was plotted.
    if ax.get_legend_handles_labels()[0]:
        ax.legend()
    plt.tight_layout()
    plt.savefig('/mnt/data/eda_figs/amt_density_by_label.png', dpi=150)
    plt.close(fig)
'/mnt/data/eda_figs/amt_density_by_label.png'


In [None]:

# Hour-of-day views; only available when the timestamp column was parsed
# and 'trans_hour' was derived from it.
if 'trans_hour' in df.columns:
    fig, ax = plt.subplots(figsize=(8,4))
    # One bin per hour, centred on the integer hour. A bare `bins=24` spreads
    # 24 bins over the observed min..max range, so the bins stop lining up
    # with whole hours whenever the data does not span exactly 0-23.
    hour_edges = np.arange(25) - 0.5
    df['trans_hour'].dropna().astype(int).plot.hist(bins=hour_edges, ax=ax)
    ax.set_title('Transactions by Hour (sample)')
    ax.set_xlabel('Hour of day')
    plt.tight_layout()
    plt.savefig('/mnt/data/eda_figs/transactions_by_hour.png', dpi=150)
    plt.close(fig)

    # Mean of the 0/1 label per hour = fraud rate; hours with no
    # transactions in the sample are shown as 0.
    hr = df.groupby('trans_hour')['is_fraud'].mean().reindex(range(24)).fillna(0)
    fig, ax = plt.subplots(figsize=(8,4))
    ax.plot(hr.index, hr.values, marker='o')
    ax.set_xlabel('Hour')
    ax.set_ylabel('Fraud rate (mean)')
    ax.set_title('Fraud rate by Hour of Day')
    plt.tight_layout()
    plt.savefig('/mnt/data/eda_figs/fraud_rate_by_hour.png', dpi=150)
    plt.close(fig)
'/mnt/data/eda_figs/transactions_by_hour.png'


In [None]:

# Correlation heatmap over the numeric columns; needs at least two of them.
num = df.select_dtypes(include=[np.number]).copy()
n_numeric = num.shape[1]
if n_numeric >= 2:
    corr = num.corr()
    tick_positions = range(len(corr.columns))

    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(corr, aspect='auto', interpolation='nearest')

    # Both axes list the same columns, in the same order.
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticklabels(corr.columns)

    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    ax.set_title('Correlation matrix (numeric)')
    fig.tight_layout()
    fig.savefig('/mnt/data/eda_figs/corr_matrix_numeric.png', dpi=150)
    plt.close(fig)
'/mnt/data/eda_figs/corr_matrix_numeric.png'


In [None]:

# Horizontal bar chart of the 20 most frequent values for each categorical
# column that is present in the data.
candidates = ['category','city','state','merchant','job']
for c in candidates:
    if c not in df.columns:
        continue

    # Reverse the order so the most frequent value ends up at the top of
    # the chart (barh draws the first entry at the bottom).
    top_counts = df[c].value_counts().nlargest(20).iloc[::-1]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.barh(top_counts.index, top_counts.values)
    ax.set_title(f'Top 20 {c} (by transaction count)')
    fig.tight_layout()

    # Keep the column name usable as a file name.
    safe = c.replace('/','_')
    fig.savefig(f'/mnt/data/eda_figs/top20_{safe}.png', dpi=150)
    plt.close(fig)
'/mnt/data/eda_figs/'


In [None]:

# Fraud rate for the 10 most frequent values of each categorical column
# that is present in the data, sorted from highest to lowest rate.
for c in ['category','city','state','merchant']:
    if c not in df.columns:
        continue
    top = df[c].value_counts().nlargest(10).index.tolist()
    # Mean of the 0/1 label within each of the top values = fraud rate.
    rate = df[df[c].isin(top)].groupby(c)['is_fraud'].mean().sort_values(ascending=False)
    fig, ax = plt.subplots(figsize=(8,4))
    # Draw the bars at explicit numeric positions and pin the ticks to them
    # before labelling. Calling set_xticklabels without fixed tick positions
    # triggers Matplotlib's fixed-ticks warning and can mislabel the bars.
    positions = list(range(len(rate)))
    ax.bar(positions, rate.values)
    ax.set_xticks(positions)
    ax.set_xticklabels(rate.index.astype(str), rotation=45, ha='right')
    ax.set_title(f'Fraud rate for top {len(rate)} {c}')
    plt.tight_layout()
    plt.savefig(f'/mnt/data/eda_figs/fraud_rate_top_{c}.png', dpi=150)
    plt.close(fig)
'/mnt/data/eda_figs/'


In [None]:

# Persist the sample, including the derived columns added above, without
# writing the dataframe index as an extra column.
proc_path = '/mnt/data/eda_sample_10000.csv'
df.to_csv(path_or_buf=proc_path, index=False)
proc_path
