# Exploratory Data Analysis (EDA)

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn "whitegrid" theme to every plot drawn below.
# set_theme() is the current name for this call; sns.set() is only kept as a
# backward-compatible alias.
sns.set_theme(style="whitegrid")

## 2. Load Dataset

In [None]:
# Load the credit-card transactions CSV into `df`, which the rest of the
# analysis reads from.
try:
    df = pd.read_csv('data/creditcard.csv')
except FileNotFoundError as err:
    # Fail fast with the actionable hint. Printing and carrying on would leave
    # `df` undefined, so the next step would die with an unrelated NameError.
    raise FileNotFoundError(
        "Dataset not found. Please run download_data.py first."
    ) from err
else:
    # Only reached when the read succeeded.
    print("Dataset loaded successfully.")

## 3. Data Overview

In [None]:
# Preview the top of the table (first five rows) as a quick sanity check.
df.head(5)

In [None]:
# Print a concise schema summary: column names, data types, and non-null
# counts per column.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# describe() reports on the numeric columns.
df.describe()

## 4. Check for Missing Values

In [None]:
# Count the nulls in each column, then keep the worst case: a result of 0
# means no column has any missing entries.
nulls_per_column = df.isna().sum()
missing_values = nulls_per_column.max()
print(f"Max missing values in any column: {missing_values}")

## 5. Class Imbalance (Fraud vs Normal)
The dataset is known to be highly imbalanced: fraudulent transactions make up only a very small share of all records. Let's visualize the class counts.

In [None]:
# Count how many rows carry each class label.
class_counts = df['Class'].value_counts()
# Interpolate the counts instead of passing them as a second print() argument:
# the default separator would insert a space after the newline and push the
# first row of the table out of alignment.
print(f"Class distribution:\n{class_counts}")

# Bar chart of the same counts, one bar per class label.
plt.figure(figsize=(6, 4))
sns.countplot(x='Class', data=df)
plt.title('Class Distribution (0: Normal, 1: Fraud)')
plt.xlabel('Class')
plt.ylabel('Count')
plt.yscale('log')  # Log scale so the minority class bar stays visible
plt.show()

## 6. Distribution of Transaction Amount and Time

In [None]:
# Side-by-side histograms (with KDE overlay) of the 'Amount' and 'Time' columns.
fig, ax = plt.subplots(1, 2, figsize=(18, 4))

# Pull the raw NumPy arrays out once; both are reused below.
amount_val = df['Amount'].values
time_val = df['Time'].values

# Left panel: transaction amounts.
sns.histplot(amount_val, ax=ax[0], color='r', bins=50, kde=True)
ax[0].set_title('Distribution of Transaction Amount')
# ndarray.min() is vectorized; the builtin min() would walk the array one
# element at a time in Python. The upper x-limit is fixed at 20000.
ax[0].set_xlim([amount_val.min(), 20000])

# Right panel: transaction times.
sns.histplot(time_val, ax=ax[1], color='b', bins=50, kde=True)
ax[1].set_title('Distribution of Transaction Time')

plt.show()