# Exploratory Data Analysis (EDA) - Crypto Transaction Dataset
This notebook performs EDA on the crypto transaction dataset to understand its structure, distributions, and potential features for fraud detection.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence only deprecation-style noise from the data/plotting libraries. A blanket
# 'ignore' would also hide warnings that can point at real problems in the data
# (for example numeric RuntimeWarnings or warnings raised while parsing the CSV).
for noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=noisy_category)

# Apply the white-grid seaborn theme to every figure in the notebook.
sns.set(style="whitegrid")

In [None]:
# Read the raw transaction table from the CSV file (path is relative to the
# notebook's working directory).
csv_path = 'transaction_dataset.csv'
df = pd.read_csv(csv_path)

# Quick sanity check on the load: (number of rows, number of columns).
print(f"Dataset shape: {df.shape}")

In [None]:
# First look at the data: display the first five rows.
df.head(n=5)

In [None]:
# Basic info and data types
# DataFrame.info() prints a per-column summary (non-null count and dtype) plus
# the frame's memory usage; it writes to stdout and returns None.
df.info()

In [None]:
# Count nulls per column, then keep only the columns that actually contain some.
null_counts = df.isnull().sum()
missing_values = null_counts[null_counts > 0]

if missing_values.empty:
    print("No missing values found.")
else:
    print("Columns with missing values:")
    print(missing_values)

In [None]:
# Class balance of the FLAG target: a bar chart of raw counts, followed by the
# share of each class.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='FLAG', data=df, palette='viridis', ax=ax)
ax.set_title('Distribution of Target Variable (FLAG)')
ax.set_xlabel('Flag (0: Non-Fraud, 1: Fraud)')
ax.set_ylabel('Count')
plt.show()

# normalize=True turns the counts into fractions of all rows.
flag_shares = df['FLAG'].value_counts(normalize=True)
print("Target Class Distribution:")
print(flag_shares)

In [None]:
# Summary statistics for the numeric columns, transposed so that each feature
# is a row and each statistic is a column.
df.describe().transpose()

In [None]:
# Distribution of key features
features_to_plot = ['Avg min between sent tnx', 'Avg min between received tnx', 'Total Ether sent', 'total ether received']

# Resolve each requested name against the real header. An exact match wins;
# otherwise fall back to a match that ignores case and surrounding whitespace,
# so a small mismatch in a label does not abort the whole cell with a KeyError.
normalised_columns = {str(col).strip().lower(): col for col in df.columns}
resolved_features = []
for feature in features_to_plot:
    lookup_key = feature.strip().lower()
    if feature in df.columns:
        resolved_features.append(feature)
    elif lookup_key in normalised_columns:
        resolved_features.append(normalised_columns[lookup_key])
    else:
        # Surface the problem instead of failing silently or crashing.
        print(f"Skipping '{feature}': no matching column in the dataset.")

if resolved_features:
    # Two panels per row; four features give the original 2x2 grid at 15x10 inches.
    n_cols = 2
    n_rows = -(-len(resolved_features) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, feature in zip(flat_axes, resolved_features):
        sns.histplot(df[feature], kde=True, bins=50, color='skyblue', ax=ax)
        ax.set_title(f'Distribution of {feature}')
        ax.set_yscale('log')  # Distribution might be skewed

    # Hide panels left over when the number of features is odd.
    for ax in flat_axes[len(resolved_features):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap.
# `corr` is kept as a notebook-level variable for later cells.
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr()

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# Most correlated features with FLAG
# Two entries are removed before ranking:
#   * FLAG itself, whose self-correlation is always 1.0 and would otherwise
#     occupy the first slot of the "positive" list;
#   * NaN coefficients, which pandas produces for columns without variance.
#     sort_values() places NaN last, so leaving them in would fill the
#     "negative" list below with NaN rows instead of real correlations.
flag_corr = (
    corr['FLAG']
    .drop(labels='FLAG')
    .dropna()
    .sort_values(ascending=False)
)

print("Top Positive Correlations with FLAG:")
print(flag_corr.head(10))
print("\nTop Negative Correlations with FLAG:")
print(flag_corr.tail(10))

## Data Cleaning & Feature Selection Recommendations

Based on the analysis above, several issues have been identified that must be addressed before model training:

1.  **Data Leakage**: The columns `Unnamed: 0` and `Index` show very high correlation with the target `FLAG`. These are incremental row-ID columns, and the rows were likely ordered by the target when the dataset was created, so the IDs indirectly encode the label. Including them would produce an over-optimistic model that does not generalize to new data.
2.  **Zero-Variance Features**: Several ERC20 columns (e.g., `ERC20 avg time between sent tnx` and `ERC20 avg time between rec tnx`) contain only a single unique value (0.0). They carry no predictive power and should be removed.
3.  **Address Identification**: The `Address` column is likely unique per row and should be dropped to avoid the model memorizing specific addresses.
4.  **Missing Values**: 829 rows have missing values across most ERC20 columns. These rows likely represent wallets with no ERC20 activity. Filling these with `0` is a reasonable strategy.

In [None]:
# Identifying features to drop
# Row identifiers / ID-like columns named in the recommendations above.
leakage_features = ['Unnamed: 0', 'Index', 'Address']

# Identifying constant (zero-variance) features
# Count distinct non-null values instead of testing `std() == 0`: the standard
# deviation is a floating-point result that may come out as a tiny non-zero
# number for a constant column, and it is NaN (never equal to 0) for a column
# that is entirely missing. `nunique(dropna=True) <= 1` catches both cases.
numeric_columns = df.select_dtypes(include=[np.number]).columns
constant_features = [col for col in numeric_columns if df[col].nunique(dropna=True) <= 1]

print(f"Leakage features to drop: {leakage_features}")
print(f"Constant features to drop: {constant_features}")

# Recommendation: Drop these features
# df_cleaned = df.drop(columns=leakage_features + constant_features)
# print(f'New shape after dropping: {df_cleaned.shape}')