# DeFi Credit Scoring - Exploratory Data Analysis

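This notebook provides an exploratory analysis of Aave V2 transaction data and demonstrates the credit scoring system. It uses relative paths (`../src`, `../data`), so run it from a working directory that sits alongside those folders.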

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import sys
from pathlib import Path

# Add src to path
sys.path.append(str(Path.cwd().parent / 'src'))

from data_processor import DataProcessor
from feature_engineer import FeatureEngineer
from model_trainer import ModelTrainer
from scorer import WalletScorer
from analyzer import ScoreAnalyzer

# Global plot styling for every figure in this notebook.
# Matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8'; older
# releases only ship it as 'seaborn'. Pick whichever name this install knows
# so the notebook does not fail with OSError on an older matplotlib.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")

## 1. Load and Process Sample Data

In [None]:
# Load sample transaction data.
# The path is relative to the kernel's working directory.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding (JSON interchange text is UTF-8).
with open('../data/sample_transactions.json', 'r', encoding='utf-8') as f:
    transactions_data = json.load(f)

print(f"Loaded {len(transactions_data)} sample transactions")

# Preview the first record only when there is one; indexing [0] on an empty
# dataset would otherwise abort the cell with IndexError.
if transactions_data:
    print("\nSample transaction:")
    print(json.dumps(transactions_data[0], indent=2))
else:
    print("\nNo transactions found in the sample file.")

In [None]:
# Run the raw records through DataProcessor. The result is used below via
# the DataFrame API (.shape / .columns / .head).
processor = DataProcessor()
processed_df = processor.process_transactions(transactions_data)

print(f"Processed data shape: {processed_df.shape}")
# Heading and column list are emitted on consecutive lines.
print("\nProcessed data columns:", processed_df.columns.tolist(), sep="\n")
print("\nFirst few rows:")

# Bare last expression so Jupyter renders the preview as a rich table.
processed_df.head()

## 2. Feature Engineering

In [None]:
# Build the feature table from the processed transactions.
engineer = FeatureEngineer()
features_df = engineer.engineer_features(processed_df)

print(
    f"Features shape: {features_df.shape}",
    # One column is not counted as a feature -- presumably the wallet
    # identifier; confirm against FeatureEngineer.
    f"Number of features: {features_df.shape[1] - 1}",
    "\nSample features:",
    sep="\n",
)

# Bare last expression so Jupyter renders the preview as a rich table.
features_df.head()

In [None]:
# Visualize feature distributions as histograms on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

key_features = ['total_transactions', 'repayment_ratio', 'leverage_ratio',
                'liquidation_count', 'unique_assets', 'account_age_days']

# Restrict to the features actually present in the table, so a missing
# column does not leave an empty, unlabelled panel in the middle of the grid.
available_features = [name for name in key_features if name in features_df.columns]

for ax, feature in zip(axes, available_features):
    features_df[feature].hist(bins=20, ax=ax, alpha=0.7)
    ax.set_title(feature.replace("_", " ").title())
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

# Hide any panels left over when fewer features are available than grid slots.
for unused_ax in axes[len(available_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 3. Model Training and Scoring

In [None]:
# Fit the model on the engineered features. The returned package is indexed
# by key, and its 'model' entry is the fitted estimator object.
trainer = ModelTrainer()
model_package = trainer.train_model(features_df)

print("Model trained successfully!")
fitted_model = model_package['model']
print(f"Model type: {type(fitted_model).__name__}")

In [None]:
# Score every wallet in the feature table with the trained model package.
scorer = WalletScorer(model_package)
scores = scorer.score_wallets(features_df)

print(f"Scored {len(scores)} wallets")

# Summary statistics of the 'credit_score' column.
print("\nScore distribution:")
credit_score_stats = scores['credit_score'].describe()
print(credit_score_stats)

# Number of wallets per 'risk_category' value.
print("\nRisk category distribution:")
risk_category_counts = scores['risk_category'].value_counts()
print(risk_category_counts)

## 4. Score Analysis and Visualization

In [None]:
# Two-panel overview: credit-score histogram (left), risk-category shares (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
hist_ax, pie_ax = axes

# Left panel: histogram with the mean marked by a dashed red line.
mean_score = scores['credit_score'].mean()
scores['credit_score'].hist(bins=20, ax=hist_ax, alpha=0.7, color='skyblue')
hist_ax.set(
    title='Credit Score Distribution',
    xlabel='Credit Score',
    ylabel='Frequency',
)
hist_ax.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.0f}')
hist_ax.legend()

# Right panel: share of wallets in each risk category.
risk_counts = scores['risk_category'].value_counts()
pie_ax.pie(
    risk_counts.values,
    labels=risk_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Risk Category Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Build the analysis dictionary from the scores and features, then print
# the headline numbers from its 'score_distribution' section.
analyzer = ScoreAnalyzer()
analysis = analyzer.generate_comprehensive_analysis(scores, features_df)

print("Analysis Summary:")
score_stats = analysis['score_distribution']
print(f"Total wallets: {score_stats['total_wallets']}")
print(f"Mean score: {score_stats['mean_score']:.2f}")
print(f"Score range: {score_stats['min_score']:.0f} - {score_stats['max_score']:.0f}")

# One line per score bucket: wallet count and share of the total.
print("\nBucket Distribution:")
for bucket_label, bucket_stats in score_stats['bucket_distribution'].items():
    print(f"  {bucket_label}: {bucket_stats['count']} wallets ({bucket_stats['percentage']:.1f}%)")

In [None]:
# Bar charts of the score buckets: absolute counts (left) and percentages (right).
bucket_data = analysis['score_distribution']['bucket_distribution']
buckets = list(bucket_data)
counts = [stats['count'] for stats in bucket_data.values()]
percentages = [stats['percentage'] for stats in bucket_data.values()]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# The panels differ only in data, colour, title and y-label, so draw both
# from a single description table.
panel_specs = (
    (counts, 'lightcoral', 'Wallet Count by Score Bucket', 'Number of Wallets'),
    (percentages, 'lightgreen', 'Percentage Distribution by Score Bucket', 'Percentage (%)'),
)
for panel_ax, (heights, bar_color, panel_title, y_label) in zip(axes, panel_specs):
    panel_ax.bar(buckets, heights, alpha=0.7, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Score Bucket')
    panel_ax.set_ylabel(y_label)
    # Tilt the bucket labels so neighbouring labels do not overlap.
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Behavioral Pattern Analysis

In [None]:
# Join scores and features on the wallet address so that the credit score
# can be correlated with the behavioural features.
merged_df = scores.merge(features_df, on='wallet_address')

# Columns of interest; any that are missing from the merged table are skipped.
key_features = ['credit_score', 'total_transactions', 'repayment_ratio', 'leverage_ratio',
                'liquidation_count', 'unique_assets', 'account_age_days']
correlation_features = [name for name in key_features if name in merged_df.columns]
correlation_matrix = merged_df[correlation_features].corr()

# Annotated heatmap, with the colour scale centred on zero correlation.
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Feature importance bar chart, drawn only when the fitted model exposes
# `feature_importances_`.
fitted_model = model_package['model']
if hasattr(fitted_model, 'feature_importances_'):
    importances = np.asarray(fitted_model.feature_importances_)
    n_features = len(importances)

    # Prefer the real feature names when the model recorded them
    # (scikit-learn estimators fitted on a DataFrame set `feature_names_in_`).
    # If they are absent, or their count does not match the importances,
    # fall back to positional placeholders.
    recorded_names = getattr(fitted_model, 'feature_names_in_', None)
    if recorded_names is not None and len(recorded_names) == n_features:
        feature_names = [str(name) for name in recorded_names]
    else:
        feature_names = [f'Feature_{i}' for i in range(n_features)]

    # Order the bars from most to least important.
    indices = np.argsort(importances)[::-1]

    plt.figure(figsize=(12, 6))
    plt.title('Feature Importance')
    plt.bar(range(n_features), importances[indices], alpha=0.7)
    # Right-align rotated labels so long feature names end under their bar.
    plt.xticks(range(n_features), [feature_names[i] for i in indices], rotation=45, ha='right')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.show()

## 6. Generate Final Report

In [None]:
# Generate the markdown report from the analysis dictionary.
report = analyzer.generate_markdown_report(analysis)

# Save the report. The encoding is explicit so that non-ASCII characters in
# the report are written the same way on every platform, instead of depending
# on the locale's default encoding.
with open('../analysis.md', 'w', encoding='utf-8') as f:
    f.write(report)

print("Analysis report saved to ../analysis.md")
print("\nReport preview:")

# Show at most the first `preview_chars` characters, and add the ellipsis
# only when the report was actually cut off.
preview_chars = 1000
print(report[:preview_chars] + ("..." if len(report) > preview_chars else ""))

## Conclusion

This exploratory analysis demonstrates the DeFi credit scoring system's ability to:

1. **Process transaction data** from the Aave V2 protocol
2. **Engineer meaningful features** that capture user behavior patterns
3. **Train ML models** to assign credit scores
4. **Generate comprehensive analysis** of scoring results

The system successfully identifies different risk profiles and behavioral patterns among DeFi users, providing a robust foundation for credit assessment in decentralized finance.