# Financial Analysis Model Development
**From Hasif's Workspace**

This notebook develops and evaluates machine learning models for financial analysis: spending segmentation (K-Means), anomaly detection (Isolation Forest), and exploratory time-series analysis of daily spending as groundwork for forecasting.

In [None]:
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# NOTE(review): this silences every warning for the whole session, including
# deprecation notices raised by the libraries used below; consider narrowing it.
warnings.filterwarnings('ignore')

# Set style
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the legacy 'seaborn' name, so use whichever exists.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Data Loading and Preparation

In [None]:
# Load processed data, falling back to synthetic transactions when the CSV is absent
try:
    df = pd.read_csv('../data/processed_transactions.csv')
    data_origin = "Loaded"
except FileNotFoundError:
    print("Processed data not found. Generating synthetic data...")
    import sys
    # Guard the append so re-running this cell does not keep growing sys.path.
    if '../scripts' not in sys.path:
        sys.path.append('../scripts')
    from generate_synthetic_data import generate_transactions
    df = generate_transactions(1000)
    data_origin = "Generated"

# Parse dates once, for both branches, and report the row count either way.
df['date'] = pd.to_datetime(df['date'])
print(f"{data_origin} {len(df)} transactions")

df.head()

## 2. Spending Behavior Segmentation with K-Means

In [None]:
# Prepare features for clustering
def prepare_clustering_features(df):
    """Build the feature matrix used for clustering.

    The result always contains ``amount``. When present, ``category`` and
    ``day_of_week`` are one-hot encoded with the prefixes ``cat`` and ``day``
    and appended in that order. Missing values are replaced with 0. The input
    frame is not modified.
    """
    encoded = df.copy()
    selected = ['amount']

    # (source column, dummy-column prefix) pairs, processed in this order
    for source_col, prefix in (('category', 'cat'), ('day_of_week', 'day')):
        if source_col not in encoded.columns:
            continue
        indicators = pd.get_dummies(encoded[source_col], prefix=prefix)
        encoded = pd.concat([encoded, indicators], axis=1)
        selected += indicators.columns.tolist()

    return encoded[selected].fillna(0)

# Prepare features
# Build the clustering feature matrix from the loaded transactions.
X_clustering = prepare_clustering_features(df)
print(f"Clustering features shape: {X_clustering.shape}")
# Only the first 10 column names are printed; the trailing "..." is always appended.
print(f"Features: {X_clustering.columns.tolist()[:10]}...")  # Show first 10 features

In [None]:
# Determine optimal number of clusters using elbow method
# Features are standardised before being handed to K-Means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clustering)

# Calculate WCSS and silhouette score for each candidate cluster count
k_range = range(2, 11)
wcss = []
silhouette_scores = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    wcss.append(kmeans.inertia_)

    # The silhouette is only evaluated when there are more samples than
    # clusters; otherwise a score of 0 is recorded for this k.
    sil_score = silhouette_score(X_scaled, kmeans.labels_) if len(X_scaled) > k else 0
    silhouette_scores.append(sil_score)

# Plot elbow curve and silhouette scores side by side
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 5))

ax_elbow.plot(k_range, wcss, 'bo-')
ax_elbow.set_title('Elbow Method for Optimal k')
ax_elbow.set_xlabel('Number of Clusters (k)')
ax_elbow.set_ylabel('WCSS')
ax_elbow.grid(True)

ax_silhouette.plot(k_range, silhouette_scores, 'ro-')
ax_silhouette.set_title('Silhouette Score vs Number of Clusters')
ax_silhouette.set_xlabel('Number of Clusters (k)')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.grid(True)

plt.tight_layout()
plt.show()

# Find optimal k: the candidate with the highest silhouette score
best_position = np.argmax(silhouette_scores)
optimal_k = k_range[best_position]
print(f"Optimal number of clusters: {optimal_k}")
print(f"Best silhouette score: {max(silhouette_scores):.3f}")

In [None]:
# Train final K-means model
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = final_kmeans.fit_predict(X_scaled)

# Add cluster labels to dataframe (on a copy, so `df` itself stays untouched)
df_clustered = df.copy()
df_clustered['cluster'] = cluster_labels

# Analyze clusters: size and amount statistics plus the dominant category
cluster_analysis = df_clustered.groupby('cluster').agg({
    'amount': ['count', 'mean', 'sum', 'std'],
    'category': lambda x: x.value_counts().index[0]  # Most common category
}).round(2)

print("Cluster Analysis:")
print(cluster_analysis)

# Save the model
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
os.makedirs('../models', exist_ok=True)
joblib.dump(final_kmeans, '../models/kmeans_model.joblib')
joblib.dump(scaler, '../models/scaler_clustering.joblib')
print("\nK-means model and scaler saved!")

In [None]:
# Visualize clusters: three panels on one row
fig, (ax_sizes, ax_amounts, ax_mix) = plt.subplots(1, 3, figsize=(15, 5))

# Cluster distribution
cluster_counts = df_clustered['cluster'].value_counts().sort_index()
ax_sizes.bar(cluster_counts.index, cluster_counts.values)
ax_sizes.set_title('Cluster Distribution')
ax_sizes.set_xlabel('Cluster')
ax_sizes.set_ylabel('Number of Transactions')

# Amount by cluster
df_clustered.boxplot(column='amount', by='cluster', ax=ax_amounts)
ax_amounts.set_title('Transaction Amount by Cluster')
fig.suptitle('')  # Remove default title

# Category distribution by cluster (each row normalised to sum to 1)
cluster_category = pd.crosstab(df_clustered['cluster'], df_clustered['category'])
row_totals = cluster_category.sum(axis=1)
cluster_category_pct = cluster_category.div(row_totals, axis=0)
sns.heatmap(cluster_category_pct, annot=True, fmt='.2f', cmap='Blues', ax=ax_mix)
ax_mix.set_title('Category Distribution by Cluster')

plt.tight_layout()
plt.show()

## 3. Anomaly Detection with Isolation Forest

In [None]:
# Prepare features for anomaly detection
def prepare_anomaly_features(df):
    """Build the feature matrix used for anomaly detection.

    The result contains ``amount`` and, when ``day_of_week`` is present,
    ``day_of_week_num`` (Monday=0 ... Sunday=6). Rows with a missing value in
    any selected feature are dropped, which includes rows whose day name is
    not one of the seven mapped names. The input frame is not modified and the
    surviving rows keep their original index labels.
    """
    if 'day_of_week' not in df.columns:
        return df[['amount']].dropna()

    weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday')
    weekday_number = {name: position for position, name in enumerate(weekday_names)}

    # Names missing from the mapping become NaN and are removed by dropna() below
    day_numbers = df['day_of_week'].map(weekday_number)
    return df[['amount']].assign(day_of_week_num=day_numbers).dropna()

# Prepare features
# Build the anomaly-detection feature matrix; rows with missing feature values
# are dropped inside prepare_anomaly_features, so it can be shorter than `df`.
X_anomaly = prepare_anomaly_features(df)
print(f"Anomaly detection features shape: {X_anomaly.shape}")
print(f"Features: {X_anomaly.columns.tolist()}")

In [None]:
# Train Isolation Forest model
scaler_anomaly = StandardScaler()
X_anomaly_scaled = scaler_anomaly.fit_transform(X_anomaly)

# Try different contamination values and record how many points each one flags
contamination_values = [0.05, 0.1, 0.15, 0.2]
results = []

for contamination in contamination_values:
    iso_forest = IsolationForest(n_estimators=100, contamination=contamination, random_state=42)

    # fit_predict labels flagged points as -1
    anomaly_labels = iso_forest.fit_predict(X_anomaly_scaled)
    anomaly_count = (anomaly_labels == -1).sum()
    anomaly_percentage = (anomaly_count / len(X_anomaly)) * 100

    results.append(dict(
        contamination=contamination,
        anomaly_count=anomaly_count,
        anomaly_percentage=anomaly_percentage,
    ))

results_df = pd.DataFrame(results)
print("Contamination Analysis:")
print(results_df)

# Choose optimal contamination (e.g., 0.1 for 10%)
# NOTE: the value is fixed by hand here; it is not derived from the sweep above.
optimal_contamination = 0.1
print(f"\nUsing contamination: {optimal_contamination}")

In [None]:
# Train final Isolation Forest model
final_iso_forest = IsolationForest(
    n_estimators=100,
    contamination=optimal_contamination,
    random_state=42
)

# fit_predict labels flagged points as -1; decision_function yields the
# continuous score, where lower values are more anomalous.
anomaly_labels = final_iso_forest.fit_predict(X_anomaly_scaled)
anomaly_scores = final_iso_forest.decision_function(X_anomaly_scaled)

# Add results to dataframe
# X_anomaly may have lost arbitrary rows to dropna(), so select the matching
# transactions by index label. Taking the first len(X_anomaly) rows would pair
# scores with the wrong transactions whenever a dropped row was not at the end.
# (Assumes `df` has unique index labels, as a freshly read CSV does.)
df_anomaly = df.loc[X_anomaly.index].copy()
df_anomaly['anomaly_score'] = anomaly_scores
df_anomaly['is_anomaly'] = (anomaly_labels == -1)

# Analyze anomalies
anomalies = df_anomaly[df_anomaly['is_anomaly']]
print(f"Detected {len(anomalies)} anomalies ({len(anomalies)/len(df_anomaly)*100:.2f}%)")

if len(anomalies) > 0:
    print("\nTop 10 anomalous transactions:")
    top_anomalies = anomalies.nsmallest(10, 'anomaly_score')
    # Only show columns that exist, so a dataset without e.g. 'description'
    # does not abort the cell before the models are saved.
    display_columns = [
        col for col in ['date', 'description', 'amount', 'category', 'anomaly_score']
        if col in top_anomalies.columns
    ]
    print(top_anomalies[display_columns])

# Save the model
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
os.makedirs('../models', exist_ok=True)
joblib.dump(final_iso_forest, '../models/isolation_forest_model.joblib')
joblib.dump(scaler_anomaly, '../models/scaler_anomaly.joblib')
print("\nIsolation Forest model and scaler saved!")

In [None]:
# Visualize anomalies
plt.figure(figsize=(15, 5))

# Anomaly score distribution
plt.subplot(1, 3, 1)
plt.hist(df_anomaly['anomaly_score'], bins=50, alpha=0.7)
# The plotted scores come from decision_function(), which already subtracts
# offset_ from the raw sample scores. The inlier/outlier boundary for these
# values is therefore 0, not offset_.
plt.axvline(x=0, color='red', linestyle='--', label='Threshold')
plt.title('Anomaly Score Distribution')
plt.xlabel('Anomaly Score')
plt.ylabel('Frequency')
plt.legend()

# Amount vs anomaly score (flagged transactions in red, the rest in blue)
plt.subplot(1, 3, 2)
colors = ['red' if x else 'blue' for x in df_anomaly['is_anomaly']]
plt.scatter(df_anomaly['amount'], df_anomaly['anomaly_score'], c=colors, alpha=0.6)
plt.title('Amount vs Anomaly Score')
plt.xlabel('Transaction Amount')
plt.ylabel('Anomaly Score')

# Anomalies over time
plt.subplot(1, 3, 3)
anomaly_dates = anomalies['date']
anomaly_amounts = anomalies['amount']
plt.scatter(anomaly_dates, anomaly_amounts, color='red', alpha=0.7)
plt.title('Anomalous Transactions Over Time')
plt.xlabel('Date')
plt.ylabel('Amount')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 4. Time Series Analysis for Forecasting

In [None]:
# Prepare time series data
df_ts = df.copy()
df_ts['date'] = pd.to_datetime(df_ts['date'])
df_ts = df_ts.sort_values('date')

# Create daily spending series
# Group on the calendar day rather than the raw timestamp. If `date` carries a
# time of day, grouping on it directly yields one row per timestamp, and the
# asfreq('D') reindex below then silently discards every row that does not
# fall exactly on the daily grid. For date-only data this is a no-op.
debits = df_ts[df_ts['transaction_type'] == 'Debit']
daily_spending = debits.groupby(debits['date'].dt.normalize())['amount'].sum()
# Insert the days without any debit as explicit zero-spend days.
daily_spending = daily_spending.asfreq('D', fill_value=0)

print(f"Time series length: {len(daily_spending)} days")
print(f"Date range: {daily_spending.index.min()} to {daily_spending.index.max()}")
print(f"Average daily spending: ${daily_spending.mean():.2f}")

In [None]:
# Time series visualization and basic analysis (2 x 2 grid of panels)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_series, ax_rolling, ax_hist, ax_lag = axes.ravel()

# Original time series
ax_series.plot(daily_spending.index, daily_spending.values)
ax_series.set_title('Daily Spending Time Series')
ax_series.set_xlabel('Date')
ax_series.set_ylabel('Amount ($)')

# Rolling statistics over a 7-day window
weekly_window = daily_spending.rolling(window=7)
rolling_mean = weekly_window.mean()
rolling_std = weekly_window.std()
ax_rolling.plot(daily_spending.index, daily_spending.values, alpha=0.3, label='Original')
ax_rolling.plot(rolling_mean.index, rolling_mean.values, label='7-day MA')
ax_rolling.fill_between(
    rolling_mean.index,
    rolling_mean - rolling_std,
    rolling_mean + rolling_std,
    alpha=0.2,
    label='±1 Std',
)
ax_rolling.set_title('Rolling Statistics')
ax_rolling.set_xlabel('Date')
ax_rolling.set_ylabel('Amount ($)')
ax_rolling.legend()

# Distribution
ax_hist.hist(daily_spending.values, bins=30, alpha=0.7)
ax_hist.set_title('Daily Spending Distribution')
ax_hist.set_xlabel('Amount ($)')
ax_hist.set_ylabel('Frequency')

# Autocorrelation (simple lag plot): each day against the previous day
lag_1 = daily_spending.shift(1)
ax_lag.scatter(lag_1, daily_spending, alpha=0.6)
ax_lag.set_title('Lag-1 Autocorrelation')
ax_lag.set_xlabel('Previous Day Spending')
ax_lag.set_ylabel('Current Day Spending')

plt.tight_layout()
plt.show()

## 5. Model Evaluation and Summary

In [None]:
# Model summary: collect every report line first, then emit them in one print
banner = "=" * 50
anomaly_share = len(anomalies) / len(df_anomaly) * 100

summary_lines = [
    banner,
    "MODEL DEVELOPMENT SUMMARY",
    banner,
    "\n1. SPENDING SEGMENTATION (K-Means)",
    f"   - Optimal clusters: {optimal_k}",
    f"   - Silhouette score: {max(silhouette_scores):.3f}",
    f"   - Features used: {len(X_clustering.columns)}",
    "\n2. ANOMALY DETECTION (Isolation Forest)",
    f"   - Contamination rate: {optimal_contamination}",
    f"   - Anomalies detected: {len(anomalies)} ({anomaly_share:.2f}%)",
    f"   - Features used: {len(X_anomaly.columns)}",
    "\n3. TIME SERIES ANALYSIS",
    f"   - Time series length: {len(daily_spending)} days",
    f"   - Average daily spending: ${daily_spending.mean():.2f}",
    f"   - Spending volatility (std): ${daily_spending.std():.2f}",
    "\n4. SAVED MODELS",
    "   - K-means model: ../models/kmeans_model.joblib",
    "   - Clustering scaler: ../models/scaler_clustering.joblib",
    "   - Isolation Forest: ../models/isolation_forest_model.joblib",
    "   - Anomaly scaler: ../models/scaler_anomaly.joblib",
    "\n" + banner,
    "Models ready for deployment!",
    banner,
]

print("\n".join(summary_lines))