In [None]:
# Random Forest Regressor for DBT Transfer Prediction
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
# The seaborn-derived style sheet is called 'seaborn-v0_8' from matplotlib 3.6
# onwards and plain 'seaborn' in earlier releases. Pick whichever name this
# installation provides so plt.style.use() does not raise OSError on older setups.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")


In [None]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../clean_dbt_district_wise.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("\nDataset Statistics:")
print(df.describe())


In [None]:
# Data Preprocessing
# Report how many values are missing in each column
print("Missing values:")
print(df.isnull().sum())

# Discard every row that contains at least one missing value
df = df.dropna()

# Model inputs: state, district, year, number of transactions
# Model output: total_dbt_transfer

# One label encoder per categorical column, kept around so the integer
# codes can be mapped back to names later if needed
le_state = LabelEncoder()
le_district = LabelEncoder()

# Build the encoded frame as a copy of df with three extra columns.
# 'year' is simply a copy of the existing start_year column.
df_encoded = df.assign(
    state_encoded=le_state.fit_transform(df['state_name']),
    district_encoded=le_district.fit_transform(df['district_name']),
    year=df['start_year'],
)

# Feature matrix and target vector
feature_columns = ['state_encoded', 'district_encoded', 'year', 'no_of_dbt_transactions']
X = df_encoded[feature_columns]
y = df_encoded['total_dbt_transfer']

print(f"\nFeature columns: {feature_columns}")
print(f"Target variable: total_dbt_transfer")
print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Summary statistics of the model inputs
print("\nFeature Statistics:")
print(X.describe())


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Row counts of each split
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Min/max of the target in each split, to spot a badly unbalanced split
for split_label, split_target in (("Training", y_train), ("Test", y_test)):
    print(f"{split_label} target range: {split_target.min():,.0f} to {split_target.max():,.0f}")


In [None]:
# Build Random Forest Regressor Model
# Hyperparameters gathered in one mapping so they are easy to scan and tweak
rf_params = {
    'n_estimators': 100,      # number of trees in the forest
    'max_depth': 20,          # cap on how deep each tree may grow
    'min_samples_split': 5,   # a node needs at least this many rows to be split
    'min_samples_leaf': 2,    # every leaf keeps at least this many rows
    'random_state': 42,       # fixed seed so repeated runs match
    'n_jobs': -1,             # use all available cores
}
rf_model = RandomForestRegressor(**rf_params)

# Fit on the training split only
print("Training Random Forest Regressor...")
rf_model.fit(X_train, y_train)

# Predict on both splits so fit quality can be compared between them
y_train_pred, y_test_pred = (rf_model.predict(split) for split in (X_train, X_test))

print("Model training completed!")
print(f"Number of trees: {rf_model.n_estimators}")
print(f"Feature importance shape: {rf_model.feature_importances_.shape}")


In [None]:
# 1. Feature Importance Plot
# Importance scores from the fitted forest, with display names listed in the
# same order as feature_columns
feature_importance = rf_model.feature_importances_
feature_names = ['State', 'District', 'Year', 'Number of Transactions']

# One colour per bar
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']

# Draw the bar chart on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(range(len(feature_importance)), feature_importance, color=bar_colors)

# Titles, axis labels and tick names
ax.set_title('Feature Importance in Random Forest Model', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Importance Score', fontsize=12)
ax.set_xticks(range(len(feature_names)))
ax.set_xticklabels(feature_names, rotation=45)

# Write each score just above its bar
for bar, importance in zip(bars, feature_importance):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.001
    ax.text(label_x, label_y, f'{importance:.3f}',
            ha='center', va='bottom', fontweight='bold')

# Light horizontal grid lines for easier reading
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Same scores as text, one feature per line
print("Feature Importance Scores:")
print("\n".join(
    f"{name}: {importance:.4f}"
    for name, importance in zip(feature_names, feature_importance)
))


In [None]:
# 2. Actual vs Predicted Scatterplot
# R² per split. r2_score comes from the notebook's import cell, so it is not
# re-imported here.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# One panel per split: (title prefix, actual values, predictions, R², colour)
scatter_panels = [
    ('Training Set', y_train, y_train_pred, train_r2, '#FF6B6B'),
    ('Test Set', y_test, y_test_pred, test_r2, '#4ECDC4'),
]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
for ax, (split_name, actual, predicted, split_r2, colour) in zip((ax1, ax2), scatter_panels):
    ax.scatter(actual, predicted, alpha=0.6, color=colour, s=50)

    # Dashed diagonal marks perfect prediction (predicted == actual)
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)

    ax.set_xlabel('Actual Total DBT Transfer', fontsize=12)
    ax.set_ylabel('Predicted Total DBT Transfer', fontsize=12)
    ax.set_title(f'{split_name}: Actual vs Predicted', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

    # Anchor the annotation by its top edge so the box stays inside the axes
    ax.text(0.05, 0.95, f'R² = {split_r2:.3f}', transform=ax.transAxes, va='top',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8), fontsize=12)

plt.tight_layout()
plt.show()

print(f"Training R² Score: {train_r2:.4f}")
print(f"Test R² Score: {test_r2:.4f}")


In [None]:
# 3. Residual Plot
# Residual = actual minus predicted, computed separately for each split
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

# Spread of the residuals in each split
train_residual_std = np.std(train_residuals)
test_residual_std = np.std(test_residuals)

# Two side-by-side panels, described as
# (axes, title prefix, predictions, residuals, residual std, colour)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
residual_panels = (
    (ax1, 'Training Set', y_train_pred, train_residuals, train_residual_std, '#FF6B6B'),
    (ax2, 'Test Set', y_test_pred, test_residuals, test_residual_std, '#4ECDC4'),
)

for ax, split_name, predicted, residuals, residual_std, colour in residual_panels:
    ax.scatter(predicted, residuals, alpha=0.6, color=colour, s=50)
    # Zero line: points above it were under-predicted, points below over-predicted
    ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
    ax.set_xlabel('Predicted Total DBT Transfer', fontsize=12)
    ax.set_ylabel('Residuals (Actual - Predicted)', fontsize=12)
    ax.set_title(f'{split_name}: Residual Plot', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.text(0.05, 0.95, f'Residual Std: {residual_std:.0f}', transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8), fontsize=12)

plt.tight_layout()
plt.show()

print(f"Training Residual Standard Deviation: {train_residual_std:.2f}")
print(f"Test Residual Standard Deviation: {test_residual_std:.2f}")

# Numeric summary of the residuals: first the means, then the ranges
print(f"\nResidual Analysis:")
residuals_by_split = (("Training", train_residuals), ("Test", test_residuals))
for split_name, residuals in residuals_by_split:
    print(f"{split_name} residuals mean: {np.mean(residuals):.2f}")
for split_name, residuals in residuals_by_split:
    print(f"{split_name} residuals range: {np.min(residuals):.2f} to {np.max(residuals):.2f}")


In [None]:
# 4. Model Performance Evaluation
# Calculate comprehensive performance metrics.
# The metric functions come from the notebook's import cell, so they are not
# re-imported here.

# Training set metrics
train_mse = mean_squared_error(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Test set metrics
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Create performance comparison table
performance_data = {
    'Metric': ['R² Score', 'RMSE', 'MAE', 'MSE'],
    'Training': [train_r2, train_rmse, train_mae, train_mse],
    'Test': [test_r2, test_rmse, test_mae, test_mse]
}

performance_df = pd.DataFrame(performance_data)
print("Model Performance Metrics:")
print(performance_df.round(4))

# Visual comparison of metrics.
# The four metrics live on very different scales (R² is at most 1, MSE is in
# squared target units), so each metric gets its own panel with its own y-axis.
# On a single shared axis the MSE bars would flatten everything else.
fig, metric_axes = plt.subplots(1, len(performance_data['Metric']), figsize=(16, 5))
metric_rows = zip(metric_axes, performance_data['Metric'],
                  performance_data['Training'], performance_data['Test'])

for ax, metric_name, train_value, test_value in metric_rows:
    bars = ax.bar(['Training', 'Test'], [train_value, test_value],
                  color=['#FF6B6B', '#4ECDC4'], alpha=0.8)
    ax.set_title(metric_name, fontsize=12, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12)
    ax.grid(True, alpha=0.3)

    # Add value labels at the tip of each bar; for a negative value (R² can be
    # below zero) the label hangs under the bar instead of overlapping it
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:,.3f}', ha='center',
                va='bottom' if height >= 0 else 'top', fontsize=10)

fig.suptitle('Model Performance Comparison', fontsize=14, fontweight='bold')
# Leave room at the top for the figure-level title
fig.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# Model interpretation
print(f"\nModel Interpretation:")
print(f"• The model explains {test_r2:.1%} of the variance in total DBT transfers")
print(f"• Average prediction error: ₹{test_mae:,.0f}")
print(f"• Root mean square error: ₹{test_rmse:,.0f}")
# Qualitative rating based on the test-set R² thresholds 0.8 / 0.6 / 0.4
if test_r2 > 0.8:
    print("• Excellent model performance!")
elif test_r2 > 0.6:
    print("• Good model performance!")
elif test_r2 > 0.4:
    print("• Moderate model performance")
else:
    print("• Model needs improvement")


In [None]:
# 5. Additional Analysis: Prediction Examples
# Show some example predictions
print("Sample Predictions:")
print("="*50)

# Draw up to 10 distinct rows from the test set.
# A seeded generator makes the same rows appear on every run, and capping the
# sample size at len(y_test) avoids the ValueError that sampling without
# replacement raises when fewer than 10 test rows exist.
sample_rng = np.random.default_rng(42)
sample_size = min(10, len(y_test))
sample_indices = sample_rng.choice(len(y_test), size=sample_size, replace=False)

sample_actual = y_test.iloc[sample_indices].to_numpy()
sample_predicted = y_test_pred[sample_indices]
sample_error = sample_actual - sample_predicted

# Percentage error is undefined when the actual value is 0; those entries are
# left as NaN instead of dividing by zero and producing inf.
sample_error_pct = np.full(sample_error.shape, np.nan)
np.divide(sample_error, sample_actual, out=sample_error_pct, where=sample_actual != 0)
sample_error_pct *= 100

sample_predictions = pd.DataFrame({
    'Actual': sample_actual,
    'Predicted': sample_predicted,
    'Error': sample_error,
    'Error_Percentage': sample_error_pct
})

print(sample_predictions.round(2))

# Feature importance analysis: numbered list with each score also shown as a percentage
print(f"\nFeature Importance Analysis:")
print("="*40)
for rank, (feature, importance) in enumerate(zip(feature_names, feature_importance), start=1):
    print(f"{rank}. {feature}: {importance:.4f} ({importance*100:.1f}%)")

# Model summary
print(f"\nModel Summary:")
print("="*20)
print(f"• Algorithm: Random Forest Regressor")
print(f"• Number of trees: {rf_model.n_estimators}")
print(f"• Max depth: {rf_model.max_depth}")
print(f"• Features used: {len(feature_columns)}")
print(f"• Training samples: {len(X_train)}")
print(f"• Test samples: {len(X_test)}")
# Qualitative rating from the test-set R² using thresholds 0.8 / 0.6 / 0.4
print(f"• Model performance: {'Excellent' if test_r2 > 0.8 else 'Good' if test_r2 > 0.6 else 'Moderate' if test_r2 > 0.4 else 'Needs Improvement'}")
