In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error

# Load the data
print("Loading data...")
df = pd.read_csv('project_data/PAS+crime+extra_final.csv')

# Keep only rows with a usable target: 'Q60' must be present and below 95.
# NOTE(review): values >= 95 are presumably non-answer codes - confirm
# against the survey codebook.
print("Cleaning data...")
df = df.dropna(subset=['Q60'])
df = df[df['Q60'] < 95]

# Define features and target.
# X still contains the 'Borough' column; the per-borough models drop it
# themselves, so X's column positions do NOT match the model inputs.
print("Defining features and target...")
X = df.drop(columns='Q60')
y = df['Q60']

# Get unique boroughs.
# Missing borough labels are skipped: NaN never compares equal to itself,
# so `df['Borough'] == nan` would select zero rows and the train/test
# split for that "borough" would fail.
print("Identifying unique boroughs...")
boroughs = df['Borough'].dropna().unique()
print(f"Found {len(boroughs)} unique boroughs.")

# Per-borough results, all keyed by borough name
models = {}          # trained xgboost Booster
evals_results = {}   # evaluation data kept for later analysis
rmses = {}           # RMSE on the held-out test split

# Booster parameters are the same for every borough, so build them once
# instead of on every loop iteration.
params = {
    'objective': 'reg:squarederror',  # Use squared error for regression task
    'random_state': 42
}

# Iterate over each borough and train a model
for borough in boroughs:
    print(f"Processing borough: {borough}")

    # Filter the data for the current borough; 'Borough' is constant within
    # the subset, so it is dropped from the features together with the target
    borough_data = df[df['Borough'] == borough]
    X_borough = borough_data.drop(columns=['Q60', 'Borough'])
    y_borough = borough_data['Q60']

    # Split the data into training and testing sets (80/20, fixed seed)
    print(f"Splitting data for borough: {borough}")
    X_train, X_test, y_train, y_test = train_test_split(
        X_borough, y_borough, test_size=0.2, random_state=42
    )

    # Convert data into DMatrix format for XGBoost
    print(f"Converting data to DMatrix format for borough: {borough}")
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train the model.
    # NOTE(review): early stopping watches the 'test' set, which is the same
    # split the RMSE below is reported on, so that RMSE is optimistic.
    # Consider carving a validation split out of the training data.
    print(f"Training model for borough: {borough}")
    evals_result = {}
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=100,
        evals=[(dtrain, 'train'), (dtest, 'test')],
        early_stopping_rounds=10,
        verbose_eval=False,
        evals_result=evals_result,
    )

    # Make predictions on the test set.
    # NOTE(review): xgb.train returns the model from the last boosting round,
    # not from best_iteration - confirm that predicting with all rounds is
    # intended.
    print(f"Making predictions for borough: {borough}")
    y_pred = model.predict(dtest)

    # Calculate RMSE
    print(f"Calculating RMSE for borough: {borough}")
    rmse = root_mean_squared_error(y_test, y_pred)

    # Store the model, the held-out split, and the RMSE. The per-round metric
    # history for the 'train' and 'test' evaluation sets is kept under
    # 'history' instead of being thrown away after training.
    models[borough] = model
    evals_results[borough] = {
        'test': {'X': X_test, 'y': y_test},
        'history': evals_result,
    }
    rmses[borough] = rmse

    print(f"RMSE for {borough}: {rmse}")

# Print all RMSes
print("RMSE for all boroughs:")
for borough, rmse in rmses.items():
    print(f"{borough}: {rmse}")

# Dictionary to store permutation importances for each borough.
# Each value is an array aligned with the columns of that borough's X_test.
perm_importances_dict = {}

# Seeded generator so the shuffles (and therefore the rankings) are
# reproducible, in line with the fixed seed 42 used elsewhere in this script.
rng = np.random.default_rng(42)

# Iterate over each borough and calculate permutation importances
for borough in boroughs:
    print(f"Calculating permutation importances for borough: {borough}")
    model = models[borough]
    X_test = evals_results[borough]['test']['X']
    y_test = evals_results[borough]['test']['y']
    baseline_rmse = rmses[borough]

    # Names must come from X_test itself: the global X still contains the
    # 'Borough' column, so its column positions are shifted relative to the
    # features the model was trained on.
    feature_names = X_test.columns
    n_features = len(feature_names)

    # One importance value per feature, in X_test column order
    perm_importances = np.zeros(n_features)

    for i in tqdm(range(n_features), desc=f"Calculating Permutation Importance for {borough}"):
        # Shuffle a single column on a copy so the stored test set is untouched
        X_test_permuted = X_test.copy()
        X_test_permuted.iloc[:, i] = rng.permutation(X_test.iloc[:, i].to_numpy())

        # Score the model on the data with this one feature scrambled
        y_pred_permuted = model.predict(xgb.DMatrix(X_test_permuted))
        permuted_rmse = root_mean_squared_error(y_test, y_pred_permuted)

        # Importance is the INCREASE in error caused by scrambling the
        # feature: positive means the model relied on it, ~0 means it did not.
        perm_importances[i] = permuted_rmse - baseline_rmse

    # Most important (largest error increase) first
    sorted_idx = np.argsort(perm_importances)[::-1]

    # Store the permutation importances
    perm_importances_dict[borough] = perm_importances

    # Print the top 20 features by permutation importance for the current borough
    print(f"Top 20 Feature Importance for {borough}:")
    for i in sorted_idx[:20]:
        print(f"{feature_names[i]}: {perm_importances[i]:.4f}")

print("Debugging complete.")
