In [14]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [15]:
df = pd.read_csv('PAS+crime+extra_final.csv')

In [16]:
df.shape[0]

75912

In [17]:
df.dropna(subset=['Q60'], inplace=True)
df = df[df['Q60'] < 95]
print(df.shape[0])

73124


In [18]:
# Define features and target
X = df.drop(columns='Q60')
y = df['Q60']

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameters for XGBoost
params = {
    'objective': 'reg:squarederror',  # Use squared error for regression task
    'random_state': 42
}

# Convert training data into DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)

# Convert testing data into DMatrix format for XGBoost
dtest = xgb.DMatrix(X_test, label=y_test)

# Number of boosting rounds
num_boost_round = 100

# Train the model and print the progress of boosting rounds
evals_result = {}
model = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                  evals=[(dtrain, 'train'), (dtest, 'test')],
                  early_stopping_rounds=10, verbose_eval=10, evals_result=evals_result)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Calculate and print RMSE (Root Mean Squared Error) on the test set
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE on test set:", rmse)


[0]	train-rmse:0.71611	test-rmse:0.72859
[10]	train-rmse:0.53781	test-rmse:0.57515
[20]	train-rmse:0.51938	test-rmse:0.57224
[30]	train-rmse:0.50881	test-rmse:0.57165
[40]	train-rmse:0.49697	test-rmse:0.57193
[45]	train-rmse:0.49280	test-rmse:0.57201
RMSE on test set: 0.5720122701371951


In [20]:
# Compute baseline RMSE
baseline_rmse = rmse

# Initialize an empty array to store permutation importances
perm_importances = np.zeros(X_test.shape[1])

# Iterate over each feature
for i in tqdm(range(X_test.shape[1]), desc="Calculating Permutation Importance"):
    # Make a copy of the test data
    X_test_permuted = X_test.copy()
    # Permute the values of the current feature
    X_test_permuted.iloc[:, i] = np.random.permutation(X_test_permuted.iloc[:, i])
    # Make predictions on the permuted data
    y_pred_permuted = model.predict(xgb.DMatrix(X_test_permuted))
    # Compute RMSE on the permuted data
    permuted_rmse = mean_squared_error(y_test, y_pred_permuted, squared=False)
    # Compute permutation importance for the current feature
    perm_importance = baseline_rmse - permuted_rmse
    # Store the permutation importance
    perm_importances[i] = perm_importance

# Sort the permutation importances
sorted_idx = np.argsort(perm_importances)[::-1]

# Print the top 20 features by permutation importance
print("Top 20 Feature Importance:")
for i in sorted_idx[:20]:
    print(f"{X.columns[i]}: {perm_importances[i]:.4f}")


Calculating Permutation Importance: 100%|█████| 670/670 [05:03<00:00,  2.21it/s]

Top 20 Feature Importance:
WT_Q63_B: 0.0003
SQ79DB: 0.0003
NQ135BDE: 0.0002
WT_Q58: 0.0002
XQ122A: 0.0002
NQCV28: 0.0002
Violence and sexual offences: 0.0002
XQ128A2B: 0.0001
WT_Q62: 0.0001
WT_Q64_B: 0.0001
NNQ27E: 0.0001
Q141: 0.0001
Borough: 0.0001
Vehicle crime: 0.0001
WT_Q44_E: 0.0001
NQCV7: 0.0001
Q62TI: 0.0001
WT_Q45_B: 0.0001
SQ132K: 0.0001
NQCV1: 0.0001



