### The changes
- Apply Target Encoding to the categorical columns
- Increase the data size from 3000 to 30000
- Apply the Random Forest algorithm to fill in NaNs with suitable values
- Apply SHAP (with XGBoost) and Permutation Importance for feature ranking

In [None]:
!pip install xgboost
!pip install category_encoders
!pip install shap

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder
import shap
from sklearn.inspection import permutation_importance

# Load the preprocessed dataset (UTF-8 CSV, path relative to the notebook's working directory)
data = pd.read_csv(
    './data/Coordinates_Preprocessed.csv',
    encoding='utf-8',
)


Collecting category_encoders
  Obtaining dependency information for category_encoders from https://files.pythonhosted.org/packages/1f/e2/495811f12b2e90753fff0e42a07adb0370a725de17cc23a579ac9d3ca67c/category_encoders-2.6.2-py2.py3-none-any.whl.metadata
  Downloading category_encoders-2.6.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting importlib-resources (from category_encoders)
  Downloading importlib_resources-5.12.0-py3-none-any.whl (36 kB)
Collecting zipp>=3.1.0 (from importlib-resources->category_encoders)
  Downloading zipp-3.15.0-py3-none-any.whl (6.8 kB)
Downloading category_encoders-2.6.2-py2.py3-none-any.whl (81 kB)
   ---------------------------------------- 81.8/81.8 kB 4.5 MB/s eta 0:00:00
Installing collected packages: zipp, importlib-resources, category_encoders
  Attempting uninstall: zipp
    Found existing installation: zipp 2.2.0
    Uninstalling zipp-2.2.0:
      Successfully uninstalled zipp-2.2.0
Successfully installed category_encoders-2.6.2 importlib-resource

DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytest-astropy 0.8.0 requires pytest-cov>=2.0, which is not installed.
pytest-astropy 0.8.0 requires pytest-filter-subpackage>=0.1, which is not installed.
spyder 4.0.1 requires pyqt5<5.13; python_version >= "3", which is not installed.
spyder 4.0.1 requires pyqtwebengine<5.13; python_version >= "3", which is not installed.


Collecting shap
  Obtaining dependency information for shap from https://files.pythonhosted.org/packages/84/9e/88ca34c2c79cd673df32161918707f8257e41fa0d70bcbb8c7b6026c36db/shap-0.42.1-cp37-cp37m-win_amd64.whl.metadata
  Downloading shap-0.42.1-cp37-cp37m-win_amd64.whl.metadata (24 kB)
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Downloading shap-0.42.1-cp37-cp37m-win_amd64.whl (462 kB)
   ---------------------------------------- 462.1/462.1 kB 7.3 MB/s eta 0:00:00
Installing collected packages: slicer, shap
Successfully installed shap-0.42.1 slicer-0.0.7


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063


ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Print column names, dtypes and non-null counts for the loaded frame
data.info()
# Preview the first rows only instead of rendering the whole frame
data.head()

In [4]:
# Separate the prediction target from the candidate features
y = data['JS_Price']
X = data.drop(columns=['JS_Price'])

# Columns handed to the target encoder
categorical_columns = ['Region_Name', 'Building_Use', 'YearMonth']

# Fit the encoder on (X, y) and encode the listed columns in a single step.
# NOTE(review): the encoder is fitted on every row together with its target,
# so the encoded columns carry target information — confirm this is acceptable
# for the importance rankings computed later.
encoder = TargetEncoder(cols=categorical_columns)
X_encoded = encoder.fit_transform(X, y)

# Normalize your data if needed (adjust normalization logic as per your requirements)
def normalize(data):
    """Min-max scale ``data`` so that each column spans the range [0, 1].

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Numeric values to rescale. A DataFrame is scaled column by column,
        a Series as a whole.

    Returns
    -------
    Same type as ``data``
        ``(data - min) / (max - min)``. A column whose values are all equal
        has zero range and is mapped to 0 instead of NaN.
    """
    lower = data.min()
    value_range = data.max() - lower
    # A zero range would turn the division into 0/0 and fill the column with
    # NaN, which estimators such as LinearRegression reject. Dividing by 1
    # instead leaves such a column at 0.
    if isinstance(value_range, pd.Series):
        value_range = value_range.where(value_range != 0, 1)
    elif value_range == 0:
        value_range = 1
    return (data - lower) / value_range

# Min-max scale the encoded features; X_normalized is the input to every model fitted below
X_normalized = normalize(X_encoded)

In [5]:


# 1. SHAP with XGBoost
# Fit a gradient-boosted regressor on the scaled features
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_normalized, y)

# Wrap the fitted model in a SHAP explainer and compute SHAP values for the same data
explainer_shap = shap.Explainer(xgb_model)
shap_values = explainer_shap.shap_values(X_normalized)

# Score each feature by its mean absolute SHAP value (averaged over axis 0),
# then order the features from most to least influential
shap_scores_abs = np.mean(np.abs(shap_values), axis=0)
shap_ranking = pd.Series(
    shap_scores_abs,
    index=X_encoded.columns,
).sort_values(ascending=False)





# 2. Permutation Importance
# Permute each feature 10 times against the fitted XGBoost model.
# The scores are evaluated on the same data the model was fitted on.
result_permutation = permutation_importance(
    xgb_model,
    X_normalized,
    y,
    n_repeats=10,
    random_state=42,
)

# Keep the mean importance per feature and order from highest to lowest
permutation_importance_scores = result_permutation.importances_mean
permutation_ranking = pd.Series(
    permutation_importance_scores,
    index=X_encoded.columns,
).sort_values(ascending=False)






# 3. Mutual Information
# Estimate the mutual information between every feature and the target.
# The estimator is randomised, so the seed is fixed (same value as the other
# models in this cell) to make the scores reproducible across runs.
mi_scores = mutual_info_regression(X_normalized, y, random_state=42)

# Order the features from highest to lowest mutual information
mi_ranking = pd.Series(mi_scores, index=X_encoded.columns).sort_values(ascending=False)






# 4. Linear Regression
# Fit an ordinary least-squares model on all scaled features
linear_model = LinearRegression().fit(X_normalized, y)

# Rank features by the absolute value of their fitted coefficient
linear_regression_coefficients = (
    pd.Series(linear_model.coef_, index=X_encoded.columns)
    .abs()
    .sort_values(ascending=False)
)





# 5. Random Forest
# Fit a 100-tree random forest (fixed seed) on all scaled features
rf_model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_normalized, y)

# Rank features by the forest's built-in feature importances
rf_feature_importances = (
    pd.Series(rf_model_rf.feature_importances_, index=X_encoded.columns)
    .sort_values(ascending=False)
)


In [6]:



def min_max_scaler_mi_linear_rf_shap_permutation(x):
    """Min-max scale importance scores to the range [0, 1].

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Importance scores to rescale (a DataFrame is scaled column by column).

    Returns
    -------
    Same type as ``x``
        ``(x - min) / (max - min)``, keeping the index of ``x``. When all
        scores are equal the range is zero and the result is 0 everywhere
        rather than NaN, so one flat metric cannot turn a weighted sum of
        rankings into NaN.
    """
    lowest = x.min()
    spread = x.max() - lowest
    # Guard against 0/0: substitute a unit range where max == min.
    if isinstance(spread, pd.Series):
        spread = spread.where(spread != 0, 1)
    elif spread == 0:
        spread = 1
    return (x - lowest) / spread

# Bring every importance metric onto a common [0, 1] scale so they can be mixed:
# mutual information, |linear coefficients|, random forest, SHAP, permutation importance
(
    normalized_mi_ranking,
    normalized_linear_ranking,
    normalized_rf_ranking,
    normalized_shap_ranking,
    normalized_permutation_ranking,
) = map(
    min_max_scaler_mi_linear_rf_shap_permutation,
    (
        mi_ranking,
        linear_regression_coefficients.abs(),
        rf_feature_importances,
        shap_ranking,
        permutation_ranking,
    ),
)

# Relative weight given to each metric in the combined score
weight_mi = 0.2
weight_linear = 0.3
weight_rf = 0.3
weight_shap = 0.1
weight_permutation = 0.1

# Accumulate the weighted average one metric at a time; pandas aligns the
# Series on their feature-name index at every addition
combined_ranking = weight_mi * normalized_mi_ranking
combined_ranking = combined_ranking + weight_linear * normalized_linear_ranking
combined_ranking = combined_ranking + weight_rf * normalized_rf_ranking
combined_ranking = combined_ranking + weight_shap * normalized_shap_ranking
combined_ranking = combined_ranking + weight_permutation * normalized_permutation_ranking

# Turn the combined scores into a two-column table: feature name and score
combined_ranking_df = pd.DataFrame(
    {
        'Feature': combined_ranking.index,
        'Combined_Ranking': combined_ranking.values,
    }
)

# Order the table from the highest to the lowest combined score
sorted_features_combined = combined_ranking_df.sort_values(
    by='Combined_Ranking',
    ascending=False,
)

# Keep the sorted table under the name Rankings_Features
Rankings_Features = sorted_features_combined

# Show the final feature ranking as the cell output
Rankings_Features


Unnamed: 0,Feature,Combined_Ranking
11,Sell_Price,1.0
6,JS_BA,0.533719
0,Building_Age,0.174091
7,LC_index,0.111755
9,Region_Name,0.1061
8,Population,0.071044
14,Shortest_Distance_to_Subway,0.050947
4,HSP_index,0.048754
13,Shortest_Distance_to_School,0.048685
3,Crime_Rates,0.04704
