### The changes
- Apply Target Encoding to the categorical columns
- Increase the data size from 3000 to 30000
- Use a Random Forest model to fill in NaNs with suitable values
- Apply SHAP (with XGBoost) and Permutation Importance to rank the features

In [1]:
!pip install category_encoders
!pip install shap
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import mutual_info_regression, SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder
import shap
from sklearn.inspection import permutation_importance

# Load the raw sample into `data`; the file is decoded as EUC-KR
# (the Region_Name / Building_Use values shown in the outputs are Korean text).
# NOTE(review): the absolute '/content/...' path presumably only resolves on a
# Colab runtime — confirm before running elsewhere.
data = pd.read_csv('/content/Before_Encoding_5000.csv', encoding='euc-kr')




Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
# Parse the YearMonth codes (format %Y%m) into datetime values, then show the frame
data = data.assign(
    YearMonth=pd.to_datetime(data['YearMonth'], format='%Y%m'),
)
data


Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,Building_Use,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ
0,14,22500,84.70,433809,4.1,90.4,95.3,91.0,107.634598,91.7,39900.00,0.967620,1.25,강동구,아파트,2017-03-01,218.546661,342.320637,2080.047982
1,0,16000,17.45,662019,3.4,98.0,101.1,99.1,112.039216,131.7,18000.00,0.834577,1.25,송파구,오피스텔,2019-12-01,365.167081,428.396368,2078.432085
2,30,42000,108.47,553927,2.7,78.0,84.3,81.7,120.439963,74.7,135000.00,1.537764,2.50,강남구,아파트,2013-10-01,698.127221,334.807784,1514.222790
3,4,48000,84.95,674828,2.9,72.9,80.0,77.1,114.366829,79.4,91646.15,1.145652,3.25,송파구,아파트,2011-10-01,536.947700,24.176463,3817.518298
4,0,70000,84.99,302243,2.1,109.0,109.0,110.3,87.677816,167.9,108000.00,0.725826,2.50,서대문구,아파트,2022-08-01,1173.890039,335.949816,1165.416466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,133000,84.86,530126,3.4,102.6,101.3,100.8,128.819696,158.1,193333.33,1.352069,0.50,강남구,아파트,2020-11-01,1341.605321,298.254673,3567.318940
4996,10,49000,84.91,427540,3.1,74.5,81.5,78.5,102.461258,74.2,81850.00,1.221012,3.00,서초구,아파트,2012-07-01,440.715060,269.506677,1053.568719
4997,0,23000,30.00,425539,4.5,93.9,98.0,94.1,94.786910,106.1,27038.00,0.907344,1.50,강동구,연립다세대,2018-03-01,364.897534,391.843327,1835.115994
4998,0,71000,84.65,571614,3.8,106.9,103.8,103.4,117.233889,177.7,110000.00,0.671993,0.50,강서구,아파트,2021-06-01,0.000000,809.669099,2549.064034


In [3]:
# Sort chronologically, then by region and building use.
# All three keys are sorted in ascending order (the sort_values default).
data = data.sort_values(by=['YearMonth', 'Region_Name', 'Building_Use'])

data

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,Building_Use,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ
766,13,43000,113.00,557563,3.800000,72.000000,78.200000,74.200000,114.344505,81.200000,54000.00,1.179391,2.75,강남구,아파트,2011-01-01,522.643644,404.136438,2717.110673
864,12,17500,59.96,486546,3.800000,72.000000,78.200000,74.200000,114.538757,81.200000,32700.00,1.198929,2.75,강동구,아파트,2011-01-01,818.829456,297.921220,1639.799834
1893,0,22000,84.83,486546,3.800000,72.000000,78.200000,74.200000,112.364655,81.200000,37259.50,1.196367,2.75,강동구,아파트,2011-01-01,747.428903,293.062666,5348.572633
4938,25,21000,84.91,486546,3.800000,72.000000,78.200000,74.200000,114.367050,81.200000,73950.00,1.136842,2.75,강동구,아파트,2011-01-01,288.600945,278.705356,3964.052557
3228,8,20500,84.71,339529,3.800000,72.000000,78.200000,74.200000,114.253929,81.200000,36811.54,1.163344,2.75,강북구,아파트,2011-01-01,813.256037,123.423393,423.404193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2560,14,22000,61.12,463298,3.396246,102.655098,101.286736,100.780952,128.892242,157.060776,30000.00,0.876201,3.50,은평구,연립다세대,2023-08-01,403.928032,242.558160,1248.315088
2196,6,75000,72.97,121441,3.399868,102.644089,101.288506,100.840019,128.765762,158.125519,149800.00,0.779714,3.50,중구,아파트,2023-08-01,581.172513,104.769954,842.570488
4606,15,54000,84.90,121441,4.100200,103.102043,101.486321,101.099960,133.370956,161.393890,87000.00,1.162222,3.50,중구,아파트,2023-08-01,604.059863,457.714066,1629.373539
51,21,19500,57.69,384272,3.399715,102.641594,101.314041,100.785660,128.765762,158.030884,27500.00,0.790527,3.50,중랑구,연립다세대,2023-08-01,1261.635711,206.651345,1741.126762


In [5]:
# Remove the Building_Use column from the working frame and show the result
columns_to_drop = ['Building_Use']
data = data.drop(columns_to_drop, axis=1)
data

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ
766,13,43000,113.00,557563,3.800000,72.000000,78.200000,74.200000,114.344505,81.200000,54000.00,1.179391,2.75,강남구,2011-01-01,522.643644,404.136438,2717.110673
864,12,17500,59.96,486546,3.800000,72.000000,78.200000,74.200000,114.538757,81.200000,32700.00,1.198929,2.75,강동구,2011-01-01,818.829456,297.921220,1639.799834
1893,0,22000,84.83,486546,3.800000,72.000000,78.200000,74.200000,112.364655,81.200000,37259.50,1.196367,2.75,강동구,2011-01-01,747.428903,293.062666,5348.572633
4938,25,21000,84.91,486546,3.800000,72.000000,78.200000,74.200000,114.367050,81.200000,73950.00,1.136842,2.75,강동구,2011-01-01,288.600945,278.705356,3964.052557
3228,8,20500,84.71,339529,3.800000,72.000000,78.200000,74.200000,114.253929,81.200000,36811.54,1.163344,2.75,강북구,2011-01-01,813.256037,123.423393,423.404193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2560,14,22000,61.12,463298,3.396246,102.655098,101.286736,100.780952,128.892242,157.060776,30000.00,0.876201,3.50,은평구,2023-08-01,403.928032,242.558160,1248.315088
2196,6,75000,72.97,121441,3.399868,102.644089,101.288506,100.840019,128.765762,158.125519,149800.00,0.779714,3.50,중구,2023-08-01,581.172513,104.769954,842.570488
4606,15,54000,84.90,121441,4.100200,103.102043,101.486321,101.099960,133.370956,161.393890,87000.00,1.162222,3.50,중구,2023-08-01,604.059863,457.714066,1629.373539
51,21,19500,57.69,384272,3.399715,102.641594,101.314041,100.785660,128.765762,158.030884,27500.00,0.790527,3.50,중랑구,2023-08-01,1261.635711,206.651345,1741.126762


In [6]:
# Print the column dtypes and non-null counts, then render the frame itself
data.info()
data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 766 to 2544
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Building_Age                 5000 non-null   int64         
 1   JS_Price                     5000 non-null   int64         
 2   JS_BA                        5000 non-null   float64       
 3   Population                   5000 non-null   int64         
 4   UR                           5000 non-null   float64       
 5   LC_index                     5000 non-null   float64       
 6   CA_index                     5000 non-null   float64       
 7   TC_index                     5000 non-null   float64       
 8   SDT_index                    5000 non-null   float64       
 9   HSP_index                    5000 non-null   float64       
 10  Sell_Price                   5000 non-null   float64       
 11  Crime_Rates                  5000 non-nul

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ
766,13,43000,113.00,557563,3.800000,72.000000,78.200000,74.200000,114.344505,81.200000,54000.00,1.179391,2.75,강남구,2011-01-01,522.643644,404.136438,2717.110673
864,12,17500,59.96,486546,3.800000,72.000000,78.200000,74.200000,114.538757,81.200000,32700.00,1.198929,2.75,강동구,2011-01-01,818.829456,297.921220,1639.799834
1893,0,22000,84.83,486546,3.800000,72.000000,78.200000,74.200000,112.364655,81.200000,37259.50,1.196367,2.75,강동구,2011-01-01,747.428903,293.062666,5348.572633
4938,25,21000,84.91,486546,3.800000,72.000000,78.200000,74.200000,114.367050,81.200000,73950.00,1.136842,2.75,강동구,2011-01-01,288.600945,278.705356,3964.052557
3228,8,20500,84.71,339529,3.800000,72.000000,78.200000,74.200000,114.253929,81.200000,36811.54,1.163344,2.75,강북구,2011-01-01,813.256037,123.423393,423.404193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2560,14,22000,61.12,463298,3.396246,102.655098,101.286736,100.780952,128.892242,157.060776,30000.00,0.876201,3.50,은평구,2023-08-01,403.928032,242.558160,1248.315088
2196,6,75000,72.97,121441,3.399868,102.644089,101.288506,100.840019,128.765762,158.125519,149800.00,0.779714,3.50,중구,2023-08-01,581.172513,104.769954,842.570488
4606,15,54000,84.90,121441,4.100200,103.102043,101.486321,101.099960,133.370956,161.393890,87000.00,1.162222,3.50,중구,2023-08-01,604.059863,457.714066,1629.373539
51,21,19500,57.69,384272,3.399715,102.641594,101.314041,100.785660,128.765762,158.030884,27500.00,0.790527,3.50,중랑구,2023-08-01,1261.635711,206.651345,1741.126762


In [7]:
# Split the frame: JS_Price is the prediction target, every other column is a feature
y = data['JS_Price']
X = data.drop('JS_Price', axis=1)

# Columns treated as categorical and replaced by their target-encoded values
categorical_columns = ['Region_Name', 'YearMonth']

# Fit the target encoder on the features and target, and encode in one step
encoder = TargetEncoder(cols=categorical_columns)
X_encoded = encoder.fit_transform(X, y)

# Normalize your data if needed (adjust normalization logic as per your requirements)
def normalize(data):
    """Min-max scale ``data`` into the range [0, 1].

    For a DataFrame the minimum and maximum are taken per column, so every
    column is scaled independently; for a Series (or any object whose
    ``min``/``max`` return scalars) a single range is used.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Numeric values to scale.

    Returns
    -------
    Same type as ``data``
        ``(data - min) / (max - min)``. A column (or Series) whose values are
        all identical has a zero range; it is mapped to 0 instead of the NaN
        that a 0/0 division would produce.
    """
    lowest = data.min()
    span = data.max() - lowest

    # Swap a zero range for 1 so constant inputs scale to 0 rather than NaN.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1

    return (data - lowest) / span

# Apply normalize() to the target-encoded feature frame
X_normalized = normalize(X_encoded)

In [8]:


# Four feature-importance measures, each stored as a Series indexed by feature
# name and sorted from most to least important.
# NOTE: every measure is computed on the same rows the models are fitted on
# (X_normalized, y); no hold-out split is used in this cell.

# 1. SHAP with XGBoost
# Train an XGBoost model
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_normalized, y)

# Create a SHAP explainer for the trained XGBoost model
explainer_shap = shap.Explainer(xgb_model)
shap_values = explainer_shap.shap_values(X_normalized)

# Rank features by their mean absolute SHAP value (averaged along axis 0)
shap_scores_abs = np.abs(shap_values).mean(axis=0)
shap_ranking = pd.Series(shap_scores_abs, index=X_encoded.columns).sort_values(ascending=False)

# 2. Permutation Importance
# Reuses the XGBoost model fitted above; each feature is shuffled 10 times
result_permutation = permutation_importance(xgb_model, X_normalized, y, n_repeats=10, random_state=42)
permutation_importance_scores = result_permutation.importances_mean
permutation_ranking = pd.Series(permutation_importance_scores, index=X_encoded.columns).sort_values(ascending=False)

# 3. Mutual Information
# The estimator is randomized, so random_state is fixed (like the other
# methods in this cell) to keep the scores reproducible across runs
mi_scores = mutual_info_regression(X_normalized, y, random_state=42)
mi_ranking = pd.Series(mi_scores, index=X_encoded.columns).sort_values(ascending=False)

# 4. Random Forest
# Train a Random Forest model on all features
rf_model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_rf.fit(X_normalized, y)

# Compute feature importances from Random Forest for all features
rf_feature_importances = pd.Series(rf_model_rf.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)


In [9]:



# Define a function for min-max scaling
def min_max_scaler_mi_linear_rf_shap_permutation(x):
    """Min-max scale ``x`` into the range [0, 1].

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Scores to rescale. A DataFrame is scaled column by column.

    Returns
    -------
    Same type as ``x``
        ``(x - x.min()) / (x.max() - x.min())``, keeping the original index.
        When all values are identical the range is zero; the result is then 0
        for every entry instead of the NaN a 0/0 division would produce.
    """
    lowest = x.min()
    span = x.max() - lowest

    # Swap a zero range for 1 so constant inputs scale to 0 rather than NaN.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1

    return (x - lowest) / span

# Bring the four importance measures (mutual information, Random Forest, SHAP,
# permutation importance) onto a common 0-1 scale so they can be averaged
(
    normalized_mi_ranking,
    normalized_rf_ranking,
    normalized_shap_ranking,
    normalized_permutation_ranking,
) = map(
    min_max_scaler_mi_linear_rf_shap_permutation,
    (mi_ranking, rf_feature_importances, shap_ranking, permutation_ranking),
)

# Every measure contributes equally to the combined score
weight_mi = weight_rf = weight_shap = weight_permutation = 0.25

# Weighted sum of the scaled scores; the Series are aligned on feature name
combined_ranking = (
    normalized_mi_ranking.mul(weight_mi)
    .add(normalized_rf_ranking.mul(weight_rf))
    .add(normalized_shap_ranking.mul(weight_shap))
    .add(normalized_permutation_ranking.mul(weight_permutation))
)

# Tabulate feature names next to their combined score
combined_ranking_df = pd.DataFrame({
    'Feature': combined_ranking.index,
    'Combined_Ranking': combined_ranking.to_numpy(),
})

# Highest combined score first; Rankings_Features refers to the same sorted table
sorted_features_combined = combined_ranking_df.sort_values('Combined_Ranking', ascending=False)
Rankings_Features = sorted_features_combined

# Display the feature rankings
Rankings_Features


Unnamed: 0,Feature,Combined_Ranking
10,Sell_Price,1.0
5,JS_BA,0.565105
0,Building_Age,0.176229
8,Region_Name,0.133732
6,LC_index,0.088505
7,Population,0.07995
16,YearMonth,0.047187
12,Shortest_Distance_to_Subway,0.044961
2,Crime_Rates,0.042476
13,Shortest_Distance_to_Univ,0.040679
