### The changes
- Apply Label Encoding to the categorical columns
- Increase the data size from 3000 to 30000
- Use a Random Forest model to fill in NaNs with suitable values

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Read the preprocessed dataset (UTF-8 encoded CSV) into a DataFrame
data = pd.read_csv(
    '/content/Preprocessed_Total_Data_Extracted.csv',
    encoding='utf-8',
)

def normalize(data, target_column='JS_Price'):
    """Min-max scale float columns and label-encode categorical columns.

    The frame is modified in place and also returned, so both
    ``normalize(df)`` and ``df = normalize(df)`` leave ``df`` transformed.

    Args:
        data: DataFrame to transform.
        target_column: Name of a float column that must keep its original
            scale. Defaults to ``'JS_Price'``.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in data.columns:
        values = data[column]
        if pd.api.types.is_float_dtype(values):
            if column == target_column:
                continue
            shifted = values - values.min()
            span = values.max() - values.min()
            # A constant column has zero range; dividing by it would turn
            # every value into NaN, so keep the shifted values (all zeros).
            data[column] = shifted / span if span != 0 else shifted
        elif (pd.api.types.is_object_dtype(values)
              or pd.api.types.is_string_dtype(values)):
            # Perform label encoding for categorical columns, whether they
            # are stored as generic objects or as a dedicated string dtype.
            data[column] = LabelEncoder().fit_transform(values)

    return data

# Scale float columns and label-encode categorical ones before ranking
data = normalize(data)

# Define X (features) and y (target variable)
X = data.drop(columns=['JS_Price'])
y = data['JS_Price']

# Step 1: Calculate Pearson Correlation Coefficients (PCC)
pcc_ranking = X.corrwith(y).abs().sort_values(ascending=False)

# Step 2: Compute Mutual Information Scores
# The estimator is randomized; a fixed seed keeps the ranking reproducible
# from run to run, matching the seeded Random Forest below.
mi_scores = mutual_info_regression(X, y, random_state=84)
mi_ranking = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

# Step 3: Fit a Linear Regression Model
# NOTE(review): absolute coefficients are only comparable between features
# that share a scale; normalize() rescales float columns only, so integer
# and label-encoded columns keep their raw ranges -- confirm this is intended.
linear_model = LinearRegression()
linear_model.fit(X, y)
linear_regression_coefficients = (
    pd.Series(linear_model.coef_, index=X.columns)
    .abs()
    .sort_values(ascending=False)
)

# Step 4: Train a Random Forest Model
# n_jobs=-1 fits the trees on all available cores; the seed is unchanged.
rf_model = RandomForestRegressor(n_estimators=1000, random_state=84, n_jobs=-1)
rf_model.fit(X, y)
rf_feature_importances = (
    pd.Series(rf_model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)


# Normalize each ranking
def min_max_scaler(x):
    """Rescale the scores in Series ``x`` to the [0, 1] range.

    When every score is identical the range is zero; the scores are then
    returned shifted to 0 instead of being divided into NaN.
    """
    lowest = x.min()
    span = x.max() - lowest
    if span == 0:
        return x - lowest
    return (x - lowest) / span


normalized_pcc = min_max_scaler(pcc_ranking)
normalized_mi = min_max_scaler(mi_ranking)
normalized_linear = min_max_scaler(linear_regression_coefficients)
normalized_rf = min_max_scaler(rf_feature_importances)

# Define your weights for each ranking method (you can adjust these weights)
weight_pcc = 0.2
weight_mi = 0.2
weight_linear = 0.3
weight_rf = 0.3

# Combine the normalized rankings with weights.
# Series addition aligns on the feature name, so the differing sort order
# of the four rankings does not matter here.
combined_ranking = (
    weight_pcc * normalized_pcc +
    weight_mi * normalized_mi +
    weight_linear * normalized_linear +
    weight_rf * normalized_rf
)

# Create a DataFrame with the combined ranking
combined_ranking_df = pd.DataFrame({
    'Feature': combined_ranking.index,
    'Combined_Ranking': combined_ranking.values,
})

# Sort the features by the combined ranking
sorted_features = combined_ranking_df.sort_values(by='Combined_Ranking', ascending=False)

Ranking_Features = sorted_features

# Bare expression: displays the ranking table as the notebook cell output
Ranking_Features



Unnamed: 0,Feature,Combined_Ranking
13,Sell_Price,1.0
6,JS_BA,0.472456
7,LC_index,0.138212
2,CA_index,0.117569
4,HSP_index,0.114227
14,Subway_Counts,0.108873
18,YearMonth,0.107899
15,TC_index,0.100193
5,IR,0.094278
3,Crime_Rates,0.093869
