In [3]:
import itertools
import pandas as pd
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.stats.outliers_influence import variance_inflation_factor

data = pd.read_csv("../../clustered_data_v2.csv")

filtered_data = data[data["cluster_all_data"] == 1]

# Selecting features and target variable
features = filtered_data.drop(['price', 'outliers_ecod',
       'cluster_all_data', 'cluster_no_outlier', ], axis=1)
target = filtered_data['price']

# Encode categorical features
features = pd.get_dummies(features, drop_first=True)

# Temporarily include 'price' for correlation calculation
features_with_price = features.copy()
features_with_price['price'] = target

# Calculate correlation matrix
correlation_matrix = features_with_price.corr().abs()

# Get features with high correlation with price
price_corr = correlation_matrix['price'].sort_values(ascending=False)
high_corr_features = price_corr.index[1:11]  # Top 10 features excluding 'price'

# Function to check if features have similar names
def has_similar_names(features):
    for i in range(len(features)):
        for j in range(i + 1, len(features)):
            if features[i].split('_')[0] == features[j].split('_')[0]:
                return True
    return False

# Function to evaluate model
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'MSE': mse, 'MAE': mae, 'R2': r2}

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'XGBoost Regressor': XGBRegressor(random_state=42)
}

# Define parameter grids for hyperparameter tuning
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Store the results
results = []

# Generate all 3-feature combinations from the high correlation features
feature_combinations = list(itertools.combinations(high_corr_features, 3))

# Train and evaluate models for each combination
for combination in feature_combinations:
    selected_features = list(combination)
    if has_similar_names(selected_features):
        continue
    X_train, X_test, y_train, y_test = train_test_split(features.loc[:, selected_features], target, test_size=0.2, random_state=42)
    
    # Adding polynomial features
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    
    combination_results = {'features': selected_features}
    
    for name, model in models.items():
        if name == 'Linear Regression':
            model.fit(X_train_poly, y_train)
            y_pred = model.predict(X_test_poly)
        elif name == 'Random Forest Regressor':
            grid_search_rf = GridSearchCV(model, param_grid_rf, cv=3, scoring='r2', n_jobs=-1)
            grid_search_rf.fit(X_train, y_train)
            best_rf = grid_search_rf.best_estimator_
            y_pred = best_rf.predict(X_test)
            combination_results['Random Forest Best Params'] = grid_search_rf.best_params_
        elif name == 'XGBoost Regressor':
            grid_search_xgb = GridSearchCV(model, param_grid_xgb, cv=3, scoring='r2', n_jobs=-1)
            grid_search_xgb.fit(X_train, y_train)
            best_xgb = grid_search_xgb.best_estimator_
            y_pred = best_xgb.predict(X_test)
            combination_results['XGBoost Best Params'] = grid_search_xgb.best_params_
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
        evaluation = evaluate_model(y_test, y_pred)
        combination_results[name] = evaluation
    
    results.append(combination_results)

# Flatten the results for easier analysis
flattened_results = []
for result in results:
    features = result['features']
    rf_params = result.get('Random Forest Best Params', None)
    xgb_params = result.get('XGBoost Best Params', None)
    for model_name, metrics in result.items():
        if model_name not in ['features', 'Random Forest Best Params', 'XGBoost Best Params']:
            flattened_results.append({
                'features': features,
                'model': model_name,
                'MSE': metrics['MSE'],
                'MAE': metrics['MAE'],
                'R2': metrics['R2'],
                'RF Params': rf_params,
                'XGB Params': xgb_params
            })

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(flattened_results)

# Display the results
results_df

48 fits failed out of a total of 96.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kduru\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kduru\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\kduru\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\kduru\AppData\Local\Programs\Python\Python312\Lib\

Unnamed: 0,features,model,MSE,MAE,R2,RF Params,XGB Params
0,"[lat, density, commute_time]",Linear Regression,1.308515e+10,83866.800620,0.465337,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
1,"[lat, density, commute_time]",Random Forest Regressor,9.355249e+09,67894.773385,0.617742,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
2,"[lat, density, commute_time]",Gradient Boosting Regressor,9.514753e+09,67890.455575,0.611224,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
3,"[lat, density, commute_time]",XGBoost Regressor,9.409413e+09,66983.385536,0.615528,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
4,"[lat, density, nearest_station_distance_km]",Linear Regression,1.456774e+10,85147.897767,0.404758,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 1.0, 'learning_rate': 0.1..."
...,...,...,...,...,...,...,...
387,"[distance_to_point_km, yr_built, lot_size_cate...",XGBoost Regressor,1.638684e+10,93924.070710,0.330429,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
388,"[grade_living_category, yr_built, lot_size_cat...",Linear Regression,1.804040e+10,103814.393849,0.262864,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 1.0, 'learning_rate': 0.1..."
389,"[grade_living_category, yr_built, lot_size_cat...",Random Forest Regressor,1.745558e+10,99621.774091,0.286760,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 1.0, 'learning_rate': 0.1..."
390,"[grade_living_category, yr_built, lot_size_cat...",Gradient Boosting Regressor,1.721577e+10,98604.274379,0.296558,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 1.0, 'learning_rate': 0.1..."


In [5]:
high_r2 = results_df[results_df['R2'] > 0.70]
high_r2.sort_values(by='R2', ascending=False)

Unnamed: 0,features,model,MSE,MAE,R2,RF Params,XGB Params
11,"[lat, density, grade_living]",XGBoost Regressor,6369924000.0,57833.612655,0.739723,"{'max_depth': 20, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 1.0, 'learning_rate': 0.1..."
15,"[lat, density, grade_living_normalized]",XGBoost Regressor,6369924000.0,57833.612655,0.739723,"{'max_depth': 20, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 1.0, 'learning_rate': 0.1..."
39,"[lat, commute_time, grade_living]",XGBoost Regressor,6535286000.0,58027.90625,0.732966,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
43,"[lat, commute_time, grade_living_normalized]",XGBoost Regressor,6535286000.0,58027.90625,0.732966,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
67,"[lat, nearest_station_distance_km, grade_livin...",XGBoost Regressor,6574923000.0,58247.208015,0.731347,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
63,"[lat, nearest_station_distance_km, grade_living]",XGBoost Regressor,6574923000.0,58247.208015,0.731347,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
98,"[lat, grade_living_normalized, distance_to_poi...",Gradient Boosting Regressor,6651570000.0,58725.739605,0.728215,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
86,"[lat, grade_living, distance_to_point_km]",Gradient Boosting Regressor,6655305000.0,58763.824344,0.728062,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
42,"[lat, commute_time, grade_living_normalized]",Gradient Boosting Regressor,6680064000.0,58901.845082,0.727051,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
38,"[lat, commute_time, grade_living]",Gradient Boosting Regressor,6706685000.0,58995.159262,0.725963,"{'max_depth': 10, 'max_features': 'sqrt', 'min...","{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
