# Model Building

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.api import OLS, add_constant
from statsmodels.tools.eval_measures import aic, bic

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [3]:
# with open('../Data/after_feature_engineering.pkl', 'rb') as f:
#     df_train = pickle.load(f)
#     df_valid = pickle.load(f)
#     df_test = pickle.load(f)
#     df_train_encoded = pickle.load(f)
#     df_valid_encoded = pickle.load(f)
#     df_test_encoded = pickle.load(f)

In [3]:
with open('/content/drive/My Drive/Colab Notebooks/after_feature_engineering.pkl', 'rb') as f:
    df_train = pickle.load(f)
    df_valid = pickle.load(f)
    df_test = pickle.load(f)
    df_train_encoded = pickle.load(f)
    df_valid_encoded = pickle.load(f)
    df_test_encoded = pickle.load(f)

In [4]:
# with open('../Data/selected_features.pkl', 'rb') as f:
#     selected_features = pickle.load(f)

In [5]:
with open('/content/drive/My Drive/Colab Notebooks/selected_features.pkl', 'rb') as f:
    selected_features = pickle.load(f)

In [6]:
df_train.head()

Unnamed: 0,unit_id,obj_type,dim_m2,n_rooms,floor_no,floor_max,year_built,dist_centre,n_poi,dist_sch,...,price_z,src_month,loc_code,market_volatility,infrastructure_quality,neighborhood_crime_rate,popularity_index,green_space_ratio,estimated_maintenance_cost,global_economic_index
45366,f13df795472bee05,Unknown,59.39,3.0,3.0,4.0,1980.0,2.578,19.0,0.326,...,12.834406,2023-12,533f6886,358182.61,50.29,19.44,43.22,1.0,17.98,93.327971
86630,003ffcbd3294c3c9,0d6c4dfc,52.95,2.0,2.0,6.0,2023.0,8.413,13.0,0.387,...,13.570913,2023-12,693f303c,795187.23,1.13,77.42,31.27,0.999,24.08,94.280615
139489,f5d1fc8f4a154cd4,0d6c4dfc,24.55,2.0,2.0,4.0,1993.0,4.479,2.0,0.561,...,13.362486,2024-06,693f303c,599098.08,21.12,90.45,70.33,1.0,9.25,108.618716
24126,02376da3fe009bb0,Unknown,57.58,3.0,3.0,3.0,2017.0,13.367,0.0,1.195,...,14.072345,2024-05,693f303c,1503238.02,8.08,46.21,39.09,0.999,8.48,100.635935
151982,7b6bb74dcdff86a3,2a6d5c01,73.71,3.0,1.0,2.0,2016.0,9.365,0.0,2.159,...,13.705193,2023-10,693f303c,893331.75,9.38,42.41,59.41,0.999,24.94,93.385347


In [7]:
df_train_encoded.head()

Unnamed: 0,unit_id,dim_m2,n_rooms,floor_no,floor_max,year_built,dist_centre,n_poi,dist_sch,dist_clinic,...,green_space_ratio,estimated_maintenance_cost,global_economic_index,obj_type_0d6c4dfc,obj_type_2a6d5c01,obj_type_Unknown,build_mat_7f8c00f9,build_mat_Unknown,has_lift_no,has_lift_yes
45366,f13df795472bee05,59.39,3.0,3.0,4.0,1980.0,2.578,19.0,0.326,0.462,...,1.0,17.98,93.327971,0,0,1,0,1,1,0
86630,003ffcbd3294c3c9,52.95,2.0,2.0,6.0,2023.0,8.413,13.0,0.387,1.423,...,0.999,24.08,94.280615,1,0,0,0,1,0,1
139489,f5d1fc8f4a154cd4,24.55,2.0,2.0,4.0,1993.0,4.479,2.0,0.561,0.611,...,1.0,9.25,108.618716,1,0,0,0,1,1,0
24126,02376da3fe009bb0,57.58,3.0,3.0,3.0,2017.0,13.367,0.0,1.195,3.004,...,0.999,8.48,100.635935,0,0,1,0,1,0,1
151982,7b6bb74dcdff86a3,73.71,3.0,1.0,2.0,2016.0,9.365,0.0,2.159,4.349,...,0.999,24.94,93.385347,0,1,0,0,0,0,1


In [8]:
df_train[selected_features].head()

Unnamed: 0,market_volatility,dim_m2,n_rooms,estimated_maintenance_cost,year_built,n_poi,dist_centre,green_space_ratio,infrastructure_quality,obj_type,build_mat,has_park,has_balcony,has_lift,has_sec,has_store,src_month
45366,358182.61,59.39,3.0,17.98,1980.0,19.0,2.578,1.0,50.29,Unknown,Unknown,no,yes,no,no,yes,2023-12
86630,795187.23,52.95,2.0,24.08,2023.0,13.0,8.413,0.999,1.13,0d6c4dfc,Unknown,no,yes,yes,yes,no,2023-12
139489,599098.08,24.55,2.0,9.25,1993.0,2.0,4.479,1.0,21.12,0d6c4dfc,Unknown,no,yes,no,no,no,2024-06
24126,1503238.02,57.58,3.0,8.48,2017.0,0.0,13.367,0.999,8.08,Unknown,Unknown,yes,no,yes,yes,no,2024-05
151982,893331.75,73.71,3.0,24.94,2016.0,0.0,9.365,0.999,9.38,2a6d5c01,7ceffe3b,yes,yes,yes,no,yes,2023-10


In [9]:
df_train_encoded.select_dtypes(include = 'object').head()

Unnamed: 0,unit_id,has_park,has_balcony,has_sec,has_store,src_month,loc_code
45366,f13df795472bee05,no,yes,no,yes,2023-12,533f6886
86630,003ffcbd3294c3c9,no,yes,yes,no,2023-12,693f303c
139489,f5d1fc8f4a154cd4,no,yes,no,no,2024-06,693f303c
24126,02376da3fe009bb0,yes,no,yes,no,2024-05,693f303c
151982,7b6bb74dcdff86a3,yes,yes,no,yes,2023-10,693f303c


In [10]:
selected_features

['market_volatility',
 'dim_m2',
 'n_rooms',
 'estimated_maintenance_cost',
 'year_built',
 'n_poi',
 'dist_centre',
 'green_space_ratio',
 'infrastructure_quality',
 'obj_type',
 'build_mat',
 'has_park',
 'has_balcony',
 'has_lift',
 'has_sec',
 'has_store',
 'src_month']

In [11]:
df_train_encoded.columns.tolist()

['unit_id',
 'dim_m2',
 'n_rooms',
 'floor_no',
 'floor_max',
 'year_built',
 'dist_centre',
 'n_poi',
 'dist_sch',
 'dist_clinic',
 'dist_post',
 'dist_kind',
 'dist_rest',
 'dist_uni',
 'dist_pharma',
 'has_park',
 'has_balcony',
 'has_sec',
 'has_store',
 'price_z',
 'src_month',
 'loc_code',
 'market_volatility',
 'infrastructure_quality',
 'neighborhood_crime_rate',
 'popularity_index',
 'green_space_ratio',
 'estimated_maintenance_cost',
 'global_economic_index',
 'obj_type_0d6c4dfc',
 'obj_type_2a6d5c01',
 'obj_type_Unknown',
 'build_mat_7f8c00f9',
 'build_mat_Unknown',
 'has_lift_no',
 'has_lift_yes']

In [12]:
selected_features = ['dim_m2',
 'n_rooms',
 'year_built',
 'dist_centre',
 'n_poi',
 'has_park',
 'has_balcony',
 'has_sec',
 'has_store',
 'price_z',
 'src_month',
 'market_volatility',
 'infrastructure_quality',
 'green_space_ratio',
 'estimated_maintenance_cost',
 'obj_type_0d6c4dfc',
 'obj_type_2a6d5c01',
 'obj_type_Unknown',
 'build_mat_7f8c00f9',
 'build_mat_Unknown',
 'has_lift_no',
 'has_lift_yes']

In [13]:
selected_features

['dim_m2',
 'n_rooms',
 'year_built',
 'dist_centre',
 'n_poi',
 'has_park',
 'has_balcony',
 'has_sec',
 'has_store',
 'price_z',
 'src_month',
 'market_volatility',
 'infrastructure_quality',
 'green_space_ratio',
 'estimated_maintenance_cost',
 'obj_type_0d6c4dfc',
 'obj_type_2a6d5c01',
 'obj_type_Unknown',
 'build_mat_7f8c00f9',
 'build_mat_Unknown',
 'has_lift_no',
 'has_lift_yes']

In [14]:
df_train_encoded[selected_features].select_dtypes(include='object').head()

Unnamed: 0,has_park,has_balcony,has_sec,has_store,src_month
45366,no,yes,no,yes,2023-12
86630,no,yes,yes,no,2023-12
139489,no,yes,no,no,2024-06
24126,yes,no,yes,no,2024-05
151982,yes,yes,no,yes,2023-10


In [15]:
obj_cols = df_train_encoded[selected_features].select_dtypes(include='object').columns
obj_cols

Index(['has_park', 'has_balcony', 'has_sec', 'has_store', 'src_month'], dtype='object')

In [16]:
bool_cols = ['has_park', 'has_balcony', 'has_sec', 'has_store']

df_train_encoded[bool_cols] = df_train_encoded[bool_cols].replace({'yes': 1, 'no': 0})


  df_train_encoded[bool_cols] = df_train_encoded[bool_cols].replace({'yes': 1, 'no': 0})


In [17]:
df_train_encoded['src_month'] = pd.to_datetime(df_train_encoded['src_month'])
df_train_encoded['src_month'] = df_train_encoded['src_month'].map(lambda x: x.toordinal())

In [18]:
df_train_encoded[selected_features] = df_train_encoded[selected_features].apply(pd.to_numeric)

In [19]:
df_train_encoded[selected_features].head()

Unnamed: 0,dim_m2,n_rooms,year_built,dist_centre,n_poi,has_park,has_balcony,has_sec,has_store,price_z,...,infrastructure_quality,green_space_ratio,estimated_maintenance_cost,obj_type_0d6c4dfc,obj_type_2a6d5c01,obj_type_Unknown,build_mat_7f8c00f9,build_mat_Unknown,has_lift_no,has_lift_yes
45366,59.39,3.0,1980.0,2.578,19.0,0,1,0,1,12.834406,...,50.29,1.0,17.98,0,0,1,0,1,1,0
86630,52.95,2.0,2023.0,8.413,13.0,0,1,1,0,13.570913,...,1.13,0.999,24.08,1,0,0,0,1,0,1
139489,24.55,2.0,1993.0,4.479,2.0,0,1,0,0,13.362486,...,21.12,1.0,9.25,1,0,0,0,1,1,0
24126,57.58,3.0,2017.0,13.367,0.0,1,0,1,0,14.072345,...,8.08,0.999,8.48,0,0,1,0,1,0,1
151982,73.71,3.0,2016.0,9.365,0.0,1,1,0,1,13.705193,...,9.38,0.999,24.94,0,1,0,0,0,0,1


In [20]:
df_train_encoded[selected_features].info()

<class 'pandas.core.frame.DataFrame'>
Index: 93872 entries, 45366 to 15725
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   dim_m2                      93872 non-null  float64
 1   n_rooms                     93872 non-null  float64
 2   year_built                  93872 non-null  float64
 3   dist_centre                 93872 non-null  float64
 4   n_poi                       93872 non-null  float64
 5   has_park                    93872 non-null  int64  
 6   has_balcony                 93872 non-null  int64  
 7   has_sec                     93872 non-null  int64  
 8   has_store                   93872 non-null  int64  
 9   price_z                     93872 non-null  float64
 10  src_month                   93872 non-null  int64  
 11  market_volatility           93872 non-null  float64
 12  infrastructure_quality      93872 non-null  float64
 13  green_space_ratio           9387

In [21]:
X_train = df_train_encoded[selected_features].drop('price_z', axis=1)

y_train = df_train_encoded['price_z']

In [22]:
model_ols = sm.OLS(y_train, X_train).fit()
print(model_ols.summary())

                                 OLS Regression Results                                
Dep. Variable:                price_z   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          2.088e+07
Date:                Mon, 10 Nov 2025   Prob (F-statistic):                        0.00
Time:                        21:54:55   Log-Likelihood:                          19378.
No. Observations:               93872   AIC:                                 -3.871e+04
Df Residuals:                   93851   BIC:                                 -3.852e+04
Df Model:                          21                                                  
Covariance Type:            nonrobust                                                  
                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------

In [24]:
import sys
sys.path.append('../src')  # adjust path if needed

from backward_elimination import backward_elimination_aic_bic

In [25]:
# define a function that will perform backward elimination
# using AIC or BIS as the criterion

def backward_elimination_aic_bic(X, y, criterion = 'AIC'):
    '''
    Perform backward elimination using AIC or BIC as the criterion.

    Parameters:
        X (DataFrame): Feature matrix with a constant column.
        y (Series): Target variable.
        criterion (str): 'AIC' or 'BIC' (default: 'AIC').

    Returns:
        statsmodels OLS fitted model with selected features.
    '''
    model = sm.OLS(y, X).fit()

    while len(X.columns) > 1: # at least one predictor + constant
        best_criterion = model.aic if criterion == 'AIC' else model.bic

        # compute AIC/BIC for models without each predictor
        aic_bic_values = {}
        for col in X.columns[1:]:  # skip intercept
            X_new = X.drop(columns = [col])
            new_model = sm.OLS(y, X_new).fit()
            aic_bic_values[col] = new_model.aic if criterion == 'AIC' else new_model.bic

        # find the feature whose removal lowers AIC/BIC the most
        worst_feature = min(aic_bic_values, key = aic_bic_values.get)
        worst_aic_bic = aic_bic_values[worst_feature]

        # stop if no improvement
        if worst_aic_bic >= best_criterion:
            break

        # remove the feature and update the model
        X = X.drop(columns = [worst_feature])
        model = sm.OLS(y, X).fit()

    return model

In [26]:
model_ols_aic = backward_elimination_aic_bic(X_train, y_train, criterion = 'AIC')

print(model_ols_aic.summary())

                                 OLS Regression Results                                
Dep. Variable:                price_z   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          2.308e+07
Date:                Mon, 10 Nov 2025   Prob (F-statistic):                        0.00
Time:                        21:55:26   Log-Likelihood:                          19378.
No. Observations:               93872   AIC:                                 -3.872e+04
Df Residuals:                   93853   BIC:                                 -3.854e+04
Df Model:                          19                                                  
Covariance Type:            nonrobust                                                  
                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------

In [27]:
model_ols_bic = backward_elimination_aic_bic(X_train, y_train, criterion = 'BIC')

print(model_ols_bic.summary())

                                 OLS Regression Results                                
Dep. Variable:                price_z   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          2.740e+07
Date:                Mon, 10 Nov 2025   Prob (F-statistic):                        0.00
Time:                        21:55:55   Log-Likelihood:                          19368.
No. Observations:               93872   AIC:                                 -3.870e+04
Df Residuals:                   93856   BIC:                                 -3.855e+04
Df Model:                          16                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [28]:
model_linear = LinearRegression()

model_linear.fit(X_train, y_train)

---
For OLS we have:

model_ols\
model_ols_aic\
model_ols_bic\
model_linear

---

In [29]:
cv5 = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = 123
)

polynomial_pipeline = Pipeline(
    [
        ('generator', PolynomialFeatures()),
        ('model', LinearRegression())
    ]
)

# degrees = list(range(1, 5))
degrees = [1, 2]

# we need to define a grid of hyperparameters for the PolynomailFeatures() transformer
# it has to be a dictionary with the name of the step and the name of the hyperparameter
# as defined in the pipeline and algorithm documentation

# ex: here we use the name "generator__degree" because
# our pipeline step is named "generator" and the hyperparameter of the
# PolynomialFeatures() transformer is named "degree"
polynomial_grid = {'generator__degree': degrees}

# define a GridSearchCV object to find the optimal degree
polynomial_grid_search = GridSearchCV(
    polynomial_pipeline, # model or pipeline to tune
    param_grid = polynomial_grid, # dictionary with hyperparameters
    cv = cv5, # cross-validation
    # python assumes the higher the better, hence we use negative RMSE
    scoring = 'neg_root_mean_squared_error', # evaluation metric
    n_jobs = -1
)

# fit the model
polynomial_grid_search.fit(X_train, y_train)

# adding "_best" to GridSearchCV result
model_polynomial_best = polynomial_grid_search.best_estimator_

# print the hyperparameters of the best selected model
print('Best degree:', polynomial_grid_search.best_params_)
print('Best -RMSE:', polynomial_grid_search.best_score_)

Best degree: {'generator__degree': 2}
Best -RMSE: -0.1323919921960286


In [27]:
# polynomial_grid_search.cv_results_

In [None]:
# cv_loo = LeaveOneOut()

# polynomial_random_search = RandomizedSearchCV(
#     polynomial_pipeline,
#     param_distributions = polynomial_grid,
#     cv = cv_loo,
#     n_iter = 2,
#     # Difference: MAPE metric
#     scoring = 'neg_mean_absolute_percentage_error',
#     n_jobs = -1
# )

# polynomial_random_search.fit(X_train, y_train)

# print('Best degree:', polynomial_random_search.best_params_)
# print('Best -MAPE:', polynomial_random_search.best_score_)

# model_polynomial_best2 = polynomial_random_search.best_estimator_

# Ridge

In [30]:
alphas = [0.01, 0.1, 1, 10, 100, 1000]

# as we already did preprocessing no need to define it
ridge_pipeline = Pipeline([
    ("model", Ridge())
])

# 5-Fold Cross-Validation
cv5 = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = 123
)

param_grid = {"model__alpha": alphas}

# GridSearchCV
ridge_grid_search = GridSearchCV(
    ridge_pipeline,
    param_grid = param_grid,
    cv = cv5,
    scoring = "neg_root_mean_squared_error",
    n_jobs = -1
)

ridge_grid_search.fit(X_train, y_train)

model_ridge_best = ridge_grid_search.best_estimator_

print("Best alpha:", ridge_grid_search.best_params_)
print("Best -RMSE:", ridge_grid_search.best_score_)


Best alpha: {'model__alpha': 0.01}
Best -RMSE: -0.19660116158713822


  return f(*arrays, *other_args, **kwargs)


# LASSO

In [31]:
alphas = np.logspace(-3, 4, 50)

lasso_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ]
)

cv5 = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = 123
)


param_grid = {
    'model__alpha': alphas
}

lasso_grid_search = GridSearchCV(
    estimator = lasso_pipeline,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error', # could be R2 too
    cv = cv5,
    n_jobs = -1
)

lasso_grid_search.fit(X_train, y_train)

model_lasso_best = lasso_grid_search.best_estimator_

print('Best alpha:', lasso_grid_search.best_params_)
print('Best (negative) MAE:', lasso_grid_search.best_score_)


Best alpha: {'model__alpha': np.float64(0.0026826957952797246)}
Best (negative) MAE: -0.1462577666789821


# Elastic Net

In [32]:
alphas = [0.01, 0.1, 1, 10]
l1_ratios = [0.1, 0.5, 0.9]

elastic_pipeline = Pipeline(
    [
        # ('scaler', StandardScaler()), # z-score scaling
        ('model', ElasticNet(max_iter=2000, tol=0.01))
    ]

)

param_grid = {
    'model__alpha': alphas,
    'model__l1_ratio': l1_ratios
}

elastic_grid_search = GridSearchCV(
    estimator = elastic_pipeline,
    param_grid = param_grid,
    scoring = 'neg_mean_absolute_error',
    cv = cv5,
    n_jobs = -1
)

elastic_grid_search.fit(X_train, y_train)

model_elastic_best = elastic_grid_search.best_estimator_

print('Best alpha:', elastic_grid_search.best_params_['model__alpha'])
print('Best l1_ratio:', elastic_grid_search.best_params_['model__l1_ratio'])
print('Best (negative) MAE:', elastic_grid_search.best_score_)

Best alpha: 0.01
Best l1_ratio: 0.1
Best (negative) MAE: -0.14615683365753543


# SVM - SVR

In [33]:
from sklearn.svm import SVC, SVR

In [37]:
# SVR pipeline
svr_pipeline = Pipeline([
    # ('scaler', StandardScaler()),
    ('model', SVR())
])

# # Hyperparameter grid
# param_grid = {
#     # model__C : 100 was computationally expensive so changing to 50
#     'model__C': [0.1, 1, 10, 50],
#     'model__epsilon': [0.01, 0.1, 0.5, 1],
#     'model__gamma': ['scale', 'auto'],  # or numeric values like 0.01, 0.1
#     'model__kernel': ['rbf']  # you can also try 'linear', 'poly', etc.
# }

# previous version was time consuming
param_grid = {
    'model__C': [0.1, 1, 10],
    'model__epsilon': [0.01, 0.1, 0.5],
    'model__gamma': ['scale'],
    'model__kernel': ['rbf']
}

# GridSearchCV
svr_grid_search = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=cv5,
    n_jobs=-1
)

# Fit
svr_grid_search.fit(X_train, y_train)

# Best model
model_svr_best = svr_grid_search.best_estimator_

print('Best C:', svr_grid_search.best_params_['model__C'])
print('Best epsilon:', svr_grid_search.best_params_['model__epsilon'])
print('Best gamma:', svr_grid_search.best_params_['model__gamma'])
print('Best (negative) MAE:', svr_grid_search.best_score_)

In [44]:
# SVR with GridSearchCV took soo much time

svr_pipeline = Pipeline([
    # ('scaler', StandardScaler()),
    ('model', SVR())
])

param_dist = {
    'model__C': [50],
    'model__epsilon': [0.01],
    'model__gamma': [0.01],  # keep only scale for speed
    'model__kernel': ['rbf']
}

svr_random_search = RandomizedSearchCV(
    estimator=svr_pipeline,
    param_distributions=param_dist,
    n_iter=5,  # try 10 random combinations
    scoring='neg_mean_absolute_error',
    cv=3,       # fewer folds for speed
    n_jobs=-1,
    random_state=123
)

svr_random_search.fit(X_train, y_train)

model_svr_best = svr_random_search.best_estimator_
print(svr_random_search.best_params_)
print(svr_random_search.best_score_)


In [45]:
svr_pipeline = Pipeline([
    # ('scaler', StandardScaler()),  # optional if X already scaled
    ('model', SVR(C=1.0, epsilon=0.1, kernel='rbf', gamma='scale', max_iter=2000))
])

# Fit the model
svr_pipeline.fit(X_train, y_train)



In [47]:
model_svr = SVR(
    C=1.0,
    epsilon=0.1,
    kernel='rbf',
    gamma='scale',
    max_iter=2000
)

# Fit the model
model_svr.fit(X_train, y_train)



In [49]:
with open('/content/drive/My Drive/Colab Notebooks/after_model_building.pkl', 'wb') as f:
    pickle.dump(model_ols, f)
    pickle.dump(model_ols_aic, f)
    pickle.dump(model_ols_bic, f)
    pickle.dump(model_linear, f)
    pickle.dump(model_polynomial_best, f)
    pickle.dump(model_ridge_best, f)
    pickle.dump(model_lasso_best, f)
    pickle.dump(model_elastic_best, f)
    pickle.dump(model_svr, f)
    pickle.dump(selected_features, f)

In [48]:
# for loading next ...

# with open('/content/drive/My Drive/Colab Notebooks/after_model_building.pkl', 'rb') as f:
#     model_ols = pickle.load(f)
#     model_ols_aic = pickle.load(f)
#     model_ols_bic = pickle.load(f)
#     model_linear = pickle.load(f)
#     model_polynomial_best = pickle.load(f)
#     model_ridge_best = pickle.load(f)
#     model_lasso_best = pickle.load(f)
#     model_elastic_best = pickle.load(f)
#     model_svr = pickle.load(f)
#     selected_features = pickle.load(f)
