In [1]:
import pandas as pd
import os

In [2]:
# Load the preprocessed dataset. The path is named once so the existence check,
# the read and the error message cannot drift apart.
PROCESSED_CSV_PATH = 'data/processed_df.csv'

if not os.path.exists(PROCESSED_CSV_PATH):
    # Fail fast and report both the missing path and the directory it was
    # resolved against, so a wrong working directory is obvious from the traceback.
    raise FileNotFoundError(
        f'File Not Found! Expected {PROCESSED_CSV_PATH!r} '
        f'relative to {os.getcwd()!r}'
    )

df = pd.read_csv(PROCESSED_CSV_PATH)
print('csv file successfully loaded!')

csv file successfully loaded!


In [3]:
# Target vector: an independent copy of the scaled selling-price column.
y = df.loc[:, 'selling_price_scaled'].copy()

In [4]:
# Feature matrix: everything in df except the scaled target, the
# 'selling_price' column (presumably the unscaled target - confirm) and 'name'.
x = df.drop(
    ['selling_price', 'selling_price_scaled', 'name'],
    axis='columns',
).copy()

In [5]:
# import libraries
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.linear_model import Ridge,Lasso,LinearRegression
from sklearn.preprocessing import RobustScaler,PolynomialFeatures,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor


In [6]:
# Hold out 20% of the rows as a test set (fixed seed, so the split is
# reproducible), then persist all four pieces under data/.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.20,
)

# Feature frames are written without their index.
for parquet_path, feature_frame in (
    ('data/x_train.parquet', x_train),
    ('data/x_test.parquet', x_test),
):
    feature_frame.to_parquet(parquet_path, index=False)

# Each target is wrapped in a one-column frame before writing.
# NOTE(review): unlike the feature files, these are written with pandas'
# default index handling, so the two sets of files differ - confirm intended.
for parquet_path, column_name, target_series in (
    ('data/y_train.parquet', 'y_train', y_train),
    ('data/y_test.parquet', 'y_test', y_test),
):
    target_series.to_frame(column_name).to_parquet(parquet_path)

In [None]:
# Partition the training columns by dtype; each group is routed to its own
# preprocessing branch below.
# NOTE(review): a column whose dtype is neither numeric nor object is picked up
# by neither selector - confirm no such columns exist in x_train.
numeric_cols = x_train.select_dtypes(include=['number']).columns
category_cols = (
    x_train
    .select_dtypes(include=['object'])
    .columns
    .astype(str)
)

# Numeric branch: median imputation -> robust scaling -> polynomial expansion
# (no bias column). The polynomial degree is not set here; the search grid
# overrides it via 'preprocessor__num__poly__degree' where that key is listed.
num_preprocessor = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
])

# Categorical branch: most-frequent imputation -> one-hot encoding.
# handle_unknown='ignore' keeps transform from failing on categories that were
# not present when the encoder was fitted.
cat_preprocessor = Pipeline([
    ('simpute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', num_preprocessor, numeric_cols),
    ('cat', cat_preprocessor, category_cols),
])

# Cross-validation scheme: 5 shuffled folds with a fixed seed. Each fold serves
# once as the validation set while the other four are used for training.
cv = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

# Hyperparameter search space.
# Maps a model label to {'model': unfitted estimator, 'params': grid for GridSearchCV}.
# Keys prefixed 'classifier__' target the final pipeline step; keys prefixed
# 'preprocessor__num__poly__' target the PolynomialFeatures step of the numeric branch.
models_grid = {
    'xgb_regressor': {
        'model': XGBRegressor(objective='reg:squarederror', random_state=42),
        'params': {
            'classifier__learning_rate': [0.03, 0.05, 0.08],
            'classifier__n_estimators': [80, 100, 120],
            'classifier__max_depth': [6, 8, 10],
            # XGBoost has no `min_samples_split` (that is a scikit-learn tree
            # parameter); the booster ignores it, so searching over it only
            # triples the number of fits. `min_child_weight` is XGBoost's
            # analogous split-restriction control.
            'classifier__min_child_weight': [3, 4, 7],
            'classifier__reg_lambda': [0.2, 0.4, 0.6, 0.8],
        },
    },

    'Linear_regression': {
        'model': LinearRegression(n_jobs=-1),
        'params': {
            'preprocessor__num__poly__degree': [1, 2, 3, 4],
        },
    },

    'Lasso': {
        'model': Lasso(random_state=42),
        'params': {
            'preprocessor__num__poly__degree': [1, 2],
            'classifier__alpha': [0.1, 0.5, 1.0, 10, 50],
            'classifier__max_iter': [1000, 1500, 2000],
        },
    },

    'Ridge': {
        'model': Ridge(random_state=42),
        'params': {
            'preprocessor__num__poly__degree': [1, 2],
            'classifier__alpha': [0.1, 0.5, 1.0, 10, 50],
            'classifier__max_iter': [1000, 1500, 2000],
        },
    },

    'decision_tree': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'preprocessor__num__poly__degree': [1, 2],
            'classifier__max_depth': [4, 6, 8, None],
            'classifier__min_samples_split': [2, 4, 8],
        },
    },

    'random_forest': {
        'model': RandomForestRegressor(n_jobs=-1, random_state=42),
        'params': {
            'preprocessor__num__poly__degree': [1, 2],
            'classifier__max_depth': [4, 6, 8, None],
            'classifier__min_samples_split': [2, 4, 8],
        },
    },
}

# Imported once up front rather than on every loop iteration.
import json

import joblib

# Create the output directory before any fitting starts, so a missing folder
# cannot cause a failure only after a long grid search has already completed.
os.makedirs('models', exist_ok=True)

# Running record of the best candidate seen so far. Scores are negative MSE,
# so a larger (closer to zero) value is better.
best_score = -float('inf')
best_model_name = None
best_estimator = None

# Per-model summary: {model label: {'best_score': ..., 'best_params': ...}}.
result = {}

# 4. pipeline for model - one full grid search per entry in models_grid
for name, models in models_grid.items():
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', models['model'])
    ])

    # GridSearchCV evaluates every parameter combination with the shared CV
    # splitter and, because refit=True, refits the best one on all of x_train.
    model = GridSearchCV(estimator=pipe,
                         param_grid=models['params'],
                         cv=cv,
                         n_jobs=-1,
                         return_train_score=True,
                         scoring='neg_mean_squared_error',
                         refit=True,
                         verbose=2)

    # model training
    print(f'Fitting GridSearchCV for {name} (This may take a while)...')
    model.fit(x_train, y_train)
    print('\n', '-'*50, '\n')

    result[name] = {
        # Cast to a plain float so the value serialises to JSON regardless of
        # the numeric type GridSearchCV returns.
        'best_score': float(model.best_score_),
        'best_params': model.best_params_
    }
    if model.best_score_ > best_score:
        best_score = model.best_score_
        best_model_name = name
        best_estimator = model.best_estimator_

    # Persist this model's refitted best pipeline.
    joblib.dump(model.best_estimator_, f'models/{name}_best_model.pkl')

    # Rewrite the summary after every model so partial results survive an
    # interruption of the remaining searches.
    with open('models/results_summary', 'w') as file:
        json.dump(result, file, indent=4)

