In [1]:
import pandas as pd
import os

In [2]:
# Stop immediately if the preprocessed dataset is missing; otherwise load it.
if not os.path.exists('data/processed_df.csv'):
    raise FileNotFoundError('File Not Found!')

df = pd.read_csv('data/processed_df.csv')
print('csv file successfully loaded!')

csv file successfully loaded!


In [None]:
# Regression target: an independent copy of the 'selling_price_scaled' column.
y = df.loc[:, 'selling_price_scaled'].copy()

In [4]:
# Feature frame: every column except both price columns (raw and scaled
# target) and the 'name' column, copied so `df` itself stays untouched.
non_feature_cols = ['selling_price', 'selling_price_scaled', 'name']
x = df.drop(columns=non_feature_cols).copy()

In [8]:
# import libraries
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.linear_model import Ridge,Lasso,LinearRegression
from sklearn.preprocessing import RobustScaler,PolynomialFeatures,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor


In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# partition identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Persist the four splits as parquet files under data/.
# Feature frames are written without their index; target frames keep it.
x_train.to_parquet('data/x_train.parquet', index=False)
x_test.to_parquet('data/x_test.parquet', index=False)
y_train.to_frame('y_train').to_parquet('data/y_train.parquet')
# Fix: this line used to export y_train again, so data/y_test.parquet held
# the training labels instead of the held-out ones.
y_test.to_frame('y_test').to_parquet('data/y_test.parquet')

In [None]:
# Column groups are derived from the training frame's dtypes.
numeric_cols = x_train.select_dtypes(include=['number']).columns
category_cols = x_train.select_dtypes(include=['object']).columns.astype(str)

# 1. Numeric branch: median imputation -> robust scaling -> polynomial
#    feature expansion (no bias column; the degree is tuned via the grid).
num_preprocessor = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
])

# 2. Categorical branch: most-frequent imputation -> one-hot encoding.
#    handle_unknown='ignore' lets categories unseen during fit pass through
#    transform without raising.
cat_preprocessor = Pipeline([
    ('simpute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# 3. Route each column group through its matching branch.
preprocessor = ColumnTransformer([
    ('num', num_preprocessor, numeric_cols),
    ('cat', cat_preprocessor, category_cols),
])

# Cross-validation splitter (model evaluation technique 1):
# shuffled 5-fold with a fixed seed, so each candidate is trained on four
# folds and validated on the remaining one, five times over.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Candidate estimators and their search spaces, keyed by model name.
# Each entry pairs the estimator ('model') with its own grid ('params').
#
# Fix: the previous flat layout repeated the top-level key 'params' for every
# model. A dict literal keeps only the last value of a repeated key, so just
# ridge's grid survived and no grid was associated with its estimator.
#
# NOTE(review): the 'classifier__*' keys only resolve in a pipeline whose
# estimator step is named 'classifier'; the XGBoost pipeline in this notebook
# names that step 'model'. Confirm the step name used when these are searched.
models_grid = {
    'linear_regression': {
        'model': LinearRegression(n_jobs=-1),
        'params': {
            'preprocessor__num__poly__degree': [1, 2, 3, 4],
        },
    },
    'lasso': {
        'model': Lasso(random_state=42),
        'params': {
            'preprocessor__num__poly__degree': [1, 2],
            'classifier__alpha': [0.1, 0.5, 1.0, 10, 50],
            'classifier__max_iter': [1000, 1500, 2000],
        },
    },
    'ridge': {
        'model': Ridge(random_state=42),
        'params': {
            'preprocessor__num__poly__degree': [1, 2],
            'classifier__alpha': [0.1, 0.5, 1.0, 10, 50],
            'classifier__max_iter': [1000, 1500, 2000],
        },
    },
    'decision_tree_regressor': {
        # Seeded like the other estimators so repeated runs are comparable.
        'model': DecisionTreeRegressor(random_state=42),
        # No search space was specified for the tree; an empty grid means a
        # single candidate using the estimator's defaults.
        'params': {},
    },
}

# 4. End-to-end XGBoost pipeline: shared preprocessing followed by the
#    regressor (squared-error objective, silent logging, fixed seed).
xgb_regressor = XGBRegressor(
    objective='reg:squarederror',
    verbosity=0,
    random_state=42,
)
pipe_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('model', xgb_regressor),
])

# Hyperparameter search space (model evaluation technique 5).
# Keys use the `<step>__<param>` form: `preprocessor__num__poly__degree`
# addresses the polynomial degree inside the numeric branch, and `model__*`
# addresses the XGBoost regressor step.
xgb_params_dist = dict(
    preprocessor__num__poly__degree=[1, 2],
    model__learning_rate=[0.05, 0.1, 0.5],
    model__n_estimators=[80, 100],
    model__max_depth=[3, 6, 8],
    model__colsample_bytree=[0.6, 0.8, 1.0],
    model__subsample=[0.6, 0.8, 1.0],
)

# Exhaustive grid search: every combination in the search space is evaluated
# with the shared CV splitter and scored by negative mean squared error.
# refit=True retrains the best combination once the search is done;
# training-fold scores are kept as well (return_train_score=True).
gs_xgb = GridSearchCV(
    pipe_xgb,
    xgb_params_dist,
    scoring='neg_mean_squared_error',
    cv=cv,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split, then report the winning combination.
print('Fitting GridSearchCV for xgboost (This may take a while)...')
gs_xgb.fit(x_train, y_train)

best_xgb_params = gs_xgb.best_params_  # best-scoring parameter combination
print('Best parameters: ', best_xgb_params)