In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
data = pd.read_csv("../../extracts/raw/train.csv").set_index('Id')
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


# Preprocessing

In [3]:
numerical_continous_cols = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                            '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF',
                            'OpenPorchSF', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']

target_column =['SalePrice']

numerical_discrete_cols = ['MSSubClass','OverallQual','OverallCond','YearBuilt','YearRemodAdd',
                           'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd',
                           'Fireplaces','GarageYrBlt','GarageCars','EnclosedPorch','MoSold','YrSold']

obj_cols = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
            'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 
            'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
            'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 
            'SaleType', 'SaleCondition']
len(numerical_continous_cols) + len(numerical_discrete_cols) + len(obj_cols) + len(target_column)

80

## Imputer

In [4]:
import warnings
warnings.filterwarnings('ignore')

for col in obj_cols:
    data[col].fillna('None', inplace=True)
    
for col in numerical_continous_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

    
for col in numerical_discrete_cols:
    data[col].fillna(data[col].median(), inplace=True)
    
print("Done")

Done


# Label Encoding - categorical column

In [5]:
label_encoders = {}

for col in obj_cols:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder  

data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,3,65.0,8450,1,1,3,3,0,4,...,0,3,4,1,0,2,2008,8,4,208500
2,20,3,80.0,9600,1,1,3,3,0,2,...,0,3,4,1,0,5,2007,8,4,181500
3,60,3,68.0,11250,1,1,0,3,0,4,...,0,3,4,1,0,9,2008,8,4,223500
4,70,3,60.0,9550,1,1,0,3,0,0,...,0,3,4,1,0,2,2006,8,0,140000
5,60,3,84.0,14260,1,1,0,3,0,2,...,0,3,4,1,0,12,2008,8,4,250000


## Train Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('SalePrice', axis=1), 
                                                    data['SalePrice'], 
                                                    test_size=0.2, shuffle=True,
                                                    random_state=42)

# Standard Scaler

In [7]:
scaler = StandardScaler()

X_train[numerical_continous_cols] = scaler.fit_transform(X_train[numerical_continous_cols])
X_test[numerical_continous_cols] = scaler.transform(X_test[numerical_continous_cols])

X_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
255,20,3,0.069172,-0.212896,1,1,3,3,0,4,...,-0.275838,-0.070993,3,4,1,-0.09274,6,2010,8,4
1067,60,3,-0.413046,-0.265245,1,1,0,3,0,4,...,-0.275838,-0.070993,3,4,1,-0.09274,5,2009,8,4
639,30,3,-0.062342,-0.177841,1,1,3,3,0,4,...,-0.275838,-0.070993,3,2,1,-0.09274,5,2008,8,4
800,50,3,-0.369208,-0.324474,1,1,3,3,0,0,...,-0.275838,-0.070993,3,2,1,-0.09274,6,2007,8,4
381,50,3,-0.807588,-0.529035,1,2,3,3,0,4,...,-0.275838,-0.070993,3,4,1,-0.09274,5,2010,8,4


# Modelling - baselines

In [8]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import pickle


models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Regression': SVR(),
    'XGBoost': xgb.XGBRegressor(),
    'LightGBM': lgb.LGBMRegressor(verbose=0),
    'CatBoost': CatBoostRegressor(silent=True),
}

results_list = []

def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # with open(f"{model_name.replace(' ', '_')}.pkl", 'wb') as model_file:
    #     pickle.dump(model, model_file)

    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)  
    mae = mean_absolute_error(y_test, y_pred)
    n = len(y_test)
    p = X_test.shape[1]  

    r2_adj = adjusted_r2(r2, n, p)

    results_list.append({
        'Model': model_name,
        'R^2': r2,
        'Adjusted R^2': r2_adj,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae
    })

results_df = pd.DataFrame(results_list)

results_df

Unnamed: 0,Model,R^2,Adjusted R^2,MSE,RMSE,MAE
0,Linear Regression,0.843204,0.784776,1202673000.0,34679.579288,21774.167884
1,Ridge Regression,0.844981,0.787215,1189045000.0,34482.532487,21668.374557
2,Lasso Regression,0.843229,0.784809,1202488000.0,34676.906112,21770.307325
3,Random Forest,0.889881,0.848846,844651100.0,29062.881738,17567.274007
4,Gradient Boosting,0.890246,0.849346,841852800.0,29014.698981,17554.64622
5,Support Vector Regression,-0.02497,-0.406916,7861853000.0,88667.092353,59568.142841
6,XGBoost,0.911803,0.878937,676500100.0,26009.614578,16987.127047
7,LightGBM,0.891904,0.851622,829134900.0,28794.702482,16511.589169
8,CatBoost,0.905084,0.869714,728037800.0,26982.175617,16233.69269
