# load data

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [57]:
test_id = test_data['Id']

# EDA

In [37]:
train_data['OverallQual'].kurt()

0.09629277835615113

In [None]:
# for cat: look for unique
# for num: look for skew (-1 to 1) and kurt (-1 to 1)
from pandas.api.types import is_numeric_dtype
def unistats(df):
    df_results = pd.DataFrame(columns = ['count','missing','unique','dtype','numeric','mode','mean','min','25%','50%','75%','max','std','skew','kurt'])
    for col in df.columns:
        if is_numeric_dtype(df[col]):
            df_results.loc[col] = [df[col].count(),df[col].isnull().sum(),len(df[col].unique()),df[col].dtype,1,df[col].mode()[0],df[col].mean(),df[col].min(),df[col].quantile(.25),df[col].quantile(.5),df[col].quantile(.75),df[col].max(),df[col].std(),df[col].skew(),df[col].kurt()]
        else:
            df_results.loc[col] = [df[col].count(),df[col].isnull().sum(),len(df[col].unique()),df[col].dtype,0,df[col].mode()[0],'-','-','-','-','-','-','-','-','-']
        
    return df_results.sort_values(by=['numeric','skew','kurt','unique'],ascending=False)

In [54]:
pd.set_option('display.max_rows', None)    # Show all rows
pd.set_option('display.max_columns', None) # Show all columns

summary_stats = unistats(train_data)
summary_stats

Unnamed: 0,count,missing,unique,dtype,numeric,mode,mean,min,25%,50%,75%,max,std,skew,kurt
MiscVal,1460,0,21,int64,1,0,43.489041,0,0.0,0.0,0.0,15500,496.123024,24.476794,701.003342
PoolArea,1460,0,8,int64,1,0,2.758904,0,0.0,0.0,0.0,738,40.177307,14.828374,223.268499
LotArea,1460,0,1073,int64,1,7200,10516.828082,1300,7553.5,9478.5,11601.5,215245,9981.264932,12.207688,203.243271
3SsnPorch,1460,0,20,int64,1,0,3.409589,0,0.0,0.0,0.0,508,29.317331,10.304342,123.662379
LowQualFinSF,1460,0,24,int64,1,0,5.844521,0,0.0,0.0,0.0,572,48.623081,9.011341,83.234817
KitchenAbvGr,1460,0,4,int64,1,1,1.046575,0,1.0,1.0,1.0,3,0.220338,4.488397,21.532404
BsmtFinSF2,1460,0,144,int64,1,0,46.549315,0,0.0,0.0,0.0,1474,161.319273,4.255261,20.113338
ScreenPorch,1460,0,76,int64,1,0,15.060959,0,0.0,0.0,0.0,480,55.757415,4.122214,18.439068
BsmtHalfBath,1460,0,3,int64,1,0,0.057534,0,0.0,0.0,0.0,2,0.238753,4.103403,16.396642
EnclosedPorch,1460,0,120,int64,1,0,21.95411,0,0.0,0.0,0.0,552,61.119149,3.089872,10.430766


In [60]:
# calc reasonr corr for numeric and numeric and anova for numeric and 
from scipy import stats
from pandas.api.types import is_numeric_dtype
import pandas as pd

def bistats(df, label):
    df_results = pd.DataFrame(columns=['method', 'sign', 'effect', 'p-value'])
    
    for col in df.columns:
        if col == label:
            continue  # skip target itself
        if df[col].isnull().sum() > 0:
            continue  # skip cols with nulls

        # Numeric vs numeric → Pearson correlation
        if is_numeric_dtype(df[col]) and is_numeric_dtype(df[label]):
            r, p = stats.pearsonr(df[col], df[label])
            sign = '+' if r > 0 else '-'
            df_results.loc[col] = ['pearsonr', sign, abs(r), p]

        # Categorical vs numeric → ANOVA
        elif not is_numeric_dtype(df[col]) and is_numeric_dtype(df[label]):
            groups = [df[label][df[col] == cat] for cat in df[col].unique()]
            f_stat, p = stats.f_oneway(*groups)
            sign = 'n/a'  # no direction for ANOVA
            df_results.loc[col] = ['anova', sign, f_stat, p]

        # Numeric vs categorical → ANOVA
        elif is_numeric_dtype(df[col]) and not is_numeric_dtype(df[label]):
            groups = [df[col][df[label] == cat] for cat in df[label].unique()]
            f_stat, p = stats.f_oneway(*groups)
            sign = 'n/a'
            df_results.loc[col] = ['anova', sign, f_stat, p]

        else:
            df_results.loc[col] = ['n/a', 'n/a', 'n/a', 'n/a']

    return df_results.sort_values(by=['method','effect'],ascending=False)

bistats_summary = bistats(train_data,'SalePrice')
bistats_summary

Unnamed: 0,method,sign,effect,p-value
OverallQual,pearsonr,+,0.790982,2.185675e-313
GrLivArea,pearsonr,+,0.708624,4.518034e-223
GarageCars,pearsonr,+,0.640409,2.498644e-169
GarageArea,pearsonr,+,0.623431,5.265038e-158
TotalBsmtSF,pearsonr,+,0.613581,9.484229e-152
1stFlrSF,pearsonr,+,0.605852,5.394711e-147
FullBath,pearsonr,+,0.560664,1.23647e-121
TotRmsAbvGrd,pearsonr,+,0.533723,2.772281e-108
YearBuilt,pearsonr,+,0.522897,2.990229e-103
YearRemodAdd,pearsonr,+,0.507101,3.164948e-96


# Preproccess Data

In [9]:
# profile = ProfileReport(train_data, title="YData Profiling Report")

In [10]:
# profile

In [36]:
print(len(train_data.columns))
print(len(train_data))
print(len(test_data))

81
1460
1459


In [5]:
missing_columns = train_data.isna().sum()[train_data.isna().sum() > 0].index.to_list()
print(train_data.isna().sum()[train_data.isna().sum() > 0])

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [63]:
# remove cols that has alot of missing values 
cols_to_be_removed = train_data.isna().sum()[train_data.isna().sum() > 800].index.to_list()
print(cols_to_be_removed)
train_data = train_data.drop(cols_to_be_removed,axis=1)
test_data = test_data.drop(cols_to_be_removed,axis=1)

['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']


In [64]:
# impute numberic cols with mean and cat cols with mode
numeric_df = train_data.select_dtypes(include='number')
cat_df = train_data.select_dtypes(include=['category','object'])
numeric_df_test = test_data.select_dtypes(include='number')
cat_df_test = test_data.select_dtypes(include=['category','object'])
mean = numeric_df.mean()
mode = cat_df.mode().iloc[0]
numeric_df = numeric_df.fillna(mean)
cat_df = cat_df.fillna(mode)
numeric_df_test = numeric_df_test.fillna(mean)
cat_df_test = cat_df_test.fillna(mode)
cat_df

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Gd,Typ,Gd,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,TA,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,...,TA,Typ,TA,Attchd,RFn,TA,TA,Y,WD,Normal
1456,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,...,TA,Min1,TA,Attchd,Unf,TA,TA,Y,WD,Normal
1457,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,...,Gd,Typ,Gd,Attchd,RFn,TA,TA,Y,WD,Normal
1458,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,Gd,Typ,Gd,Attchd,Unf,TA,TA,Y,WD,Normal


In [65]:
# convert cat to numeric
# use label encoding because i will use tree-based models
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
for col in cat_df.columns:
    cat_df[col] = labelencoder.fit_transform(cat_df[col])
    cat_df_test[col] = labelencoder.transform(cat_df_test[col])

train_data = pd.concat([numeric_df,cat_df],axis=1)
test_data = pd.concat([numeric_df_test,cat_df_test],axis=1)
train_data


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,2,6,2,1,1,4,4,2,8,4
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,3,6,4,1,1,4,4,2,8,4
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,2,6,4,1,1,4,4,2,8,4
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,2,6,2,5,2,4,4,2,8,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,2,6,4,1,1,4,4,2,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,3,6,4,1,1,4,4,2,8,4
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,3,2,4,1,2,4,4,2,8,4
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,2,6,2,1,1,4,4,2,8,4
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,2,6,2,1,2,4,4,2,8,4


In [66]:
print(train_data['3SsnPorch'].to_list().count(0))
print(train_data['ScreenPorch'].to_list().count(0))
print(train_data['PoolArea'].to_list().count(0))
print(train_data['MiscVal'].to_list().count(0))


1436
1344
1453
1408


In [67]:
# remove non-changing cols
cols_to_be_removed =  ['3SsnPorch','ScreenPorch','PoolArea','MiscVal','Utilities','Street','Condition2','LandSlope','Id']
train_data = train_data.drop(cols_to_be_removed,axis=1)
test_data = test_data.drop(cols_to_be_removed,axis=1)

# build the model

In [49]:
X_train = train_data.drop(['SalePrice'],axis=1)
y_train = train_data['SalePrice']
# X_test = test_data.drop(['SalePrice'],axis=1)
# y_test = test_data['SalePrice']

In [50]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
random_forest_regressor = RandomForestRegressor(n_estimators=15000,random_state=42)   

In [51]:
from scipy.stats import randint,uniform
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2']
}
random_search = RandomizedSearchCV(estimator=random_forest_regressor,param_distributions=param_dist,cv=5,n_jobs=-1,n_iter=100,verbose=1,scoring='neg_mean_squared_error',return_train_score=True)
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [None]:
results = pd.DataFrame(random_search.cv_results_)

In [None]:
results['mean_test_score']

0    -9.606525e+08
1    -9.205399e+08
2    -8.749423e+08
3    -8.903042e+08
4    -8.813408e+08
          ...     
95   -9.900101e+08
96   -9.275123e+08
97   -9.574696e+08
98   -9.273958e+08
99   -9.373880e+08
Name: mean_test_score, Length: 100, dtype: float64

In [None]:
random_forest_regressor.fit(X_train,y_train)
rf_pred = random_forest_regressor.predict(X_test)
print(root_mean_squared_error(y_test,rf_pred))

KeyboardInterrupt: 

In [None]:
pred = random_search.best_estimator_.predict(X_test)
print(root_mean_squared_error(y_test,pred))

10412.3055115014


# gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train,y_train)
# y_pred = gbr.predict(X_test)
# print(root_mean_squared_error(y_test,y_pred))

15041.309312278318


In [52]:
from scipy.stats import randint,uniform

param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'subsample': uniform(0.7, 0.3),  # Range: 0.7 to 1.0
    'max_features': ['auto', 'sqrt', 'log2']
}


random_search = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42,
    return_train_score=True
)
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


80 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
34 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\PC\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\PC\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "c:\Users\PC\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\PC\AppData\Local\Programs\Python\Python311\Lib\site-packag

0,1,2
,estimator,GradientBoost...ndom_state=42)
,param_distributions,"{'learning_rate': <scipy.stats....0012E71ED2310>, 'max_depth': <scipy.stats....0012E72044F50>, 'max_features': ['auto', 'sqrt', ...], 'min_samples_leaf': <scipy.stats....0012E720440D0>, ...}"
,n_iter,50
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,loss,'squared_error'
,learning_rate,0.11128455142108838
,n_estimators,195
,subsample,0.8674306006052023
,criterion,'friedman_mse'
,min_samples_split,9
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_depth,9
,min_impurity_decrease,0.0


In [61]:
len(test_data)

1459

In [68]:
pred = random_search.best_estimator_.predict(test_data)
# print(root_mean_squared_error(y_test,pred))

In [69]:
len(pred)

1459

In [70]:
pd.DataFrame({"Id":test_id,"SalePrice":pred}).to_csv("predictions.csv", index=False)