In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import set_config
set_config(transform_output='pandas')

In [2]:
data_final = pd.read_csv("houses_full.csv")

# Analysis

## Columns

Need to look at the columns which couldn't be expressed with Numbers

In [3]:
data_final = data_final.set_index("Id")

In [4]:
data_final.isna().sum()

LotArea             0
LotFrontage       259
TotalBsmtSF         0
BedroomAbvGr        0
Fireplaces          0
                 ... 
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
Length: 80, dtype: int64

See if there any redundant Columns through their NaN values

In [5]:
data_final.isna().sum().sort_values(ascending=False)/len(data_final)*100

PoolQC           99.520548
MiscFeature      96.301370
Alley            93.767123
Fence            80.753425
MasVnrType       59.726027
                   ...    
HeatingQC         0.000000
Functional        0.000000
PavedDrive        0.000000
SaleType          0.000000
SaleCondition     0.000000
Length: 80, dtype: float64

In [6]:
data_final.loc[data_final["Fence"].isna(), "Fence"] = "No Fence"

In [7]:
data_final["LotFrontage"].isna().sum()

np.int64(259)

In [8]:
# X and y creation
x = data_final.drop(columns=["PoolQC", "MiscFeature", "Alley", "FireplaceQu", "RoofMatl", "Exterior1st", 
                             "Exterior2nd", "SaleType", "SaleCondition", "Utilities"])
y = x.pop("Expensive")

In [9]:
x.loc[x["Street"] == "Pave", "Street"] = 1
x.loc[x["Street"] == "Grvl", "Street"] = 0

x.loc[x["CentralAir"] == "Y", "CentralAir"] = 1
x.loc[x["CentralAir"] == "N", "CentralAir"] = 0

In [10]:
x.isna().sum().sort_values(ascending=False)/len(data_final)*100

MasVnrType     59.726027
LotFrontage    17.739726
GarageCond      5.547945
GarageType      5.547945
GarageQual      5.547945
                 ...    
HouseStyle      0.000000
Condition2      0.000000
Functional      0.000000
PavedDrive      0.000000
Fence           0.000000
Length: 69, dtype: float64

In [11]:
cat_features = x.select_dtypes(exclude="number").columns
cat_features

Index(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'LotShape',
       'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'MasVnrType', 'BsmtFinType2',
       'HeatingQC', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'Fence'],
      dtype='object')

## Preprocessor

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42, stratify=y)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1168, 69)
(292, 69)
(1168,)
(292,)


In [13]:
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [14]:
knn_imputer = KNNImputer(n_neighbors=2)
minmax = MinMaxScaler()
standard = StandardScaler()

num_pipe = make_pipeline(knn_imputer, minmax)

num_feat = x_train.select_dtypes(include=['int64', 'float64']).columns

make_column_transformer(num_pipe, num_feat)

0,1,2
,transformers,"[('knnimputer', ...), ('LotArea', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,2
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False


In [15]:
ordinal_imputer = SimpleImputer(strategy='constant', fill_value='missing')

feat_ordinal_dict = {
    # Considers "missing" as worse than poor=does not have effect
    "BsmtCond": ['missing', 'Po', 'Fa', 'TA', 'Gd'], #-> 0, 1, 2, 3, 4
    #otherwise OrdinalEncoder gives values based on order of appearance
    "BsmtExposure": ['missing', 'No', 'Mn', 'Av', 'Gd'],
    "ExterQual": ['missing', 'Po', 'Fa', 'TA', 'Gd', "Ex"],
    "ExterCond": ['missing', 'Po', 'Fa', 'TA', 'Gd', "Ex"],
    "BsmtQual": ['missing', 'Po', 'Fa', 'TA', 'Gd', "Ex"],
    "BsmtFinType1": ['missing', 'Unf', 'LwQ', 'Rec', 'BLQ', "ALQ", "GLQ"],
    "BsmtFinType2": ['missing', 'Unf', 'LwQ', 'Rec', 'BLQ', "ALQ", "GLQ"],
    "KitchenQual": ['missing', 'Po', 'Fa', 'TA', 'Gd', "Ex"],
    "LotShape": ["missing", "IR3", "IR2", "IR1", "Reg"],
    "HeatingQC": ["missing", "Po", "Fa", "TA", "Gd", "Ex"],
    "Fence": ["missing", "MnWw", "GdWo", "MnPrv", "GdPrv"],
    "Functional": ['missing', 'Sal', 'Sev', 'Maj2', 'Maj1', "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["missing", "Unf", "RFn", "Fin"],
    "GarageQual": ["missing", "Po", "Fa", "TA", "Gd", "Ex"],
    "GarageCond": ["missing", "Po", "Fa", "TA", "Gd", "Ex"]
    }

feat_ordinal = sorted(feat_ordinal_dict.keys())
feat_ordinal_values = [feat_ordinal_dict[i] for i in feat_ordinal]

ord_encoder = OrdinalEncoder(categories=feat_ordinal_values,
                             dtype= np.int64,
                             handle_unknown="use_encoded_value",
                             unknown_value=-1
                             )

ordinal_pipe = make_pipeline(ordinal_imputer, ord_encoder)

make_column_transformer((ordinal_pipe, feat_ordinal))

0,1,2
,transformers,"[('pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['missing', 'Po', ...], ['missing', 'No', ...], ...]"
,dtype,<class 'numpy.int64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,


In [16]:
make_column_transformer((ordinal_pipe, feat_ordinal),
                        (num_pipe, num_feat))

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['missing', 'Po', ...], ['missing', 'No', ...], ...]"
,dtype,<class 'numpy.int64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,n_neighbors,2
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [17]:
ohe_imputer = SimpleImputer(strategy='most_frequent')

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

ohe_pipe = make_pipeline(ohe_imputer, ohe)

ohe_feat = list(set(x_train.columns) - set(num_feat) - set(feat_ordinal))

make_column_transformer((ohe_pipe, ohe_feat))

0,1,2
,transformers,"[('pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [18]:
list1 = list(num_feat)

In [19]:
preproc_pipeline = make_column_transformer(( num_pipe, list1), #num pipe
    (ordinal_pipe , feat_ordinal), #Ordinal pipe
    (ohe_pipe, ohe_feat) #ohe pipe
    )
preproc_pipeline

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,2
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['missing', 'Po', ...], ['missing', 'No', ...], ...]"
,dtype,<class 'numpy.int64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [20]:
preproc_pipeline.fit_transform(x_train, y_train)

Unnamed: 0_level_0,pipeline-1__LotArea,pipeline-1__LotFrontage,pipeline-1__TotalBsmtSF,pipeline-1__BedroomAbvGr,pipeline-1__Fireplaces,pipeline-1__PoolArea,pipeline-1__GarageCars,pipeline-1__WoodDeckSF,pipeline-1__ScreenPorch,pipeline-1__MSSubClass,...,pipeline-3__Condition2_PosN,pipeline-3__Condition2_RRAe,pipeline-3__Condition2_RRAn,pipeline-3__Condition2_RRNn,pipeline-3__Foundation_BrkTil,pipeline-3__Foundation_CBlock,pipeline-3__Foundation_PConc,pipeline-3__Foundation_Slab,pipeline-3__Foundation_Stone,pipeline-3__Foundation_Wood
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1212,0.050639,0.448630,0.091489,0.500,0.000000,0.0,0.50,0.379230,0.000000,0.176471,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
554,0.034948,0.157534,0.000000,0.250,0.000000,0.0,0.50,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1194,0.014957,0.042808,0.200327,0.250,0.000000,0.0,0.50,0.000000,0.000000,0.588235,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
303,0.057978,0.332192,0.252209,0.375,0.333333,0.0,0.75,0.546091,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1069,0.012452,0.071918,0.154173,0.625,0.333333,0.0,0.50,0.849475,0.000000,0.823529,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.025801,0.183219,0.243535,0.125,0.000000,0.0,0.50,0.000000,0.291667,0.588235,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
808,0.093875,0.421233,0.216694,0.375,0.333333,0.0,0.50,0.000000,0.000000,0.294118,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
547,0.034761,0.167808,0.174304,0.375,0.333333,0.0,0.50,0.000000,0.000000,0.176471,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.042450,0.239726,0.181178,0.375,0.666667,0.0,0.50,0.274212,0.000000,0.235294,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [21]:
preproc_pipeline.get_params()

{'force_int_remainder_cols': 'deprecated',
 'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('pipeline-1',
   Pipeline(steps=[('knnimputer', KNNImputer(n_neighbors=2)),
                   ('minmaxscaler', MinMaxScaler())]),
   ['LotArea',
    'LotFrontage',
    'TotalBsmtSF',
    'BedroomAbvGr',
    'Fireplaces',
    'PoolArea',
    'GarageCars',
    'WoodDeckSF',
    'ScreenPorch',
    'MSSubClass',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'GarageYrBlt',
    'GarageArea',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'MiscVal',
    'MoSold',
    'YrSold']),
  ('pipeline-2',
   Pipeline(steps=[('simpleimputer',
            

## DecisionTree/DecisionForest

In [22]:
forest = RandomForestClassifier()

In [23]:
forest.estimator_params

('criterion',
 'max_depth',
 'min_samples_split',
 'min_samples_leaf',
 'min_weight_fraction_leaf',
 'max_features',
 'max_leaf_nodes',
 'min_impurity_decrease',
 'random_state',
 'ccp_alpha',
 'monotonic_cst')

In [24]:
tree = DecisionTreeClassifier(max_depth = 6, 
                              min_samples_leaf = 3)

forest = RandomForestClassifier(min_samples_leaf=2,
                                min_samples_split=9,
                                random_state=123)

pipe = make_pipeline(preproc_pipeline, forest)

pipe.fit(x_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('randomforestclassifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,2
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['missing', 'Po', ...], ['missing', 'No', ...], ...]"
,dtype,<class 'numpy.int64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,9
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## GridSearchCV

In [25]:
x_train

Unnamed: 0_level_0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,BsmtFinType2,HeatingQC,Electrical,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,Fence
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1212,12134,152.0,559,4,0,0,2,325,0,RL,...,Unf,Gd,SBrkr,Typ,Basment,RFn,TA,TA,Y,No Fence
554,8777,67.0,0,2,0,0,2,0,0,RL,...,,Ex,SBrkr,Typ,Detchd,Fin,TA,TA,N,MnPrv
1194,4500,,1224,2,0,0,2,0,0,RM,...,Unf,Ex,SBrkr,Typ,Attchd,Fin,TA,TA,Y,No Fence
303,13704,118.0,1541,3,1,0,3,468,0,RL,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,No Fence
1069,3964,42.0,942,5,1,0,2,728,0,RM,...,Unf,Gd,SBrkr,Maj1,Attchd,Fin,TA,TA,Y,GdPrv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,6820,,1488,1,0,0,2,0,140,RL,...,BLQ,TA,SBrkr,Typ,Attchd,RFn,TA,TA,Y,No Fence
808,21384,144.0,1324,3,1,0,2,0,0,RL,...,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,No Fence
547,8737,70.0,1065,3,1,0,2,0,0,RL,...,Unf,Ex,FuseA,Typ,Detchd,Unf,TA,TA,Y,No Fence
8,10382,,1107,3,2,0,2,235,0,RL,...,BLQ,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,No Fence


In [26]:
# Define a dictionary of hyperparameters to tune for the decision tree model
param_grid = {
    "randomforestclassifier__min_samples_leaf": range(2, 14, 2),
    "randomforestclassifier__min_samples_split": range(3, 12, 2)
}

In [27]:
search = GridSearchCV(pipe, # you have defined this beforehand
                      param_grid, # your parameter grid
                      cv=5, # the value for K in K-fold Cross Validation
                      scoring='accuracy', # the performance metric to use,
                      verbose=1) # we want informative outputs during the training process, try changing it to 2 and see what happens

search.fit(x_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


0,1,2
,estimator,Pipeline(step..._state=123))])
,param_grid,"{'randomforestclassifier__min_samples_leaf': range(2, 14, 2), 'randomforestclassifier__min_samples_split': range(3, 12, 2)}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,2
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['missing', 'Po', ...], ['missing', 'No', ...], ...]"
,dtype,<class 'numpy.int64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,3
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
search.best_params_

{'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__min_samples_split': 3}

## CrossValidate

In [29]:
cross_val_score(pipe, x_train, y_train, cv=5).mean()

np.float64(0.9528997468911632)

In [30]:
cross_val_score(pipe, x_test, y_test, cv=5).mean()

np.float64(0.9418468731735826)

## Metric Scores

In [31]:
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    balanced_accuracy_score,
    cohen_kappa_score
)

# Function to get the scores for our model(s)
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Specificity": recall_score(y_test, y_pred, pos_label=0),
        "F1 Score": f1_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Cohen's Kappa": cohen_kappa_score(y_test, y_pred)
    }
    return scores

# Create an empty DataFrame to store model evaluation results
model_scores_df = pd.DataFrame(columns=[
    "Model", "Accuracy", "Recall", "Precision",
    "Specificity", "F1 Score", "Balanced Accuracy", "Cohen's Kappa"
])

# Evaluate the Decision Tree model
dt_scores = evaluate_model(search, x_test, y_test)
dt_scores["Model"] = "Decision Forest"

# Convert the dictionary to a Series matching the DataFrame columns, then assign as a new row
model_scores_df.loc[len(model_scores_df)] = pd.Series(dt_scores, index=model_scores_df.columns)

# Display the DataFrame
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Decision Forest,0.945205,0.697674,0.909091,0.987952,0.789474,0.842813,0.758603


## Result

In [32]:
x_new = pd.read_csv("test.csv")

In [33]:
testing_data = x_new.set_index('Id')

In [34]:
testing_data["Expensive"] = search.predict(testing_data)

In [35]:
pred = testing_data[["Expensive"]]

In [36]:
pred.to_csv("Submission.csv")