In [64]:
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import make_column_selector,ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


## Import

In [65]:
# Load the housing classification data set.
# Forward slashes are used on purpose: the previous backslash-separated literal
# contained the invalid escape sequence "\h" and only resolved on Windows.
data_df = pd.read_csv("datasets/housing-classification-iter-6/housing-classification-iter6.csv")
data_df

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [66]:
# Quick fix: MasVnrType uses the string "None" where the other columns use "NA";
# rewrite it so that column follows the same convention.
is_none_veneer = data_df["MasVnrType"] == "None"
data_df.loc[is_none_veneer, "MasVnrType"] = "NA"

## Analyze the columns

### Drop unnecessary features

In [67]:
# Features left out of the model ("MoSold" is deliberately kept for now).
drop_col = [
    "Neighborhood",
    "Condition1",
    "Condition2",
    "Exterior1st",
    "Exterior2nd",
    "MiscVal",
    #"MoSold",
]
# errors="ignore" skips names that are not (or no longer) present in the frame.
data_df = data_df.drop(columns=drop_col, errors="ignore")

In [68]:
def find_columns(all_cat_col,which):
    """Select one group of categorical columns and the category order used to encode it.

    Parameters
    ----------
    all_cat_col : iterable of str
        Names of the categorical columns available in the data.
    which : str
        Group to return:
        * "quality"    - columns whose name ends in "Qual", "Qu", "QC" or "Cond".
        * "notquality" - columns that are neither quality columns nor keys of
                         the hand-written ordinal mapping below.
        * "ordinal"    - keys of the ordinal mapping that are present in
                         ``all_cat_col`` (in mapping order).

    Returns
    -------
    tuple
        ``(columns, categories)``. For "quality" ``categories`` is one flat
        list shared by every quality column; for "ordinal" it is a list of
        category lists aligned with ``columns``; for "notquality" it is empty.

    Raises
    ------
    ValueError
        If ``which`` is not one of the three supported group names.
    """
    # Hand-maintained category order (lowest -> highest) per ordinal column.
    dict_ordinal={
    "Utilities":["ELO","NoSeWa","NoSewr","AllPub"],
    "LandSlope":["Sev","Mod","Gtl"],
    #"HouseStyle":["1Story","1.5Fin","1.5Unf","2Story","2.5Fin","2.5Unf","SFoyer","SLvl"],
    "RoofMatl":["ClyTile","CompShg","Membran","Metal","Roll","Tar&Grv","WdShake","WdShngl"],   #Not sure
    "MasVnrType":["NA","BrkCmn","BrkFace","CBlock","Stone"],
    "Foundation":["BrkTil","CBlock","Slab","PConc","Wood","Stone"], #Not sure
    "BsmtExposure":["NA","No","Mn","Av","Gd"],
    "BsmtFinType1":["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
    "BsmtFinType2":["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
    "GarageFinish":["NA","Unf","RFn","Fin"],
    "PavedDrive":["N","P","Y"]
    #"Fence":[]
    }
    quality_suffixes = ("Qual", "Qu", "QC", "Cond")
    # Materialise once so any iterable (not only a list) can be scanned repeatedly.
    all_cat_col = list(all_cat_col)

    if which == "quality":
        quality_cols = [col for col in all_cat_col if col.endswith(quality_suffixes)]
        return quality_cols, ["NA","Po","Fa","TA","Gd","Ex"]

    if which == "notquality":
        plain_cols = [
            col for col in all_cat_col
            if not col.endswith(quality_suffixes) and col not in dict_ordinal
        ]
        return plain_cols, []

    if which == "ordinal":
        # Only return columns that really exist in the data; a hard-coded name
        # that is missing would make the downstream column selection fail.
        available = set(all_cat_col)
        ordinal_cols = [col for col in dict_ordinal if col in available]
        return ordinal_cols, [dict_ordinal[col] for col in ordinal_cols]

    # A misspelt selector used to yield two empty lists, silently removing
    # the whole column group from the model.
    raise ValueError(
        f"which must be 'quality', 'notquality' or 'ordinal', got {which!r}"
    )

In [69]:
# (rows, columns) of the frame after the feature drop above.
data_df.shape

(1460, 75)

## Split

In [70]:
# Separate the target from the features.
# NOTE: pop() removes "Expensive" from data_df in place, so this cell cannot be
# re-run without reloading the data first.
y = data_df.pop("Expensive")
X = data_df.copy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=555
)

## Pipeline

In [71]:
# Names of every non-numeric feature in the training split.
all_cat_cols = X_train.select_dtypes(exclude="number").columns.tolist()

# Quality-style columns and the single category scale they share.
qu_cats = find_columns(all_cat_cols, which="quality")
X_cat_qu_col, X_cat_qu_list = qu_cats

In [72]:
# Categorical columns that are neither quality-style nor in the ordinal mapping.
noqu_cats = find_columns(all_cat_cols, which="notquality")
X_cat_noqu_col = noqu_cats[0]

# Repeat the shared quality scale once per quality column.
category_array = [X_cat_qu_list for _ in X_cat_qu_col]

In [73]:
# Ordinal columns with their per-column category order.
# Note: the name `qu_cats` is reused here and now holds the ordinal group.
qu_cats = find_columns(all_cat_cols, which="ordinal")
X_cat_ord_col, X_cat_ord_list = qu_cats

In [74]:
# Numeric gaps: default imputation strategy here; the parameter search
# overrides it with "median" / "mean".
impute_num = SimpleImputer()

# Categorical gaps are filled with the literal "NA", which is also the first
# level of the quality scale passed to the ordinal encoder.
impute_cat = SimpleImputer(strategy="constant",fill_value="NA")
encode_cat = OneHotEncoder(sparse_output = False,handle_unknown="ignore") #TODO:Try min_freq
encode_cat_qu = OrdinalEncoder(categories=category_array)
encode_cat_ord = OrdinalEncoder(categories=X_cat_ord_list)

pca = PCA(n_components=0.96)
#model = DecisionTreeClassifier()
# Seed the forest as well: the split and the search are seeded with 555, but an
# unseeded forest still makes every score differ from run to run.
model = RandomForestClassifier(random_state=555)

In [75]:
# Route each categorical group to its encoder (columns are selected by name).
split_cats = ColumnTransformer(
    transformers=[
        ("encode_cat_qu", encode_cat_qu, X_cat_qu_col),
        ("encode_cat", encode_cat, X_cat_noqu_col),
        ("encode_cat_ord", encode_cat_ord, X_cat_ord_col),
    ]
)

# Categorical branch: fill gaps first, then encode per group.
pipe_cat_all = Pipeline(steps=[("impute_cat", impute_cat), ("split_cats", split_cats)])
# Numeric branch: imputation only.
pipe_num = Pipeline(steps=[("impute_num", impute_num)])

# Top-level split of the input frame by dtype.
numeric_selector = make_column_selector(dtype_include="number")
non_numeric_selector = make_column_selector(dtype_exclude="number")
split_num_cat = ColumnTransformer(
    transformers=[
        ("pipe_num", pipe_num, numeric_selector),
        ("pipe_cat_all", pipe_cat_all, non_numeric_selector),
    ]
)

pipe_all = Pipeline(
    steps=[
        ("split_num_cat", split_num_cat),
        #("pca", pca),
        ("model", model),
    ]
)
# Have the transformers emit DataFrames so the name-based column lists in
# split_cats still resolve after the imputation step.
pipe_all = pipe_all.set_output(transform="pandas")

In [76]:
# Search space; keys follow the nested <step>__<parameter> naming of pipe_all.
parameters = {
    "split_num_cat__pipe_num__impute_num__strategy": ["median", "mean"],
    "model__max_depth": range(3, 15),
    "model__min_samples_leaf": range(5, 40, 2),
    "model__min_samples_split": range(5, 40, 2),
    "model__criterion": ["gini"],
}

#### Random Search

In [77]:
# Randomized search: 100 sampled parameter combinations, 6-fold CV, scored on
# accuracy. error_score="raise" surfaces a failing fit instead of scoring it NaN.
find = RandomizedSearchCV(
    estimator=pipe_all,
    param_distributions=parameters,
    n_iter=100,
    scoring="accuracy",
    cv=6,
    verbose=1,
    random_state=555,
    error_score="raise",
)

In [79]:
# Run the randomized search on the training split (100 candidates x 6 folds).
find.fit(X_train,y_train)

Fitting 6 folds for each of 100 candidates, totalling 600 fits


In [78]:
# Best parameter combination found; the attribute only exists once
# find.fit(...) has been executed, so run the fit cell first.
find.best_params_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [38]:
# Cross-validated accuracy of the best candidate from the randomized search.
find.best_score_

0.855300026434047

In [39]:
# TODO: try other scoring methods
# Accuracy of the tuned pipeline on the data it was trained on.
find_train_pred = find.predict(X_train)
accuracy_score(y_train, find_train_pred)

0.8578767123287672

In [40]:
# Accuracy of the tuned pipeline on the held-out test split.
find_test_pred = find.predict(X_test)
accuracy_score(y_test, find_test_pred)

0.8561643835616438

#### GridSearch

In [20]:
# Exhaustive search over the same parameter grid. The grid expands to several
# thousand candidates, each fitted once per fold, so the fits are spread over
# all available cores (n_jobs=-1) instead of running one after another.
search = GridSearchCV(
    pipe_all,
    parameters,
    scoring ='accuracy',
    cv = 6,
    verbose= 1,
    n_jobs=-1
)

In [21]:
# Fit every grid candidate on the training split. This is expensive: the
# recorded run reported 7776 candidates x 6 folds and was interrupted.
search.fit(X_train,y_train)

Fitting 6 folds for each of 7776 candidates, totalling 46656 fits


KeyboardInterrupt: 

In [None]:
# Best parameter combination from the grid search (requires a completed fit).
search.best_params_

In [None]:
# Accuracy of the grid-search winner on the training split.
search_train_pred = search.predict(X_train)
accuracy_score(y_train, search_train_pred)

In [None]:
# Accuracy of the grid-search winner on the held-out test split.
search_test_pred = search.predict(X_test)
accuracy_score(y_test, search_test_pred)

### Manual