In [337]:
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import make_column_selector,ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


## Import

In [338]:
data_df = pd.read_csv("datasets\housing-classification-iter-6\housing-classification-iter6.csv")
data_df

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [339]:
# check if some columns are identical or almost identical values with another column
for col in list(data_df.columns):
    for col2 in list(data_df.columns):
        if (((data_df[col] == data_df[col2]).sum() >= (data_df.shape[0]-50)) and (col != col2)):
            print(col,col2)


PoolArea LowQualFinSF
PoolArea 3SsnPorch
LowQualFinSF PoolArea
LowQualFinSF 3SsnPorch
3SsnPorch PoolArea
3SsnPorch LowQualFinSF


In [310]:
# Check if some numerical columns has very little variations Empty or
for col in list(data_df.columns):
    if (data_df[col].dtype != "object") and (len(data_df[col].unique()) <= 10):
        print (col)

BedroomAbvGr
Fireplaces
PoolArea
GarageCars
Expensive
OverallQual
OverallCond
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
KitchenAbvGr
YrSold


In [312]:
# List of columns comming from the text file
text_l = ["MSSubClass",
"MSZoning",
"LotFrontage",
"LotArea",
"Street",
"Alley",
"LotShape",
"LandContour",
"Utilities",
"LotConfig",
"LandSlope",
"Neighborhood",
"Condition1",
"Condition2",
"BldgType",
"HouseStyle",
"OverallQual",
"OverallCond",
"YearBuilt",
"YearRemodAdd",
"RoofStyle",
"RoofMatl",
"Exterior1st",
"Exterior2nd",
"MasVnrType",
"MasVnrArea",
"ExterQual",
"ExterCond",
"Foundation",
"BsmtQual",
"BsmtCond",
"BsmtExposure",
"BsmtFinType1",
"BsmtFinSF1",
"BsmtFinType2",
"BsmtFinSF2",
"BsmtUnfSF",
"TotalBsmtSF",
"Heating",
"HeatingQC",
"CentralAir",
"Electrical",
"1stFlrSF",
"2ndFlrSF",
"LowQualFinSF",
"GrLivArea",
"BsmtFullBath",
"BsmtHalfBath",
"FullBath",
"HalfBath",
"BedroomAbvGr",
"KitchenAbvGr",
"KitchenQual",
"TotRmsAbvGrd",
"Functional",
"Fireplaces",
"FireplaceQu",
"GarageType",
"GarageYrBlt",
"GarageFinish",
"GarageCars",
"GarageArea",
"GarageQual",
"GarageCond",
"PavedDrive",
"WoodDeckSF",
"OpenPorchSF",
"EnclosedPorch",
"ScreenPorch",
"PoolArea",
"PoolQC",
"Fence",
"MiscFeature",
"MiscVal",
"MoSold",
"YrSold",
"SaleType",
"SaleCondition"]

In [313]:
# check for columns that exist in the data but not in the documentation file
for col in list(data_df.columns):
    if col not in text_l:
        print(col)

Expensive
3SsnPorch
Id


In [314]:
data_df["3SsnPorch"].unique()

array([  0, 320, 407, 130, 180, 168, 140, 508, 238, 245, 196, 144, 182,
       162,  23, 216,  96, 153, 290, 304], dtype=int64)

## Analyze the columns

In [359]:
def prepare_data(df):
    drop_col =[
        "Id",
        "Condition1",
        "Condition2",
        "Exterior1st",
        "Exterior2nd",
        "MiscVal",
        #"3SsnPorch"
        #"MoSold"
        ]
    for col in drop_col:
        if col in df.columns:
            df = df.drop(columns=col)
            
    #Dirty solution to replace "None" with "NA" in one column to be the same as others
    df.loc[df.MasVnrType == "None","MasVnrType"] = "NA"
    
    return df

In [317]:
def find_columns(all_cat_col,which):
    dict_ordinal={
    "Utilities":["NA","ELO","NoSeWa","NoSewr","AllPub"],
    "LandSlope":["NA","Sev","Mod","Gtl"],
    #"HouseStyle":["1Story","1.5Fin","1.5Unf","2Story","2.5Fin","2.5Unf","SFoyer","SLvl"],
    "RoofMatl":["NA","ClyTile","CompShg","Membran","Metal","Roll","Tar&Grv","WdShake","WdShngl"],   #Not sure
    "MasVnrType":["NA","BrkCmn","BrkFace","CBlock","Stone"],
    "Foundation":["NA","BrkTil","CBlock","Slab","PConc","Wood","Stone"], #Not sure
    "BsmtExposure":["NA","No","Mn","Av","Gd"],
    "BsmtFinType1":["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
    "BsmtFinType2":["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],
    "GarageFinish":["NA","Unf","RFn","Fin"],
    "PavedDrive":["NA","N","P","Y"]
    #"Fence":[]
    }
    cat = []
    no_qu_col=[]
    qu_col=[]
    choose_col=[]
    for col in all_cat_col:
            if (col.endswith("Qual")
            or col.endswith("Qu") 
            or col.endswith("QC") 
            or col.endswith("Cond")):
                qu_col.append(col)
            elif col not in list(dict_ordinal.keys()):
                no_qu_col.append(col)
        
    if which == "quality":
        choose_col = qu_col
        cat=["NA","Po","Fa","TA","Gd","Ex"]

    elif which == "notquality":
        choose_col = no_qu_col
    
    elif which == "ordinal":
        choose_col= list(dict_ordinal.keys())
        cat = list(dict_ordinal.values())
        
    return choose_col,cat

In [None]:
data_df = prepare_data(data_df)

In [318]:
data_df.shape

(1460, 75)

## Split

In [319]:
y = data_df.pop("Expensive")
X = data_df.copy()

X_train,X_test,y_train,y_test = train_test_split(X,y
                                                 ,test_size=0.2
                                                 ,random_state=555
                                                 )

## Pipeline

In [320]:
all_cat_cols = list(X_train.select_dtypes(exclude="number"))
qu_cats = find_columns(all_cat_cols,which="quality")
X_cat_qu_col = qu_cats[0]
X_cat_qu_list = qu_cats[1]

In [321]:
noqu_cats = find_columns(all_cat_cols,which="notquality")
X_cat_noqu_col = noqu_cats[0]
category_array=[]
for i in range(len(X_cat_qu_col)):
    category_array.append(X_cat_qu_list)

In [322]:
qu_cats = find_columns(all_cat_cols,which="ordinal")
X_cat_ord_col = qu_cats[0]
X_cat_ord_list = qu_cats[1]

#### Random Search

In [None]:
parameters ={
    "split_num_cat__pipe_num__impute_num__strategy":["median","mean"],
    'model__max_depth': range(3, 15),
    'model__min_samples_leaf': range(5, 40 ,2),
    'model__min_samples_split': range(5, 40, 2),
    'model__criterion':['gini']
}

In [340]:
impute_num = SimpleImputer()

impute_cat = SimpleImputer(strategy="constant",fill_value="NA")
encode_cat = OneHotEncoder(drop="first",sparse_output = False,handle_unknown="infrequent_if_exist",min_frequency=6)
encode_cat_qu = OrdinalEncoder(categories=category_array)
encode_cat_ord = OrdinalEncoder(categories=X_cat_ord_list)

pca = PCA(n_components=0.96)
#model = DecisionTreeClassifier()
model = RandomForestClassifier()

In [341]:
split_cats = ColumnTransformer(transformers=
                                [("encode_cat_qu", encode_cat_qu, X_cat_qu_col),
                                 ("encode_cat", encode_cat, X_cat_noqu_col),
                                 ("encode_cat_ord", encode_cat_ord, X_cat_ord_col)])

pipe_cat_all = Pipeline(steps=[("impute_cat",impute_cat),("split_cats",split_cats)])
pipe_num = Pipeline(steps=[("impute_num",impute_num)])

split_num_cat = ColumnTransformer(transformers=
                                [("pipe_num", pipe_num, make_column_selector(dtype_include="number")),
                                 ("pipe_cat_all", pipe_cat_all, make_column_selector(dtype_exclude="number"))])


pipe_all = Pipeline(steps=[("split_num_cat",split_num_cat)
                           #,("pca",pca)
                           ,("model",model)]).set_output(transform="pandas")

In [343]:
find = RandomizedSearchCV(
    pipe_all,
    parameters,
    n_iter= 50,
    scoring='accuracy',
    cv = 6,
    #verbose= 1,
    random_state=555,
    error_score="raise"
)

In [344]:
find.fit(X_train,y_train)

Fitting 6 folds for each of 50 candidates, totalling 300 fits




In [345]:
find.best_params_

{'split_num_cat__pipe_num__impute_num__strategy': 'mean',
 'model__min_samples_split': 5,
 'model__min_samples_leaf': 7,
 'model__max_depth': 14,
 'model__criterion': 'gini'}

In [346]:
find.best_score_

0.9477839457220901

In [347]:

accuracy_score(y_train,find.predict(X_train))

0.9691780821917808

In [349]:
accuracy_score(y_test,find.predict(X_test))



0.9486301369863014

#### GridSearch

In [20]:
search = GridSearchCV(
    pipe_all,
    parameters,
    scoring ='accuracy',
    cv = 6,
    verbose= 1
)

In [21]:
search.fit(X_train,y_train)

Fitting 6 folds for each of 7776 candidates, totalling 46656 fits


KeyboardInterrupt: 

In [None]:
search.best_params_

In [None]:
accuracy_score(y_train,search.predict(X_train))

In [None]:
accuracy_score(y_test,search.predict(X_test))

##### Submit

In [None]:
search.best_estimator_.fit(X,y)

In [None]:
test_data = pd.read_csv("datasets\\test-housing-classification.csv")

In [None]:
id_col = test_data.pop("Id")

In [None]:
test_data = prepare_data(test_data)

In [None]:
y_predict = search.best_estimator_.predict(test_data)

In [None]:
sol_df= pd.DataFrame({"Id":id_col,
                      "Expensive":y_predict})

In [None]:
sol_df.Expensive.value_counts()

0    1286
1     173
Name: Expensive, dtype: int64

In [None]:
sol_df.to_csv("prediction_file.csv",index=False)

### Manual Trials

In [409]:
impute_num = SimpleImputer(strategy="mean")

impute_cat = SimpleImputer(strategy="constant",fill_value="NA")
encode_cat = OneHotEncoder(drop="first",sparse_output = False,handle_unknown="infrequent_if_exist",min_frequency=6)
encode_cat_qu = OrdinalEncoder(categories=category_array)
encode_cat_ord = OrdinalEncoder(categories=X_cat_ord_list)

pca = PCA(n_components=0.96)
#model = DecisionTreeClassifier()
model = RandomForestClassifier(max_depth=11,min_samples_leaf=5,min_samples_split=31,n_estimators=100,random_state=555)

In [410]:
impute_num = SimpleImputer(strategy="mean")

impute_cat = SimpleImputer(strategy="constant",fill_value="NA")
encode_cat = OneHotEncoder(drop="first",sparse_output = False,handle_unknown="infrequent_if_exist",min_frequency=6)
encode_cat_qu = OrdinalEncoder(categories=category_array)
encode_cat_ord = OrdinalEncoder(categories=X_cat_ord_list)

pca = PCA(n_components=0.96)
#model = DecisionTreeClassifier()
model = RandomForestClassifier(max_depth=11,min_samples_leaf=5,min_samples_split=31)

In [411]:
split_cats = ColumnTransformer(transformers=
                                [("encode_cat_qu", encode_cat_qu, X_cat_qu_col),
                                 ("encode_cat", encode_cat, X_cat_noqu_col),
                                 ("encode_cat_ord", encode_cat_ord, X_cat_ord_col)])

pipe_cat_all = Pipeline(steps=[("impute_cat",impute_cat),("split_cats",split_cats)])
pipe_num = Pipeline(steps=[("impute_num",impute_num)])

split_num_cat = ColumnTransformer(transformers=
                                [("pipe_num", pipe_num, make_column_selector(dtype_include="number")),
                                 ("pipe_cat_all", pipe_cat_all, make_column_selector(dtype_exclude="number"))])


pipe_all = Pipeline(steps=[("split_num_cat",split_num_cat)
                           #,("pca",pca)
                           ,("model",model)]).set_output(transform="pandas")

In [412]:
pipe_all.fit(X_train,y_train)

In [413]:
accuracy_score(y_train,pipe_all.predict(X_train))

0.961472602739726

In [414]:
accuracy_score(y_test,pipe_all.predict(X_test))



0.9486301369863014

##### Submit

In [415]:
pipe_all.fit(X,y)

In [416]:
test_data = pd.read_csv("datasets\\test-housing-classification.csv")

In [417]:
id_col = test_data.pop("Id")

In [418]:
test_data = prepare_data(test_data)

In [419]:
y_predict = pipe_all.predict(test_data)



In [420]:
sol_df= pd.DataFrame({"Id":id_col,
                      "Expensive":y_predict})

In [421]:
sol_df.Expensive.value_counts()

0    1285
1     174
Name: Expensive, dtype: int64

In [422]:
sol_df.to_csv("prediction_file.csv",index=False)