In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV


from sklearn.metrics import accuracy_score

In [15]:
# Load the housing classification dataset (path is relative to the notebook's
# working directory). Forward slashes are used on purpose: they work on
# Windows, Linux and macOS, whereas "\h" inside a normal string literal is an
# invalid escape sequence that newer Python versions warn about.
data_df = pd.read_csv("datasets/housing-classification-iter-0-2/housing-classification-iter-0-2.csv")
# Summary statistics of the numeric columns, shown as the cell output.
data_df.describe()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,70.049958,1057.429452,2.866438,0.613014,2.758904,1.767123,94.244521,15.060959,0.14863
std,9981.264932,24.284752,438.705324,0.815778,0.644666,40.177307,0.747315,125.338794,55.757415,0.355845
min,1300.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7553.5,59.0,795.75,2.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,9478.5,69.0,991.5,3.0,1.0,0.0,2.0,0.0,0.0,0.0
75%,11601.5,80.0,1298.25,3.0,1.0,0.0,2.0,168.0,0.0,0.0
max,215245.0,313.0,6110.0,8.0,3.0,738.0,4.0,857.0,480.0,1.0


In [16]:
# Remove the two features that are not used for modelling (both have a 75th
# percentile of 0 in the describe() output above).
# errors="ignore" makes this cell safe to re-run: without it, a second
# execution raises KeyError because the columns are already gone.
dropped_cols = ["PoolArea", "ScreenPorch"]
data_df = data_df.drop(columns=dropped_cols, errors="ignore")

In [17]:
# Separate the target from the features without mutating data_df.
# DataFrame.pop() removes the column in place, so re-running the cell would
# raise KeyError; selecting + dropping keeps the cell idempotent.
y = data_df["Expensive"]
X = data_df.drop(columns="Expensive")

In [18]:
# Hold out 20% of the rows for the final evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

#### Pipeline

In [19]:
# Building blocks of the pipeline, all created with library defaults;
# the hyper-parameters are set later by the search objects.
my_imputer, my_scaler, my_model = (
    SimpleImputer(),          # fills missing values
    StandardScaler(),         # standardises features before the distance-based model
    KNeighborsClassifier(),   # final classifier
)

In [24]:
# Chain the steps in order: impute -> scale -> classify.
# The step names are the prefixes used in the search grid ("impute__...", etc.).
pipe = Pipeline(
    [
        ("impute", my_imputer),
        ("scale", my_scaler),
        ("classify", my_model),
    ]
)

#### Parameter range

In [83]:
# Search space shared by the randomized and the grid search.
# Keys follow the "<pipeline step>__<parameter>" convention.
param_range = dict(
    # Imputer: which statistic replaces missing values.
    impute__strategy=["median", "mean"],
    # Scaler: kept fixed (centre and scale to unit variance).
    scale__with_mean=[True],
    scale__with_std=[True],
    # Classifier: only the neighbourhood size (10..14) is actually varied.
    classify__n_neighbors=range(10, 15),
    classify__weights=["uniform"],
    classify__algorithm=["auto"],
)

#### Random Search

In [73]:
# Number of distinct parameter combinations in the search space: the product
# of the option counts. Every entry of param_range is a finite list/range, so
# len() is valid; this would need revisiting if a continuous distribution
# (e.g. from scipy.stats) were added to the space.
n_candidates = 1
for options in param_range.values():
    n_candidates *= len(options)

# Randomized search over the pipeline's hyper-parameters, scored by 5-fold
# cross-validated accuracy.
find = RandomizedSearchCV(
    pipe,
    param_range,
    # Cap at the size of the space: asking for more iterations than there are
    # combinations only makes scikit-learn warn and fall back to running each
    # combination once.
    n_iter=min(100, n_candidates),
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=123,  # reproducible sampling of candidates
)

In [74]:
# Run the randomized search on the training split only; the test split stays
# untouched for the final evaluation.
find.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [75]:
# Best parameter combination found by the randomized search.
# NOTE(review): the recorded output below lists 'classify__weights': 'distance',
# which is not among the values in param_range as currently defined (the
# param_range cell has a later execution count than this one). The output
# presumably predates an edit to param_range — restart the kernel and run all
# cells to refresh it.
find.best_params_

{'scale__with_std': True,
 'scale__with_mean': True,
 'impute__strategy': 'mean',
 'classify__weights': 'distance',
 'classify__n_neighbors': 14,
 'classify__algorithm': 'auto'}

#### Grid Search

In [84]:
# Exhaustive search over every combination in param_range, scored by
# 5-fold cross-validated accuracy.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_range,
    scoring="accuracy",
    cv=5,
    verbose=1,
)

In [85]:
# Run the grid search on the training split only.
search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [86]:
# Parameter combination with the highest mean cross-validated accuracy
# in the grid search.
search.best_params_

{'classify__algorithm': 'auto',
 'classify__n_neighbors': 13,
 'classify__weights': 'uniform',
 'impute__strategy': 'median',
 'scale__with_mean': True,
 'scale__with_std': True}

In [87]:
# Mean cross-validated score (accuracy, per the `scoring` argument) of the
# best parameter combination, computed on the training folds only.
search.best_score_

0.9195260628737024

In [88]:
# Accuracy of the refitted best pipeline on the data it was trained on
# (an optimistic figure; compare with the held-out score).
accuracy_score(y_true=y_train, y_pred=search.predict(X_train))

0.9238013698630136

In [89]:
# Accuracy of the refitted best pipeline on the held-out test split.
accuracy_score(y_true=y_test, y_pred=search.predict(X_test))

0.9315068493150684