In [1]:
import sklearn
import pandas as pd
print('scikit-learn is installed (version: {})'.format(sklearn.__version__))

scikit-learn is installed (version: 1.2.2)


In [2]:
!pip install scikit-learn --upgrade



In [3]:
from sklearn.model_selection import train_test_split
df = pd.read_csv('housing-classification-iter6.csv').set_index('Id')
test_data = pd.read_csv('test-housing-classification.csv').set_index('Id')

In [4]:
X=df
y=X.pop('Expensive')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [6]:
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()

In [7]:
test_data_cat=test_data.select_dtypes(exclude="number").copy()
test_data_num=test_data.select_dtypes(include="number").copy()

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore")
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor, StandardScaler(with_mean=False),
                              DecisionTreeClassifier())

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 26, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 26, 2),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=25,
                      verbose=1)

search.fit(X_train, y_train)

 
scores = {"dtree" : search.best_score_}

scores

In [88]:
from sklearn.metrics import accuracy_score
accuracy_score(search.predict(X_test), y_test)

0.9383561643835616

In [89]:
y_test_data6=search.predict(test_data)


In [90]:
y_test_data6

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [91]:
test_data['Expensive']=y_test_data6

In [92]:
test_data6 = test_data.loc[:,['Expensive']]

In [93]:
test_data6.reset_index(inplace=True)

In [94]:
test_data6.head(20)

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
5,1466,0
6,1467,0
7,1468,0
8,1469,0
9,1470,0


In [95]:
test_data6.to_csv('test_data6.csv', index=False)

In [86]:
# solution
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

knn_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(with_mean=False),
                                  KNeighborsClassifier()
                                 )
from sklearn.model_selection import GridSearchCV

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(2, 50,2),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

knn_search = GridSearchCV(knn_full_pipeline,
                      param_grid,
                      cv=20,
                      verbose=1)

knn_search.fit(X_train, y_train)

scores["knn"] = knn_search.best_score_

scores

Fitting 20 folds for each of 192 candidates, totalling 3840 fits


{'dtree': 0.9307402031930334, 'knn': 0.9292540792540793}

In [87]:
from sklearn.metrics import accuracy_score
accuracy_score(knn_search.predict(X_test), y_test)

0.9315068493150684

In [None]:
from sklearn.linear_model import LogisticRegression

knn_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(with_mean=False),
                                  LogisticRegression()
                                 )
from sklearn.model_selection import GridSearchCV

param_grid1 = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    'logisticregression__max_iter': [20, 50, 100, 200, 500, 1000],                      
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],   
    'logisticregression__class_weight': ['balanced'] 
}

lr_search = GridSearchCV(knn_full_pipeline,
                      param_grid1,
                      cv=20,
                      verbose=1)

lr_search.fit(X_train, y_train)

scores["lr"] = lr_search.best_score_

scores

In [110]:
from sklearn.metrics import accuracy_score
accuracy_score(lr_search.predict(X_test), y_test)

0.9383561643835616

In [103]:
y_test_data7=lr_search.predict(test_data)


In [104]:
test_data['Expensive']=y_test_data7

In [105]:
test_data7 = test_data.loc[:,['Expensive']]

In [106]:
test_data7.reset_index(inplace=True)

In [107]:
test_data7.head()

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0


In [108]:
test_data7.to_csv('test_data7.csv', index=False)

In [112]:

from sklearn.svm import SVC

cvs_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(with_mean=False),
                                  SVC()
                                 )
from sklearn.model_selection import GridSearchCV

param_grid2 = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
}

csv_search = GridSearchCV(cvs_full_pipeline,
                      param_grid2,
                      cv=20,
                      verbose=1)

csv_search.fit(X_train, y_train)

scores["csv"] = csv_search.best_score_

scores

Fitting 20 folds for each of 2 candidates, totalling 40 fits


{'dtree': 0.9307402031930334,
 'knn': 0.9292540792540793,
 'lr': 0.9422610722610723,
 'csv': 0.8516317016317018}

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV
rand_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(with_mean=False),
                                  RandomForestClassifier()
                                 )
from sklearn.model_selection import GridSearchCV

param_grid3 = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean","median"],
    "randomforestclassifier__bootstrap": [True, False],
    "randomforestclassifier__max_depth": [10, 20,100,None],
    "randomforestclassifier__min_samples_leaf": [ 2, 5],
    "randomforestclassifier__min_samples_split": [2, 5, 10],
    "randomforestclassifier__n_estimators": [200,400,100]
}

rand_search = GridSearchCV(rand_full_pipeline,
                      param_grid3,
                      cv=5,
                      verbose=1)

rand_search.fit(X_train, y_train)

scores["rand"] = rand_search.best_score_

scores

In [13]:
rand_search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
 'randomforestclassifier__bootstrap': False,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__n_estimators': 200}

In [14]:
 
scores = {"rand" : rand_search.best_score_}

scores

{'rand': 0.9528081734536904}

In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(rand_search.predict(X_test), y_test)

0.952054794520548

In [16]:
y_test_data9=rand_search.predict(test_data)
test_data['Expensive']=y_test_data9
test_data9 = test_data.loc[:,['Expensive']]
test_data9.reset_index(inplace=True)
test_data9.head()

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0


In [18]:
test_data9.to_csv('test_data9.csv', index=False)# it is giving the best score for outside test data 0.9774

In [None]:
scores

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV
rand_full_pipeline = make_pipeline(preprocessor,
                                  StandardScaler(with_mean=False),
                                  RandomForestClassifier()
                                 )
from sklearn.model_selection import GridSearchCV

param_grid3 = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
  
}

rand_search = GridSearchCV(rand_full_pipeline,
                      param_grid3,
                      cv=10,
                      verbose=1)

rand_search.fit(X_train, y_train)

scores["rand"] = rand_search.best_score_

scores