## 저번주에 드린 아파트 경매 데이터를 EDA(탐색적 데이터 분석)을 통해 분석 및 예측 해보세요
## Cf) 탐색적 데이터 분석 (https://statkclee.github.io/ml/ml-eda.html)
- *Hammer price를 정렬하여 상위 50%는 1, 하위 50%는 2로 변환 후 분류하시면 됩니다
- *모델은 로지스틱 회귀, 나이브 베이즈, SVM, knn을 쓰시면 되며, 모델 예측 외의 필요로 쓰이는 모델은 자유입니다
- *언어는 자유입니다(R , python)
- *참조자료로 드리는 코드 및, 우수과제로 선정된 코드, 저번에 드린 회장님 EDA코드 모두 참조하셔도 되며,
- 부분적으로 copy도 허용합니다
- *전처리와 파생변수 제작, 변수 선택 등에 자신만의 근거가 있으면 좋을 것 같습니다
- *예측 후 모델 간 비교 해보세요
- *grid search는 필수입니다(svm과 knn)
- *정확도를 5-fold로 검증하시면 됩니다
- 다른분들이 데이터를 보고 어떤 고민을 하는지 보고 따라해보시면 금방 느실겁니다

# Data Loading

In [246]:
import pickle
import numpy as np
import pandas as pd

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

with open('AM_train.pickle' , 'rb') as f:
    AM_train = pickle.load(f)
    
print("Shape of AM_train is ", AM_train.shape)

Shape of AM_train is  (1933, 26)


# Feature Selection

In [247]:
feat = ['Auction_class','Claim_price','Auction_count','Auction_miscarriage_count','Total_land_gross_area',
        'Total_land_real_area','Total_land_auction_area','Total_building_area','Total_building_auction_area',
        'Total_appraisal_price','Minimum_sales_price','point.x','point.y', 'Bid_class_개별', 'Bid_class_일괄', 'Bid_class_일반']

# Feature Selection 이 조합이 가장 성능이 좋았음.

In [248]:
X = AM_train[feat].values
y = AM_train['Hammer_price'].values

In [249]:
AM_train.Hammer_price.value_counts()

1    967
0    966
Name: Hammer_price, dtype: int64

In [250]:
# X = AM_train.drop(columns=['Hammer_price']).values
# y = AM_train['Hammer_price'].values
# X.shape, y.shape

In [251]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Modeling - Naive, KNN, SVM, 3Meta Ensemble

## 1. Default모델

In [252]:
from sklearn.ensemble import VotingClassifier

In [259]:
clf1 = GaussianNB()
clf2 = KNeighborsClassifier(n_jobs=-1)
clf3 = SVC()
eclf = VotingClassifier(estimators=[('NB', clf1), ('KNN', clf2), ('SVM', clf3)], voting='hard')

In [260]:
print("Accuracy of NB is",cross_val_score(clf1, X, y, cv=5).mean())

Accuracy of NB is 0.8093053483444574


In [261]:
print("Accuracy of KNN is",cross_val_score(clf2, X, y, cv=5).mean())

Accuracy of KNN is 0.8258000142718587


In [262]:
print("Accuracy of SVM is",cross_val_score(clf3, X, y, cv=5).mean())

Accuracy of SVM is 0.7174643724585155


In [258]:
print("3 Model Ensemble is",cross_val_score(eclf, X, y, cv=5).mean())

  if diff:
  if diff:
  if diff:
  if diff:


3 Model Ensemble is 0.8330512294847212


  if diff:


## 2. Hyperparameter Tuning - GRID SEARCH 실시

In [71]:
algorithmes = [KNeighborsClassifier(), SVC()]
params = []
params.append({'n_neighbors' : [int(i) for i in np.linspace(1, 20, 20)]})
params.append({'kernel': ['rbf'], 'gamma': np.linspace(9,12, 10),
                     'C': np.linspace(170, 180, 10)})

In [72]:
params

[{'n_neighbors': [1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20]},
 {'kernel': ['rbf'],
  'gamma': array([ 9.        ,  9.33333333,  9.66666667, 10.        , 10.33333333,
         10.66666667, 11.        , 11.33333333, 11.66666667, 12.        ]),
  'C': array([170.        , 171.11111111, 172.22222222, 173.33333333,
         174.44444444, 175.55555556, 176.66666667, 177.77777778,
         178.88888889, 180.        ])}]

In [73]:
from sklearn.model_selection import GridSearchCV

scoring = ['accuracy']
estimator_results = []

for i, (estimator, params) in enumerate(zip(algorithmes,params)):
    gs_estimator = GridSearchCV(
            refit="accuracy", estimator=estimator, param_grid=params, scoring=scoring, cv=5, verbose=1, n_jobs=-1)
    print(gs_estimator)

    gs_estimator.fit(X, y)
    estimator_results.append(gs_estimator)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn', scoring=['accuracy'], verbose=1)
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'kernel': ['rbf'], 'gamma': array([ 9.     ,  9.33333,  9.66667, 10.     , 10.33333, 10.66667,
       11.     , 11.33333, 11.66667, 12.     ]), 'C': array([170.     , 171.11111, 172.22222, 173.33333, 174.44444, 175.55556,
       176.66667, 177.77778, 178.88889, 180.     ])},
       pre_dispatch='2*n_jobs', refit='accuracy',
       return_train_score='warn', scoring=['accuracy'], verbose=1)
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    6.5s finished


In [79]:
estimator_results[0].best_score_ # KNN의 가장 좋은 성능

0.8292809105018106

In [77]:
estimator_results[0].best_params_ # k=4일때 가장 좋았다.

{'n_neighbors': 4}

In [75]:
estimator_results[1].best_score_ # SVM의 가장 좋은 성능

0.8867046042421107

In [80]:
estimator_results[1].best_params_ #아래와 같을때 가장 좋음

{'C': 171.11111111111111, 'gamma': 11.333333333333332, 'kernel': 'rbf'}

In [203]:
import pandas as pd
from pandas import DataFrame
from collections import defaultdict

result_df_dict = {}
result_attributes = ["model", "accuracy","n_neighbors", "C", "gamma", "kernel"]
result_dict = defaultdict(list)

algorithm_name= ["KNN", "SVM"]

for i, estimators in enumerate(estimator_results):
    number_of_estimators = len(estimators.cv_results_["mean_fit_time"])

    for idx_estimator in range(number_of_estimators):
        result_dict["model"].append(algorithm_name[i])
        result_dict["accuracy"].append(
            estimators.cv_results_["mean_test_accuracy"][idx_estimator])
            
    for param_value in estimators.cv_results_["params"]:
        for k,v in param_value.items():
            result_dict[k].append(v)
    for attr_name in result_attributes:
        if len(result_dict[attr_name]) < len(result_dict["accuracy"]):
            result_dict[attr_name].extend([None for i in range(number_of_estimators)])

result_df = DataFrame(result_dict, columns=result_attributes)
result_df.sort_values("accuracy",ascending=False).head(n=20)

Unnamed: 0,model,accuracy,n_neighbors,C,gamma,kernel
37,SVM,0.886705,,171.111111,11.333333,rbf
96,SVM,0.886705,,177.777778,11.0,rbf
86,SVM,0.886705,,176.666667,11.0,rbf
47,SVM,0.886705,,172.222222,11.333333,rbf
76,SVM,0.886705,,175.555556,11.0,rbf
57,SVM,0.886705,,173.333333,11.333333,rbf
66,SVM,0.886705,,174.444444,11.0,rbf
106,SVM,0.886705,,178.888889,11.0,rbf
105,SVM,0.886187,,178.888889,10.666667,rbf
38,SVM,0.886187,,171.111111,11.666667,rbf


In [205]:
result_df.loc[result_df['model'] == 'KNN'].sort_values("accuracy",ascending=False)

Unnamed: 0,model,accuracy,n_neighbors,C,gamma,kernel
3,KNN,0.829281,4.0,,,
6,KNN,0.829281,7.0,,,
5,KNN,0.828764,6.0,,,
7,KNN,0.827212,8.0,,,
0,KNN,0.826694,1.0,,,
4,KNN,0.82566,5.0,,,
2,KNN,0.825142,3.0,,,
1,KNN,0.824625,2.0,,,
8,KNN,0.824108,9.0,,,
9,KNN,0.821521,10.0,,,


# 3. 가장 좋았던 Parameter로 Ensemble

In [210]:
best_C = result_df.loc[result_df['model'] == 'SVM'].sort_values("accuracy",ascending=False).iloc[0]['C']
best_gamma = result_df.loc[result_df['model'] == 'SVM'].sort_values("accuracy",ascending=False).iloc[0]['gamma']

In [213]:
clf1 = GaussianNB()
clf2 = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)
clf3 = SVC(C = best_C, gamma = best_gamma, kernel='rbf')
eclf = VotingClassifier(estimators=[('NB', clf1), ('KNN', clf2), ('SVM', clf3)], voting='hard')

In [216]:
print("Accuracy of NB is",cross_val_score(clf1, X, y, cv=5).mean())

Accuracy of NB is 0.8093053483444574


In [217]:
print("Accuracy of KNN is",cross_val_score(clf2, X, y, cv=5).mean())

Accuracy of KNN is 0.829452360909076


In [218]:
print("Accuracy of SVM is",cross_val_score(clf3, X, y, cv=5).mean())

Accuracy of SVM is 0.8869011116176655


In [219]:
print("3 Model Ensemble is",cross_val_score(eclf, X, y, cv=5).mean())

  if diff:
  if diff:
  if diff:
  if diff:


3 Model Ensemble is 0.8486220065156417


  if diff:


Ensemble은 성능이 오히려 안좋아 지는 것으로 보인다.

# 4. 얼마나 성능 개선이 되었는가?

In [231]:
de_clf2 = KNeighborsClassifier()
de_clf3 = SVC()
de_clf2_acc = cross_val_score(de_clf2, X, y, cv=5).mean()
de_clf3_acc = cross_val_score(de_clf3, X, y, cv=5).mean()

clf2 = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)
clf3 = SVC(C = best_C, gamma = best_gamma, kernel='rbf')

clf2_acc = cross_val_score(clf2, X, y, cv=5).mean()
clf3_acc = cross_val_score(clf3, X, y, cv=5).mean()
print("-----Final Result--------")
print("KNN advanced about", round(clf2_acc - de_clf2_acc, 2)*100, "%")
print("SVM advanced about", round(clf3_acc - de_clf3_acc, 2)*100, "%")
print("-------------------------")

-----Final Result--------
KNN advanced about 0.0 %
SVM advanced about 17.0 %
-------------------------
