In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.svm import SVR

In [2]:
# Load the raw workbook; `data1` keeps the untouched original.
data1 = pd.read_excel("HW_Data_Set.xlsx")
# '?' is treated as the missing-value marker: turn it into NaN, then drop every
# row that has at least one missing entry.
data = data1.copy().replace('?', np.nan).dropna()
# One-hot encode ind_109; only the GREEN indicator is used as a feature below.
dummy = pd.get_dummies(data[["ind_109"]])
# Regression target.
y = data["90_target"]
# Numeric feature matrix: drop the encoded column, the three *_target columns,
# and ind_420 / ind_422.
# NOTE(review): the reason for excluding ind_420 and ind_422 is not visible here — confirm.
X_ = data.drop(
    ["ind_109", "50_target", "90_target", "20_target", "ind_420", "ind_422"],
    axis=1,
).astype('float64')
X = pd.concat([X_, dummy[["ind_109_GREEN"]]], axis=1)
# No trailing comma here: `X.head(),` evaluates to a 1-tuple, which makes the
# notebook print a plain-text tuple repr instead of rendering the DataFrame.
X.head()

(   ind_5  ind_6  ind_8      ind_9     ind_10     ind_12     ind_13  ind_14  \
 0   19.0   17.0  100.0  85.714286  14.285714  72.363515  60.808814   23.80   
 1   24.0   19.0  100.0  78.571429  21.428571  74.275883  64.366798   11.45   
 2   30.0   24.0  100.0  71.428571  28.571429  75.140402  65.915803    8.75   
 3   37.0   30.0  100.0  64.285714  35.714286  76.677846  68.584234    7.80   
 4   41.0   37.0  100.0  57.142857  42.857143  81.603007  76.455495   14.90   
 
    ind_15  ind_16  ...  ind_407  ind_410   ind_412  ind_414  ind_416  ind_418  \
 0   17.62   11.73  ...    -23.0    -27.0  0.002371      2.0    -49.6    -54.0   
 1   18.16   12.22  ...    -16.0    -31.0  0.003074      0.8    -55.6    -60.0   
 2   17.86   12.28  ...    -10.0    -30.0  0.003561      2.7    -58.4    -60.0   
 3   14.76   12.61  ...    -10.0    -30.0  0.004366      0.9    -61.8    -65.0   
 4   11.92   14.25  ...    -24.0    -45.0  0.004198      0.1    -79.8    -86.0   
 
    ind_424   ind_426   ind_42

In [3]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [4]:
# Fit a support-vector regressor with default hyperparameters on the training split.
# NOTE(review): `svr` is not referenced again in the visible cells — confirm it is still needed.
svr = SVR().fit(X_train, y_train)

In [5]:
# Baseline k-nearest-neighbours regressor with default hyperparameters.
# NOTE(review): the features are passed in unscaled (`scale` is imported but not applied in
# the visible cells); KNN distances are sensitive to feature scale — confirm this is intended.
knn_model = KNeighborsRegressor().fit(X_train, y_train)

In [6]:
# Show the fitted estimator's repr to inspect the hyperparameters in effect.
knn_model

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [7]:
# Number of neighbours used by the baseline model.
knn_model.n_neighbors

5

In [8]:
# Distance metric the fitted model actually uses.
knn_model.effective_metric_

'euclidean'

# TAHMİN

In [9]:
# Predict the target for the held-out test rows with the baseline model.
y_pred = knn_model.predict(X_test)

In [10]:
# Test-set RMSE of the baseline model.
np.sqrt(mean_squared_error(y_test, y_pred))

31.648571263592384

In [11]:
# Training-set RMSE for k = 1..10. These are in-sample errors: every model is
# scored on the same rows it was fitted on.
# Note: the loop rebinds `knn_model` and `y_pred`; afterwards they refer to the
# k = 10 model and its predictions on the training rows.
RMSE = []

for k in range(1, 11):
    knn_model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn_model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    RMSE.append(rmse)
    print("k =" , k , "için RMSE değeri: ", rmse)


k = 1 için RMSE değeri:  0.0
k = 2 için RMSE değeri:  16.107412891395956
k = 3 için RMSE değeri:  20.828037246080104
k = 4 için RMSE değeri:  23.27227688325072
k = 5 için RMSE değeri:  24.96107788011554
k = 6 için RMSE değeri:  26.22110292309045
k = 7 için RMSE değeri:  27.11706121301397
k = 8 için RMSE değeri:  27.956045435801823
k = 9 için RMSE değeri:  28.44296403820427
k = 10 için RMSE değeri:  29.053832635379266


# MODEL TUNING

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Candidate neighbour counts for the grid search: every integer from 1 to 29.
knn_params = {"n_neighbors": np.arange(1, 30)}

In [14]:
# Unfitted base estimator handed to the grid search.
knn = KNeighborsRegressor()

In [15]:
# 10-fold cross-validated search over the values in knn_params.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's default
# score (R^2 for scikit-learn regressors), not by the RMSE reported elsewhere in this notebook.
knn_cv_model = GridSearchCV(knn, knn_params, cv = 10)

In [16]:
# Run the search on the training split only; the test split is not used here.
knn_cv_model.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [17]:
# Neighbour count selected by the grid search.
knn_cv_model.best_params_["n_neighbors"]

3

In [18]:
# For k = 1..10, compare the in-sample (training) RMSE with the 10-fold
# cross-validated RMSE computed on the training split.
# Note: the loop rebinds `knn_model` and `y_pred`; afterwards they refer to the
# k = 10 model and its predictions on the training rows.
RMSE = []
RMSE_CV = []
for k in range(1, 11):
    knn_model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn_model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    # The scorer returns negated MSE per fold; flip the sign of the fold mean
    # before taking the square root.
    neg_mse_folds = cross_val_score(
        knn_model, X_train, y_train, cv=10, scoring="neg_mean_squared_error"
    )
    rmse_cv = np.sqrt(-1 * neg_mse_folds.mean())
    RMSE.append(rmse)
    RMSE_CV.append(rmse_cv)
    print("k =" , k , "için RMSE değeri: ", rmse, "RMSE_CV değeri: ", rmse_cv )


k = 1 için RMSE değeri:  0.0 RMSE_CV değeri:  33.31457615603446
k = 2 için RMSE değeri:  16.107412891395956 RMSE_CV değeri:  31.591047430594166
k = 3 için RMSE değeri:  20.828037246080104 RMSE_CV değeri:  31.365780453717157
k = 4 için RMSE değeri:  23.27227688325072 RMSE_CV değeri:  31.62506940439913
k = 5 için RMSE değeri:  24.96107788011554 RMSE_CV değeri:  31.644917423862672
k = 6 için RMSE değeri:  26.22110292309045 RMSE_CV değeri:  31.92069667062682
k = 7 için RMSE değeri:  27.11706121301397 RMSE_CV değeri:  32.12622905044111
k = 8 için RMSE değeri:  27.956045435801823 RMSE_CV değeri:  32.26590266580473
k = 9 için RMSE değeri:  28.44296403820427 RMSE_CV değeri:  32.43784640348842
k = 10 için RMSE değeri:  29.053832635379266 RMSE_CV değeri:  32.68287537398246


In [19]:
# Build a fresh regressor that uses the neighbour count chosen by the grid search.
best_k = knn_cv_model.best_params_["n_neighbors"]
knn_tuned = KNeighborsRegressor(n_neighbors=best_k)

In [20]:
# Fit the tuned model on the full training split.
knn_tuned.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='uniform')

In [21]:
# Test-set RMSE of the tuned model.
np.sqrt(mean_squared_error(y_test, knn_tuned.predict(X_test)))

31.53425201190068