In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

In [21]:
# Read the labelled training set and the unlabelled test set from the
# competition folder (paths are relative to the notebook's working directory).
train_path = './22-2-dataanalysis-regression/train_regression.csv'
test_path = './22-2-dataanalysis-regression/test_regression.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

## Data preprocessing

In [3]:
# Summarise the training frame: column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   acid0       2000 non-null   float64
 1   acid1       2000 non-null   float64
 2   acid2       2000 non-null   float64
 3   chlorides   2000 non-null   float64
 4   sulphates0  2000 non-null   float64
 5   sulphates1  2000 non-null   float64
 6   sulphates2  2000 non-null   float64
 7   density     2000 non-null   float64
 8   score       2000 non-null   float64
dtypes: float64(9)
memory usage: 140.8 KB


In [4]:
# Same summary for the test frame, to compare its columns with the training frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   acid0       2000 non-null   float64
 1   acid1       2000 non-null   float64
 2   acid2       2000 non-null   float64
 3   chlorides   2000 non-null   float64
 4   sulphates0  2000 non-null   float64
 5   sulphates1  2000 non-null   float64
 6   sulphates2  2000 non-null   float64
 7   density     2000 non-null   float64
dtypes: float64(8)
memory usage: 125.1 KB


In [5]:
# List the training column names as a NumPy array.
train_df.columns.values

array(['acid0', 'acid1', 'acid2', 'chlorides', 'sulphates0', 'sulphates1',
       'sulphates2', 'density', 'score'], dtype=object)

## KNN

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsRegressor

In [26]:
# Target column, kept as a one-element list so that `y` is a single-column
# DataFrame rather than a Series.
label_col = ['score']

# Every other column of the training frame is used as a predictor.
feature_col = [
    'acid0', 'acid1', 'acid2',
    'chlorides',
    'sulphates0', 'sulphates1', 'sulphates2',
    'density',
]

# Design matrix and target taken from the labelled data.
X = train_df.loc[:, feature_col]
y = train_df.loc[:, label_col]

In [27]:
# Hold out 10% of the labelled rows for a quick check; the fixed random_state
# keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

In [30]:
# KNN regressor: 6 neighbours, Minkowski distance with p=1, neighbours
# weighted by distance, ball-tree neighbour search.
knn = KNeighborsRegressor(
    algorithm='ball_tree',
    n_neighbors=6,
    p=1,
    weights='distance',
)

# fit() returns the estimator itself, so training and hold-out prediction
# can be chained.
y_pred = knn.fit(X_train, y_train).predict(X_test)

In [31]:
# RMSE on the hold-out rows: square root of the mean squared error.
# NOTE(review): the KNN settings evaluated here match the best_params_ printed
# by the search cell, which is fitted on all of X (hold-out rows included), so
# this figure is probably optimistic — confirm before reporting it.
mean_squared_error(y_test, y_pred) ** 0.5

11.08054040797378

### Hyperparameter tuning

In [22]:
# Re-declare the column roles so the tuning section can be run on its own.
feature_col = ['acid0', 'acid1', 'acid2', 'chlorides',
               'sulphates0', 'sulphates1', 'sulphates2', 'density']
label_col = ['score']

# The search cross-validates internally, so it is given every training row.
X, y = train_df[feature_col], train_df[label_col]


In [23]:
# Candidate values for the KNN step. Keys are prefixed with the step name that
# make_pipeline derives from the class name ("kneighborsregressor").
dists = {
    'kneighborsregressor__n_neighbors': [2, 3, 4, 5, 6],
    'kneighborsregressor__weights': ['uniform', 'distance'],
    'kneighborsregressor__algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'kneighborsregressor__p': [1, 2, 3, 4],
}

# NOTE(review): train_df.info() reports only float64 columns, so the encoder
# step presumably leaves the data unchanged — confirm whether it is needed.
pipe = make_pipeline(OrdinalEncoder(), KNeighborsRegressor())

# NOTE(review): the lists above span 5 * 2 * 3 * 4 = 120 combinations, the same
# as n_iter, so this randomized search appears to visit the whole grid.
clf1 = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=dists,
    scoring='neg_root_mean_squared_error',
    n_iter=120,
    cv=5,
    random_state=42,
    verbose=1,
)


In [24]:
# Run the cross-validated search on the full training data (X, y).
clf1.fit(X, y)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('ordinalencoder',
                                              OrdinalEncoder()),
                                             ('kneighborsregressor',
                                              KNeighborsRegressor())]),
                   n_iter=120,
                   param_distributions={'kneighborsregressor__algorithm': ['ball_tree',
                                                                           'kd_tree',
                                                                           'brute'],
                                        'kneighborsregressor__n_neighbors': [2,
                                                                             3,
                                                                             4,
                                                                             5,
                                                                             6],
              

In [25]:
# Winning parameter combination, then its mean cross-validated score
# (negative RMSE, so values closer to zero are better).
print(clf1.best_params_, clf1.best_score_, sep='\n')

{'kneighborsregressor__weights': 'distance', 'kneighborsregressor__p': 1, 'kneighborsregressor__n_neighbors': 6, 'kneighborsregressor__algorithm': 'ball_tree'}
-11.884440134269795


### 실제 검증 (final fit on the full training set and test-set predictions)

In [34]:
# Final model: refit on every labelled row, predict the test set and write the
# predictions to a CSV file with columns Id and Predicted.
feature_col = ['acid0', 'acid1', 'acid2', 'chlorides', 'sulphates0', 'sulphates1',
               'sulphates2', 'density']
label_col = ['score']

X_train = train_df[feature_col]
y_train = train_df[label_col]

# Select the feature columns explicitly so the test matrix always has the same
# columns, in the same order, as the training matrix (previously the whole
# test frame was passed through, whatever it contained).
X_test = test_df[feature_col]

# Same settings as the best_params_ printed by the hyperparameter search.
knn = KNeighborsRegressor(
    n_neighbors=6, weights='distance', p=1, algorithm='ball_tree')
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# y_train is a single-column DataFrame, so the predictions come back 2-D;
# flatten them before storing them as one column. The Id range is sized from
# the data instead of a hard-coded 2000 so it cannot drift from the row count.
y_pred_df = pd.DataFrame({
    'Id': np.arange(len(X_test)),
    'Predicted': np.ravel(y_pred),
})

y_pred_df.to_csv('KNN_reg.csv', index=False)