In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Make the project's helper modules in ../src importable. The membership check
# keeps the cell idempotent: re-running it does not append a duplicate entry.
if '../src' not in sys.path:
    sys.path.append('../src')

In [4]:
import cleaning
from cleaning import new_color
from cleaning import new_clarity
from cleaning import concating
from cleaning import droping

# K-Nearest Neighbors (KNN) regression

### Evaluating on a hold-out split of the training data

In [5]:
# Load the cleaned training data, using the saved 'Unnamed: 0' column as the
# index. NOTE(review): despite its name, `test_clean` holds the *training*
# set (it is read from train_clean.csv and contains the `price` target).
test_clean = pd.read_csv(
    '../data/train_clean.csv',
    index_col='Unnamed: 0',
)
test_clean

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_C,color_NC,clarity_I,clarity_IF,clarity_SI,clarity_VS,clarity_VVS
0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,0,0,0,0,1,0,1,0,0,1,0,0
1,1,0.41,63.0,56.0,4.80,4.75,3.01,6.824,0,0,1,0,0,1,0,0,0,1,0,0
2,2,0.32,61.6,56.0,4.37,4.39,2.70,6.107,0,0,1,0,0,0,1,0,0,0,1,0
3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.390,0,0,1,0,0,0,1,0,0,0,0,1
4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,0,0,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.52,61.2,58.0,5.16,5.20,3.17,7.508,0,0,0,1,0,1,0,0,0,0,1,0
40451,40451,0.52,62.0,55.0,5.14,5.17,3.19,7.232,0,0,1,0,0,1,0,0,0,1,0,0
40452,40452,0.73,63.5,58.0,5.68,5.72,3.62,8.065,0,0,0,0,1,1,0,0,0,0,1,0
40453,40453,0.31,56.9,59.0,4.45,4.48,2.54,6.629,1,0,0,0,0,1,0,0,0,0,0,1


In [6]:
# Split the data into train and test sets.

# Features are every column except the target `price`.
# NOTE(review): this keeps `id` as a feature. KNN works on raw distances, so an
# unscaled row identifier probably dominates the neighbour search - consider
# dropping or scaling it (the frame passed to `predict` later must then match).
X = test_clean.drop('price', axis=1)
y = test_clean['price']

# Hold out 20% of the rows. The fixed seed makes the split - and every score
# computed from it below - reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=42)

In [7]:
# Model: tune the number of neighbours of a KNN regressor by grid search.

model = KNeighborsRegressor()

# Candidate values for k: 1 through 9 (np.arange excludes the stop value).
parameter_space = {'n_neighbors': np.arange(1, 10)}

# 5-fold cross-validation over every candidate. No `scoring` is passed, so the
# estimator's own `score` method is used to rank the candidates.
grid_search = GridSearchCV(estimator=model, param_grid=parameter_space, cv=5)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': array([1, 2, 3, 4, 5, 6, 7, 8, 9])})

In [8]:
# Mean cross-validated score of the best candidate. No `scoring` was given to
# GridSearchCV, so this is the regressor's default score (R^2), not an MSE.
grid_search.best_score_

0.1804153217705335

In [9]:
# Parameter setting that achieved the best cross-validated score.
grid_search.best_params_

{'n_neighbors': 2}

In [10]:
# Keep the winning estimator; with GridSearchCV's default refit=True it has
# been refitted on all of X_train using the best parameters.
knn = grid_search.best_estimator_

In [11]:
# Predict on the rows the model was fitted on (in-sample predictions).
y_pred = knn.predict(X_train)

In [12]:
# In-sample mean squared error; compare it with the hold-out error below.
mean_squared_error(y_train, y_pred)

0.21336445537479917

In [13]:
# Predict on the held-out 20% that took no part in fitting or tuning.
y_pred_test = knn.predict(X_test)

In [14]:
# Hold-out mean squared error (the estimate of out-of-sample performance).
mean_squared_error(y_test, y_pred_test)

0.7512399114139167

### Predicting on the test set

In [15]:
# Load the raw test set. It still carries the original categorical
# `cut`, `color` and `clarity` columns and has no `price` column.
test = pd.read_csv('../data/test.csv')
test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74
1,1,0.41,Ideal,E,VS2,61.8,54.0,4.79,4.76,2.95
2,2,0.91,Very Good,E,SI2,62.5,59.0,6.16,6.23,3.87
3,3,0.42,Very Good,G,VS2,62.6,57.0,4.76,4.80,2.99
4,4,0.54,Ideal,G,IF,61.5,56.0,5.28,5.25,3.24
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,Ideal,F,SI1,61.7,56.4,5.26,5.30,3.25
13481,13481,1.12,Premium,H,VS2,60.6,59.0,6.77,6.70,4.08
13482,13482,0.37,Ideal,D,SI1,61.5,57.0,4.63,4.60,2.84
13483,13483,0.54,Good,E,SI1,59.9,63.0,5.25,5.30,3.16


In [16]:
# Working on data

# Color grades to be grouped (presumably into "colorless" vs "near colorless";
# the grouping itself happens inside the helper - see src/cleaning.py).
colorless = ['D', 'E', 'F']
near_colorless = ['G', 'H', 'I', 'J']

# Call the helper through its module. The result is stored under the name
# `new_color`, which shadows the imported function of the same name, so a bare
# `new_color(...)` call would raise TypeError the second time this cell runs.
new_color = cleaning.new_color(test.color, colorless, near_colorless)

In [17]:
# Combine the raw test frame with the grouped color data.
# NOTE(review): presumably a column-wise concat - confirm in src/cleaning.py.
to_concat_one = [test, new_color]
one = concating(to_concat_one)

In [18]:
# Remove the original `color` column now that its grouped version is present.
# NOTE(review): assumes `droping` drops the listed columns - confirm in src/cleaning.py.
to_drop_one = ['color']
two = droping(one,to_drop_one)

In [19]:
# Clarity grades to be grouped into five broader classes (the grouping itself
# happens inside the helper - see src/cleaning.py).
IF = ['IF']
VVS = ['VVS1', 'VVS2']
VS = ['VS1', 'VS2']
SI = ['SI1', 'SI2']
I = ['I1']

# Call the helper through its module. The result is stored under the name
# `new_clarity`, which shadows the imported function of the same name, so a
# bare `new_clarity(...)` call would raise TypeError on a re-run of this cell.
new_clarity = cleaning.new_clarity(two.clarity, IF, VVS, VS, SI, I)

In [20]:
# Combine the frame with the grouped clarity data.
# NOTE(review): presumably a column-wise concat - confirm in src/cleaning.py.
to_concat_two = [two, new_clarity]
three = concating(to_concat_two)

In [21]:
# Remove the original `clarity` column now that its grouped version is present.
# NOTE(review): assumes `droping` drops the listed columns - confirm in src/cleaning.py.
to_drop_two = ['clarity']
four = droping(three,to_drop_two)

In [22]:
# Give the grouped columns the original names `color` and `clarity`.
# NOTE(review): 'colorless' / 'clariness' are presumably the column names the
# helpers produce; `rename` silently ignores keys that are not present.
four = four.rename(columns={'colorless': 'color', 'clariness':'clarity'})

In [23]:
# Keep only these columns, in this order; any other column is discarded and a
# missing one raises KeyError.
four = four[['id', 'carat','cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]

In [24]:
# One-hot encode the categorical columns (cut, color, clarity); numeric columns
# pass through unchanged. Only categories present in this frame get a column.
five = pd.get_dummies(four)

In [25]:
# Inspect the encoded test features.
five

Unnamed: 0,id,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_C,color_NC,clarity_I,clarity_IF,clarity_SI,clarity_VS,clarity_VVS
0,0,0.33,61.9,55.0,4.44,4.42,2.74,0,0,1,0,0,0,1,0,1,0,0,0
1,1,0.41,61.8,54.0,4.79,4.76,2.95,0,0,1,0,0,1,0,0,0,0,1,0
2,2,0.91,62.5,59.0,6.16,6.23,3.87,0,0,0,0,1,1,0,0,0,1,0,0
3,3,0.42,62.6,57.0,4.76,4.80,2.99,0,0,0,0,1,0,1,0,0,0,1,0
4,4,0.54,61.5,56.0,5.28,5.25,3.24,0,0,1,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,61.7,56.4,5.26,5.30,3.25,0,0,1,0,0,1,0,0,0,1,0,0
13481,13481,1.12,60.6,59.0,6.77,6.70,4.08,0,0,0,1,0,0,1,0,0,0,1,0
13482,13482,0.37,61.5,57.0,4.63,4.60,2.84,0,0,1,0,0,1,0,0,0,1,0,0
13483,13483,0.54,59.9,63.0,5.25,5.30,3.16,0,1,0,0,0,1,0,0,0,1,0,0


In [26]:
# Model

# Align the test features with the columns the model was fitted on: same set,
# same order. Dummy columns for categories that do not occur in the test data
# are added and filled with 0, and any extra column is dropped (for example
# `price`, once it has been attached to `five`), so the cell can be re-run.
features_real_test = five.reindex(columns=X_train.columns, fill_value=0)

y_pred_real_test = knn.predict(features_real_test)

In [27]:
# Attach the predictions as a new `price` column. This mutates `five` in
# place, so from here on `five` is no longer a pure feature matrix.
five['price'] = y_pred_real_test

In [28]:
# Strip every feature column from the frame, leaving only what is not listed
# here (grouped below by kind for readability).
to_drop_three = [
    'carat', 'depth', 'table', 'x', 'y', 'z',
    'cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good',
    'color_C', 'color_NC',
    'clarity_I', 'clarity_IF', 'clarity_SI', 'clarity_VS', 'clarity_VVS',
]
six = droping(five, to_drop_three)

In [29]:
# Write the result to disk without the DataFrame index.
# NOTE(review): `six` presumably holds just `id` and the predicted `price` - confirm.
six.to_csv('../data/test_2.csv', index=False)