In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns
import pickle

# para calcular las métricas

from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn import tree

from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
plt.rcParams["figure.figsize"] = (10,8)

In [2]:
df_tt = pd.read_csv("../data/preproc.csv", index_col = 0)
df_tt.head()

Unnamed: 0_level_0,carat,cut,color,clarity,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-1.128026,1,0,6,6.353
1,0.669489,0,1,2,9.183
2,-0.064708,0,2,4,7.983
3,0.846708,2,3,6,8.371
4,-0.976124,1,3,3,6.588


In [3]:
X = df_tt.drop('price', axis =1)
y = df_tt['price']

# Pipeline

In [4]:
pipe = Pipeline(steps=[('estimator', LinearRegression())])

In [5]:
params_grid = [{
                'estimator':[LinearRegression()],
                },
                {
                'estimator': [DecisionTreeRegressor()],
                'estimator__max_depth': [1,2,3,4,5],
                'estimator__max_features': ["sqrt", "log2"],
                'estimator__min_samples_split': [10, 50, 100], 
                'estimator__min_samples_leaf':[1,2,3],
                },
                {'estimator':[RandomForestRegressor()],
                 'estimator__n_estimators':[50,100,200,300],
                 'estimator__max_depth':[1,2,3,4,5], 
                 'estimator__min_samples_split':[2,3], 
                 'estimator__min_samples_leaf':[1,2,3],
                },
                {'estimator':[KNeighborsRegressor()],
                 'estimator__n_neighbors':[3,5,6,7],
                }

              ] 

In [6]:
grid = GridSearchCV(pipe, params_grid, n_jobs= -1, verbose = 4)

In [7]:
%%time
res = grid.fit(X, y)

Fitting 5 folds for each of 215 candidates, totalling 1075 fits
[CV 3/5] END ......estimator=LinearRegression();, score=0.775 total time=   0.0s
[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.020 total time=   0.0s
[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=50;, score=0.727 total time=   0.0s
[CV 4/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=50;, score=0.030 total time=   0.0s
[CV 2/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=100;, score=0.024 total time=   0.0s
[CV 5/5] END estimator=DecisionTreeRegressor(), estimator__max

[CV 2/5] END ......estimator=LinearRegression();, score=0.770 total time=   0.0s
[CV 4/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.721 total time=   0.0s
[CV 2/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=50;, score=0.725 total time=   0.0s
[CV 5/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=50;, score=0.717 total time=   0.0s
[CV 3/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=2, estimator__min_samples_split=50;, score=0.728 total time=   0.0s
[CV 3/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_le

[CV 4/5] END ......estimator=LinearRegression();, score=0.775 total time=   0.0s
[CV 5/5] END ......estimator=LinearRegression();, score=0.764 total time=   0.0s
[CV 3/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.728 total time=   0.0s
[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=100;, score=0.026 total time=   0.0s
[CV 4/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=100;, score=0.015 total time=   0.0s
[CV 2/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=2, estimator__min_samples_split=10;, score=0.725 total time=   0.0s
[CV 4/5] END estimator=DecisionTreeRegressor

[CV 1/5] END ......estimator=LinearRegression();, score=0.764 total time=   0.0s
[CV 2/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.024 total time=   0.0s
[CV 5/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.019 total time=   0.0s
[CV 3/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=50;, score=0.029 total time=   0.0s
[CV 3/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_leaf=1, estimator__min_samples_split=100;, score=0.728 total time=   0.0s
[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=1, estimator__max_features=sqrt, estimator__min_samples_l

[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.859 total time=   0.0s
[CV 2/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.850 total time=   0.0s
[CV 3/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.769 total time=   0.0s
[CV 4/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=10;, score=0.675 total time=   0.0s
[CV 4/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=2, estimator__min_samples_split=100;, score=0.859 total time=   0.0s
[CV 5/5] 

[CV 4/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=sqrt, estimator__min_samples_leaf=3, estimator__min_samples_split=50;, score=0.878 total time=   0.0s
[CV 5/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=sqrt, estimator__min_samples_leaf=3, estimator__min_samples_split=50;, score=0.846 total time=   0.0s
[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=sqrt, estimator__min_samples_leaf=3, estimator__min_samples_split=100;, score=0.866 total time=   0.0s
[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=2, estimator__min_samples_split=50;, score=0.862 total time=   0.0s
[CV 2/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=2, estimator__min_samples_split=50;, score=0.499 total time=   0.0s
[CV 3/5] 

[CV 5/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=3, estimator__min_samples_split=50;, score=0.708 total time=   0.2s
[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=3, estimator__min_samples_split=100;, score=0.812 total time=   0.1s
[CV 2/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=3, estimator__min_samples_split=100;, score=0.595 total time=   0.1s
[CV 3/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=3, estimator__min_samples_split=100;, score=0.768 total time=   0.0s
[CV 4/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=3, estimator__min_samples_split=100;, score=0.855 total time=   0.0s
[CV 4/

[CV 4/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=100;, score=0.300 total time=   0.0s
[CV 5/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=1, estimator__min_samples_split=100;, score=0.703 total time=   0.0s
[CV 1/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=2, estimator__min_samples_split=10;, score=0.881 total time=   0.0s
[CV 2/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=2, estimator__min_samples_split=10;, score=0.857 total time=   0.0s
[CV 3/5] END estimator=DecisionTreeRegressor(), estimator__max_depth=4, estimator__max_features=log2, estimator__min_samples_leaf=2, estimator__min_samples_split=10;, score=0.849 total time=   0.0s
[CV 4/5]

CPU times: user 2.25 s, sys: 871 ms, total: 3.12 s
Wall time: 5min 59s


In [8]:
best_model = grid.best_params_
best_model

{'estimator': KNeighborsRegressor(), 'estimator__n_neighbors': 5}

In [9]:
knn = KNeighborsRegressor(n_neighbors = 5)

In [10]:
knn.fit(X, y)

KNeighborsRegressor()

# Predicción

In [11]:
X_test = pd.read_csv("../data/test_limpio_.csv", index_col = 0)
X_test.head(2)

Unnamed: 0,carat,cut,color,clarity
0,-1.023184,0,5,5
1,0.94188,1,5,5


In [12]:
y_pred_knn_test = knn.predict(X_test)
y_pred_knn_train = knn.predict(X)

In [13]:
y_pred_knn_test

array([6.2784, 8.3964, 9.3708, ..., 6.6374, 8.589 , 7.8802])

# Validación del Modelo¶

In [14]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y, y_pred_knn_train))
print('Mean Squared Error:', metrics.mean_squared_error(y, y_pred_knn_train))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred_knn_train)))
print("R2:",  metrics.r2_score(y, y_pred_knn_train))

Mean Absolute Error: 0.10800830814961607
Mean Squared Error: 0.05299507879613575
Root Mean Squared Error: 0.23020660024451026
R2: 0.9487328185221411


In [15]:
def metricas(y, y_pred_knn_test, y_pred_knn_train, tipo_modelo):
    
    
    resultados = {'Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y, y_pred_knn_train))}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [16]:
KNN_results = metricas(y, y_pred_knn_test, y_pred_knn_train,"KNeighborsRegressor")

In [17]:
KNN_results

Unnamed: 0,0,modelo
0,0.230207,KNeighborsRegressor
1,Root Mean Squared Error:,KNeighborsRegressor
