# Técnicas de otimização e ajuste fino

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

### Overfitting, Underfitting e generalização 

### Validação cruzada

In [2]:
df = pd.read_csv('house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
df = df[['price', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront']]

In [4]:
X = df.drop('price', axis=1)
y = df['price']

In [8]:
#Padronizando a escala dos valores numéricos do dataset
min_max_scaler = StandardScaler() 
X = min_max_scaler.fit_transform(X)


In [9]:
lr = LinearRegression()

In [10]:
#Definindo o número de folds
k = 5

In [12]:
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [13]:
#Armazenará os scores de cada fold
mape_score = []

In [15]:
for train_index, test_index in kf.split(X):
    X_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    #Treinando o modelo no conjunto de treino
    lr.fit(X_train, y_train)
    
    #Fazendo previsões no condjunto de validação
    y_pred = lr.predict(x_test)
    
    #Caculando o erro percentula absoluto medio MAPE
    mape = mean_absolute_percentage_error(y_test, y_pred)
    
    mape_score.append(mape)
    
mape_score

[0.41840919858095665,
 0.42906203759883993,
 0.41134839429415065,
 0.4188623637607351,
 0.4328839105767672]

In [16]:
mape_mean = np.mean(mape_score)

print(f'MAPE medio : {mape_mean}')

MAPE medio : 0.4221131809622899


### Separando um conjunto de 10% dos dados para um teste final

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [18]:
lr = LinearRegression()

#Número de folds
k = 5

In [19]:
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [20]:
#Armazenará os scores de cada fold
mape_score = []

In [21]:
for train_index, test_index in kf.split(X_train, y_train):
    X_train_cv, x_test_cv = X[train_index], X[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]
    
    #Treinando o modelo no conjunto de treino
    lr.fit(X_train_cv, y_train_cv)
    
    #Fazendo previsões no condjunto de validação
    y_pred = lr.predict(x_test_cv)
    
    #Caculando o erro percentula absoluto medio MAPE
    mape = mean_absolute_percentage_error(y_test_cv, y_pred)
    
    mape_score.append(mape)
    
mape_score

[0.4307664291904637,
 0.4255391609115921,
 0.41071971089711573,
 0.42886309776887593,
 0.43149647105725536]

In [22]:
mape_mean = np.mean(mape_score)

print(f'MAPE medio : {mape_mean}')

MAPE medio : 0.4254769739650605


In [24]:
y_pred_2 = lr.predict(X_test)

In [26]:
mape_test = mean_absolute_percentage_error(y_test, y_pred_2)
mape_test

0.42909271482568107

### Ajustes de hiperparâmetros

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [28]:
parameters = {'kernel' : ('linear', 'rbf'), 'C':[1, 10]}

In [30]:
from sklearn.svm import SVR
svr = SVR()

In [31]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(svr, parameters)

In [32]:
clf.fit(X_train, y_train)

In [33]:
df_results = pd.DataFrame(clf.cv_results_)
df_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5.275031,0.056353,0.539141,0.015217,1,linear,"{'C': 1, 'kernel': 'linear'}",-0.027778,-0.023031,-0.036524,-0.021271,-0.030588,-0.027838,0.005464,2
1,7.105284,0.066394,5.522538,0.065668,1,rbf,"{'C': 1, 'kernel': 'rbf'}",-0.057774,-0.05132,-0.063364,-0.051429,-0.061851,-0.057148,0.005056,4
2,5.065359,0.044133,0.523676,0.014756,10,linear,"{'C': 10, 'kernel': 'linear'}",0.114536,0.110037,0.10053,0.115728,0.11629,0.111424,0.005873,1
3,7.031441,0.036084,5.480327,0.018424,10,rbf,"{'C': 10, 'kernel': 'rbf'}",-0.04025,-0.033184,-0.046283,-0.031215,-0.040194,-0.038225,0.005431,3


In [34]:
clf.best_estimator_

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Definindo os parâmetros a serem ajustados
parameters = {'kernel' : ('linear', 'rbf'), 'C':[1, 10]}

# Criando o modelo
svr = SVR()

#Ajuste ifno com validação cruzada
clf = GridSearchCV(svr, parameters, cv=10)

#Treinando o modelo com otimização
clf.fit(X_train, y_train)

KeyboardInterrupt: 