# Ridge Regressor

## 1. Importación de librerías, establecimiento de directorio de trabajo y carga de datos

In [1]:
# librerias principales para uso y visualización de datos
import os
import numpy as np
import pandas as pd
import warnings  
warnings.filterwarnings('ignore')

# dividir datos para modelo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# algoritmo
from sklearn.linear_model import Ridge

# metricas de rendimiento
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold

# guardar el modelo
import pickle

# para visualización
import seaborn as sns  
import matplotlib.pyplot as plt  
from mpl_toolkits.mplot3d import Axes3D  
%matplotlib inline  

os.chdir("C:/Users/aleja/OneDrive - Universidad Politécnica de Madrid/Universidad/Asignaturas/Cuarto/Trabajo de Fin de Grado")

In [2]:
accesos = pd.read_csv("Resultados/Accesos_Calidad_Vida.csv", index_col="Unnamed: 0")
accesos.head()

Unnamed: 0,latitud,longitud,edad,calidad_vida
0,41.621468,2.068474,11,52.885748
1,41.60027,2.085002,12,74.793875
2,41.616524,2.089927,12,79.858657
3,41.61882,2.08948,11,79.858657
4,41.618908,2.089475,12,79.858657


## 2. Construcción de la matriz de características (X) y el vector variable dependiente (y)

In [3]:
X = accesos.iloc[:, :-1].values
y = accesos.iloc[:, -1].values

## 3. División del dataset en set de entrenamiento y de test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## 4. Feature scaling of X (rango: -3 a 3)

In [5]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## 5. Construcción y entrenamiento del modelo de regresión sobre el set de entrenamiento

In [6]:
regressor = Ridge()
regressor.fit(X_train, y_train)

Ridge()

## 6. Predicción de resultados del set de test

In [7]:
y_pred = regressor.predict(X_test)

## 7.  Métricas de rendimiento
Comparamos el set de test con el predicho

In [8]:
# R2score
print('R2score:', r2_score(y_test, y_pred))

# Mean Absolute Error
print('MAE: \t', mean_absolute_error(y_test, y_pred))

# Mean Squared Error
print('MSE: \t', mean_squared_error(y_test, y_pred))

# Root Mean Squared Error
print("RMSE: ", mean_squared_error(y_test, y_pred, squared=False))

R2score: 0.6447531391961585
MAE: 	 8.44677689264021
MSE: 	 140.89567646731777
RMSE:  11.869948461021968


### 7.1. Para ganar más intuición

In [9]:
np.set_printoptions(precision=2) # redondea a dos decimales

print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[86.01 79.86]
 [35.25 45.23]
 [38.99 34.51]
 ...
 [45.63 34.51]
 [52.13 52.89]
 [58.18 65.14]]


### 7.2 Más métricas de rendimiento

In [10]:
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)

print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 66.46 %
Standard Deviation: 1.53 %


Vemos que la precision y los errores no son muy buenos, por lo que probaremos otros parámetros para ver si ofrece alguno más óptimo
### 7.3 Grid Search

In [11]:
parameters = {
    'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
    'fit_intercept':[True, False],
    'normalize':[True, False]
}

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

grid_search = GridSearchCV(estimator = regressor,
                           param_grid = parameters,
                           # scoring = 'neg_mean_absolute_error',
                           cv = cv,
                           n_jobs = -1)

grid_search.fit(X_train, y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 66.48 %
Accuracy: 66.46 %
Best Parameters: {'alpha': 0.1, 'fit_intercept': True, 'normalize': False, 'solver': 'sag'}


960 fits failed out of a total of 7680.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
960 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\aleja\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aleja\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py", line 1011, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "C:\Users\aleja\anaconda3\lib\site-packages\sklearn\linear_model\_ridge.py", line 705, in fit
    raise ValueError(
ValueError: 'lbfgs' solver can be used only when positive=True. Please use another solver.

   0.66   0.66   0.66   0.66   0.66  

## 8. Guardar el mejor modelo

In [None]:
with open('Modelos/Ridge_Regressor.pkl','wb') as f:
    pickle.dump(grid_search,f)

## 9. Ejemplo de predicción

In [12]:
print(regressor.predict(sc.transform([['425060.955125', '4.606138e+06', '22']]))) 

[6.32e+09]


## 9. Obtener los coeficientes de la regresión lineal múltiple

In [13]:
print(regressor.coef_)
print(regressor.intercept_)

[-4.79 15.19 -1.02]
67.02454557070422
