# 1.1 - Intro Machine Learning - Aprendizaje Supervisado - Regresión

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import pylab as plt
import seaborn as sns

%matplotlib inline

### 1. Obtener Datos
### 2. Definir Objetivo
### 3. Limpieza de Datos
* Valores nulos
* Datos inconsistentes
* Datos duplicados...


### 4. Buscar colinealidad



In [None]:
# HEATMAP of the pairwise correlations between the numeric columns

# NOTE(review): `df` is not created in any visible cell above -- it has to be
# loaded (sections 1-3) before this cell can run on a fresh kernel.

# Correlation is only defined for numeric columns. Selecting them explicitly
# avoids the ValueError that pandas >= 2.0 raises when text columns are present,
# and computing the matrix once lets the mask and the plot share it.
corr=df.select_dtypes(include='number').corr()

plt.figure(figsize=(15, 10))

sns.set(style='white')

mask=np.triu(np.ones_like(corr, dtype=bool))         # True on the upper triangle -> those cells are hidden

cmap=sns.diverging_palette(0, 10, as_cmap=True)      # color palette


sns.heatmap(corr,                                    # data
            mask=mask,                               # white mask
            cmap=cmap,                               # color
            center=0,                                # plot center
            square=True,                             # data square representation
            linewidths=0.5,                          # linewidth
            cbar_kws={'shrink': 0.5});               # lateral bar legend

# SCATTER MATRIX: every column of df plotted against every other one

pd.plotting.scatter_matrix(frame=df, alpha=0.2, figsize=(15, 10));

# COMPARING 2 VARIABLES: price on the horizontal axis, carat on the vertical axis

fig, ax = plt.subplots(figsize=(15, 10))

ax.scatter(df['price'], df['carat'])
ax.set_xlabel('price')
ax.set_ylabel('carat');

# BOX-COX TRANSFORM of carat, plotted against price

from scipy.stats import boxcox

# lmbda is fixed, so the result is used directly as the transformed values
carat_boxcox=boxcox(df['carat'], lmbda=2.618033)

fig, ax = plt.subplots(figsize=(15, 10))

ax.scatter(df['price'], carat_boxcox)
ax.set_xlabel('price')
ax.set_ylabel('carat_boxcox');


###  5. Separación de datos

Antes de transformar definitivamente nada, vamos a separar los datos en X e y.

y será la columna objetivo, que nunca se toca, nunca se transforma en ningún sentido. 
X serán el resto de columnas, las características con las que realizaremos nuestras predicciones.


In [None]:
# features: every column of df except the target
X=df.drop(columns='price')

# target: the price column
y=df['price']

X.head()

In [None]:
# BOX-COX TRANSFORM
# Every step in this cell reads from the untouched `df` rather than from the
# current X, so running the cell again yields the same X instead of
# transforming already-transformed values.

X['carat']=boxcox(df['carat'], lmbda=2.618033)

# DROP COLUMN: low correlation with y

X=X.drop(columns='table', errors='ignore')   # errors='ignore': the column is already gone on a re-run

# STANDARDIZE COLUMNS?
# NOTE(review): the scaler is fitted on all rows, before the train/test split of
# section 7, so test-set statistics leak into the training features -- consider
# fitting it on X_train only.

from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()

scale_cols=['depth', 'x', 'y', 'z']

X[scale_cols]=scaler.fit_transform(df[scale_cols])

############################
# Alternative: standardize every numeric column of `df` in one go and keep the
# text columns unchanged.
# NOTE(review): this also rescales `price`, the target column -- confirm that is intended.

numeric_df=df._get_numeric_data()            # numeric columns, selected once

data_num=pd.DataFrame(StandardScaler().fit_transform(numeric_df),  # standardize numeric columns
                      columns=numeric_df.columns,
                      index=df.index)        # keep df's row labels; a fresh 0..n-1 index would
                                             # misalign with data_obj in the concat below

data_obj=df.select_dtypes(include='object')  # get categoric columns


data=pd.concat([data_num, data_obj], axis=1)   # concatenate both dataframes



### 6. Transformar variables categóricas 

In [None]:
# one-hot encoding (dummy variables) of `cut`; drop_first leaves out the first level

X=pd.get_dummies(data=X, columns=['cut'], drop_first=True)


# label encoder: every distinct clarity value is replaced by an integer code

from sklearn.preprocessing import LabelEncoder

clarity_encoder=LabelEncoder()

X['clarity']=clarity_encoder.fit_transform(X['clarity'])

######################################
# Same idea applied to `data`: integer-encode every object-dtype column,
# reusing a single encoder that is re-fitted for each column

le=LabelEncoder()

for c in data.select_dtypes(include='object').columns:

    data[c]=le.fit_transform(data[c].astype(str))
        

# custom label encoder: this is where expert knowledge comes in --
# each color grade gets a hand-picked numeric weight

color={'J':1, 'I':4, 'H':10, 'G':15, 'F':25, 'E':45, 'D':67}


# dictionary lookup per row; a grade missing from `color` raises KeyError
X['color']=X['color'].map(lambda grade: color[grade])


### 7. Dividir muestra entre train y test

In [None]:
from sklearn.model_selection import train_test_split


# 80 % of the rows for training, 20 % held out for testing; fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# shape of each of the four pieces
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### 8. Crear modelos predictivos

##### Regresión Lineal

In [None]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.linear_model import LinearRegression as LinReg

# instantiate each model with its default hyperparameters
linreg, lasso, ridge, elastic = LinReg(), Lasso(), Ridge(), ElasticNet()

##### SVR

In [None]:
from sklearn.svm import SVR  # support vector regressor

# instantiate with default hyperparameters
svr=SVR()

##### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor as RFR
# ExtraTreesRegressor is the ensemble of extremely randomized trees;
# sklearn.tree.ExtraTreeRegressor is a single such tree, meant to be used only inside ensembles.
from sklearn.ensemble import ExtraTreesRegressor as ETR

# fixed seed so both forests give the same result on every run
rfr=RFR(random_state=42)
etr=ETR(random_state=42)

##### Boosting

In [None]:
# packages needed by this cell (install once if missing):
#!pip install xgboost
#!pip install catboost
#!pip install lightgbm

from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost import XGBRegressor as XGBR
from catboost import CatBoostRegressor as CTR
from lightgbm import LGBMRegressor as LGBMR


# one default-configured regressor per boosting library
gbr, xgbr, ctr, lgbmr = GBR(), XGBR(), CTR(), LGBMR()

### 9. Entrenamiento

##### Regresión Lineal

In [None]:
# fit each linear model on the training split
for linear_model in (linreg, lasso, ridge, elastic):
    linear_model.fit(X_train, y_train)

elastic   # fit() returns the estimator itself, so this shows the same output as a final elastic.fit(...)

##### SVR

In [None]:
# fit the support vector regressor on the training split
svr.fit(X_train, y_train)

##### Random Forest

In [None]:
# fit both tree-based models on the training split
for tree_model in (rfr, etr):
    tree_model.fit(X_train, y_train)

etr   # fit() returns the estimator itself, so this shows the same output as a final etr.fit(...)

##### Boosting

In [None]:
# fit every boosting model on the training split
gbr.fit(X_train, y_train)
xgbr.fit(X_train, y_train)
ctr.fit(X_train, y_train, verbose=0)   # verbose=0: presumably silences CatBoost's training log -- confirm
lgbmr.fit(X_train, y_train)

##### A cuchillo

In [None]:
# the same training step written as a loop over a hand-picked list of models
modelos=[linreg, lasso, xgbr, lgbmr]

for modelo in modelos:
    modelo.fit(X_train, y_train)

### 10. Predicción

##### Regresión Lineal

In [None]:
# A cell only displays its last expression, so four bare predict() calls would
# show just the ElasticNet output. The first 10 test-set predictions of every
# linear model are collected into one table, one column per model.
pd.DataFrame({'linreg': linreg.predict(X_test)[:10],
              'lasso': lasso.predict(X_test)[:10],
              'ridge': ridge.predict(X_test)[:10],
              'elastic': elastic.predict(X_test)[:10]})

##### SVR

In [None]:
# first 10 predictions of the SVR on the test split
svr.predict(X_test)[:10]

##### Random Forest

In [None]:
# A cell only displays its last expression, so the first 10 test-set predictions
# of both models are collected into one table, one column per model.
pd.DataFrame({'rfr': rfr.predict(X_test)[:10],
              'etr': etr.predict(X_test)[:10]})

##### Boosting

In [None]:
# A cell only displays its last expression, so the first 10 test-set predictions
# of every boosting model are collected into one table, one column per model.
pd.DataFrame({'gbr': gbr.predict(X_test)[:10],
              'xgbr': xgbr.predict(X_test)[:10],
              'ctr': ctr.predict(X_test)[:10],
              'lgbmr': lgbmr.predict(X_test)[:10]})

### 11. Evaluación

In [None]:
from sklearn.metrics import mean_squared_error as mse  # error cuadratico medio

##### Regresión Lineal

In [None]:
y_pred=linreg.predict(X_test)

# RMSE is the square root of the MSE. The `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and later removed, so the root is taken explicitly.
mse(y_test, y_pred) ** 0.5  # RMSE

##### SVR

In [None]:
y_pred=svr.predict(X_test)

# RMSE is the square root of the MSE. The `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and later removed, so the root is taken explicitly.
mse(y_test, y_pred) ** 0.5  # RMSE

##### Random Forest

In [None]:
y_pred=rfr.predict(X_test)

# RMSE is the square root of the MSE. The `squared=False` shortcut was deprecated
# in scikit-learn 1.4 and later removed, so the root is taken explicitly.
mse(y_test, y_pred) ** 0.5  # RMSE

##### Boosting

In [None]:
# RMSE of each boosting model on the test split.
# A cell only displays its last expression, so both scores are gathered in one
# Series instead of two bare calls (the first of which would never be shown).
# The root is taken explicitly because the `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and later removed.
rmse_boosting={}

for name, model in [('xgbr', xgbr), ('ctr', ctr)]:

    y_pred=model.predict(X_test)          # after the loop y_pred holds the CatBoost predictions

    rmse_boosting[name]=mse(y_test, y_pred) ** 0.5  # RMSE

pd.Series(rmse_boosting, name='RMSE')

### 12. H2O

In [None]:
#!pip install h2o

import h2o
from h2o.automl import H2OAutoML

##### Inicializamos modelo H2O

In [None]:
# initialize the H2O session used by the cells below
h2o.init()

##### Parsear datos para H2O

In [None]:
# `train` and `test` are not defined in any earlier cell. They are rebuilt here
# from the split of section 7 so that each frame carries the features plus the
# `price` target column.
train=pd.concat([X_train, y_train], axis=1)
test=pd.concat([X_test, y_test], axis=1)

h2train=h2o.H2OFrame(train)
h2test=h2o.H2OFrame(test)


# NOTE: from here on X and y hold H2O column names, no longer the pandas objects of section 5
X=[c for c in h2train.columns if c!='price']

y='price'

h2train.columns   # last expression, so the column list is shown as the cell output

##### Iniciar auto-machine-learning

In [None]:
# AutoML run limited by max_models / max_runtime_secs, with models ranked by RMSE
automl=H2OAutoML(
    max_models=50,
    max_runtime_secs=300,
    sort_metric='RMSE',
    seed=42,   # random_state
)

##### Entrenamiento

In [None]:
# train AutoML: x is the list of feature column names, y the target column name
automl.train(x=X,
             y=y,
             training_frame=h2train)

In [None]:
print('[INFO] Leader board:')

leader_board=automl.leaderboard

# end the cell with the frame itself so the ranking announced above is actually displayed
leader_board.head(rows=leader_board.nrows)

##### Predicción del líder

In [None]:
# predictions of the AutoML leader model on the H2O test frame
y_pred=automl.leader.predict(h2test)

### 13. Regresión logística

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): X_train / y_train are still the regression split of section 7
# (y is the `price` column). LogisticRegression is a classifier, and reading
# column 1 with a 0.3 cut-off only makes sense for a binary target -- confirm
# which target this cell is meant to use.

logreg=LogisticRegression(max_iter=2000)


logreg.fit(X_train, y_train)


# A cell only displays its last expression, so intermediate results are shown
# with display() instead of being left as bare expressions.

y_pred=logreg.predict(X_test)

display(y_pred[:10])          # predicted classes


proba=logreg.predict_proba(X_test)   # one column per class; computed once and reused below

display(proba[:10])


y_prob=proba[:, 1]            # probability of the second class

np.where(y_prob[:10]<0.3, 0, 1)  # 1 if prob>=0.3

### 14. Más modelos

In [None]:
# Bayes
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.naive_bayes import ComplementNB as CNB
from sklearn.naive_bayes import BernoulliNB as BNB

# knn
from sklearn.neighbors import KNeighborsClassifier as KNNC

# boosting
from sklearn.ensemble import GradientBoostingClassifier as GBC