# 2.1 - Evaluación Supervisado

### Regresión

In [None]:
import warnings
# Silence every warning for the rest of the session (this also hides library deprecation notices).
warnings.filterwarnings('ignore')

import pandas as pd
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

from sklearn.ensemble import RandomForestRegressor as RFR    # the model; the alias is a personal choice

from sklearn.model_selection import train_test_split as tts  # the alias is a personal choice

from sklearn.datasets import load_diabetes   # dataset

In [None]:
# Load scikit-learn's diabetes dataset; later cells read its 'data', 'target',
# 'feature_names' and 'DESCR' entries.
data = load_diabetes()

In [None]:
# Dataset description, split into one list item per line.
data['DESCR'].split('\n')

In [None]:
# Feature table with the regression target appended as the last column.
df = pd.DataFrame(data=data['data'], columns=data['feature_names']).assign(target=data['target'])

df.head()

In [None]:
# Keep 75% of the rows for training and the rest for evaluation. The fixed
# random_state makes the split (and therefore every metric computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = tts(data['data'], data['target'], train_size=0.75, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Model

rfr = RFR(random_state=42)     # random forest regressor; seeded so reruns build the same trees

rfr.fit(X_train, y_train)      # train the model

y_pred = rfr.predict(X_test)   # one prediction per test row

In [None]:
# First ten predicted target values.
y_pred[:10]

In [None]:
# Shape of the test feature matrix.
X_test.shape

In [None]:
# Shapes of the true and predicted targets, shown side by side for comparison.
y_test.shape, y_pred.shape

In [None]:
# Mean and standard deviation of the true test targets.
y_test.mean(), y_test.std()

###### MSE


$$MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^{2}$$


pertenece al intervalo [0, +$\infty$)

In [None]:
from sklearn.metrics import mean_squared_error as mse    # personal alias


# Mean squared error between the true test targets and the predictions.
mse(y_test, y_pred)

###### RMSE


$$RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^{2}}$$


pertenece al intervalo [0, +$\infty$)

In [None]:
# RMSE through scikit-learn. The `squared=False` flag of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so use the dedicated
# function and fall back to the legacy flag only on older releases.
try:
    from sklearn.metrics import root_mean_squared_error as rmse
except ImportError:  # scikit-learn < 1.4
    def rmse(y_true, y_pred):
        """Return the root mean squared error using the legacy `squared=False` flag."""
        return mse(y_true, y_pred, squared=False)

rmse(y_test, y_pred)

In [None]:
# Same RMSE computed by hand: square root of the MSE.
mse(y_test, y_pred) ** 0.5

###### RMSLE


$$RMSLE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(\log(1+y_i)-\log(1+\hat{y}_i))^{2}}$$


pertenece al intervalo [0, +$\infty$)

In [None]:
from sklearn.metrics import mean_squared_log_error as msle

# RMSLE = sqrt(MSLE). Taking the root explicitly works on every scikit-learn
# version; the `squared=False` flag was deprecated in 1.4 and removed in 1.6.
msle(y_test, y_pred) ** 0.5

###### MAE


$$MAE = \frac{1}{n}\sum_{i=1}^{n}|y_i-\hat{y}_i|$$


pertenece al intervalo [0, +$\infty$)

In [None]:
from sklearn.metrics import mean_absolute_error as mae

# Mean absolute error between the true test targets and the predictions.
mae(y_test, y_pred)

$$MAE \leq RMSE \leq MAE \cdot \sqrt{n}$$

###### R2


$$R2 = 1 - \frac{\sum_{i=1}^{n}(y_i-\hat{y}_i)^{2}}{\sum_{i=1}^{n}(y_i-\bar{y})^{2}}$$

###### Adjusted R2

$$AdjustedR2 = 1-(1-R2)\frac{n-1}{n-p-1}$$


donde:
+ n = tamaño de la muestra
+ p = nº de variables del modelo


pertenecen al intervalo (-$\infty$, 1]

In [None]:
rfr.score(X_test, y_test)    # R2; the model computes the predictions internally

In [None]:
from sklearn.metrics import r2_score as r2

# R2 computed from the stored predictions instead of through the model.
r2(y_test, y_pred)

In [None]:
def r2_ajustado(r_2, n=None, p=None):
    """Return the adjusted R2 for a given R2 score.

    Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)

    Parameters
    ----------
    r_2 : float
        Plain R2 score.
    n : int, optional
        Number of samples the score was computed on. When omitted, the number
        of rows of the notebook-level ``X_test`` is used (original behaviour).
    p : int, optional
        Number of features of the model. When omitted, the number of columns
        of the notebook-level ``X_test`` is used (original behaviour).

    Returns
    -------
    float
        The adjusted R2.

    Raises
    ------
    ValueError
        If ``n - p - 1`` is not positive, in which case the adjustment is
        undefined (division by zero) or meaningless (sign flip).
    """
    # Only touch the global test matrix for the arguments that were not given,
    # so the function can be used safely after X_test has been reassigned.
    if n is None:
        n = X_test.shape[0]
    if p is None:
        p = X_test.shape[1]

    dof = n - p - 1
    if dof <= 0:
        raise ValueError(
            'adjusted R2 needs n - p - 1 > 0, got n={} and p={}'.format(n, p)
        )

    return 1 - (1 - r_2) * (n - 1) / dof

In [None]:
# Adjusted R2 of the predictions; sample and feature counts come from X_test.
r2_ajustado(r2(y_test, y_pred))

### Clasificación

In [None]:
# Load the churn data and discard rows with missing values.
data = pd.read_csv('../data/churn.csv').dropna()

# Binary target: 1 when the 'Churn' column is 'Yes', 0 otherwise.
y = data['Churn'].apply(lambda label: 1 if label == 'Yes' else 0)

# Drop the identifier and both target columns, then encode the remaining
# features with dummy variables.
data = pd.get_dummies(data.drop(columns=['customerID', 'ChurnBinary', 'Churn']))

X = data.copy()

X.head()

In [None]:
# Stratify on the target so both partitions keep the same class ratio; the
# fixed random_state makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = tts(X, y, stratify=y, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Model 1

from sklearn.linear_model import LogisticRegression as LogReg

logreg = LogReg().fit(X_train, y_train)   # fit() returns the fitted estimator

y_pred_logreg = logreg.predict(X_test)    # predicted class labels (0/1)

y_prob = logreg.predict_proba(X_test)     # predicted probabilities

y_pred_logreg[:4]

In [None]:
# Predicted probabilities for the first four test rows.
y_prob[:4]

In [None]:
# Model 2

from sklearn.ensemble import RandomForestClassifier as RFC


rfc = RFC(random_state=42)   # seeded so reruns build the same forest

rfc.fit(X_train, y_train)

y_pred_rfc = rfc.predict(X_test)   # predicted class labels


y_pred_rfc[:4]

+ TP := True Positive (aciertos clase 1)
+ TN := True Negative (aciertos clase 0)
+ FP := False Positive (Error tipo I, decir 1 cuando es 0)
+ FN := False Negative (Error tipo II, decir 0 cuando es 1)

+ Accuracy  := (TP+TN)/(TP+TN+FP+FN) (acierto)  ($\frac{1}{n}\sum_{i=1}^{n} \mathbf{1}(\hat{y}_i=y_i)$)
+ Precision := TP/(TP+FP)
+ Recall    := TP/(TP+FN)  (Sensibilidad, TPR)
+ F1_Score  := 2·Recall·Precision/(Recall+Precision)

(F1 funciona mejor que el accuracy cuando los datos no están balanceados y cuando FP y FN son muy diferentes)

![f1](images/f1.png)

##### Accuracy

In [None]:
# Number of test samples per class.
y_test.value_counts()

In [None]:
logreg.score(X_test, y_test)  # accuracy (hit rate)

In [None]:
from sklearn.metrics import accuracy_score as acc

# Accuracy of the logistic regression, computed from its stored predictions.
acc(y_test, y_pred_logreg)

In [None]:
# Score of the random forest classifier on the test set.
rfc.score(X_test, y_test)

In [None]:
# Accuracy of the random forest, computed from its stored predictions.
acc(y_test, y_pred_rfc)

##### Precision

In [None]:
from sklearn.metrics import precision_score as prec

# Precision of the logistic regression predictions.
prec(y_test, y_pred_logreg)

In [None]:
# Precision of the random forest predictions.
prec(y_test, y_pred_rfc)

##### Recall

In [None]:
from sklearn.metrics import recall_score as rec

# Recall of the logistic regression predictions.
rec(y_test, y_pred_logreg)

In [None]:
# Recall of the random forest predictions.
rec(y_test, y_pred_rfc)

##### F1_Score

In [None]:
from sklearn.metrics import f1_score as f1

# F1 score of the logistic regression predictions.
f1(y_test, y_pred_logreg)

In [None]:
# F1 score of the random forest predictions.
f1(y_test, y_pred_rfc)

##### Matriz de Confusión

![conf_matrix](images/conf_matrix.jpeg)

In [None]:
from sklearn.metrics import confusion_matrix as cm


# Confusion matrix (raw counts) of the logistic regression predictions.
cm(y_test, y_pred_logreg)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Confusion matrix of the random forest, expressed as fractions of all test samples.
conf_rfc = cm(y_test, y_pred_rfc)
conf_frac = conf_rfc / conf_rfc.sum()

plt.figure(figsize=(15, 8))

ax = sns.heatmap(conf_frac, annot=True)

ax.set_title('Matriz confusion')
ax.set_ylabel('Verdad')
ax.set_xlabel('Prediccion')
plt.show();

##### ROC-AUC  (Característica operativa del receptor y área debajo de la curva)

+ TPR := TP/(TP+FN)
+ FPR := FP/(TN+FP)


![roc](images/roc.png)

In [None]:
from sklearn.metrics import roc_curve as roc

from sklearn.metrics import roc_auc_score as auc

In [None]:
# Re-applies the same global warning filter that the first code cell already sets.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# ROC curve built from hard 0/1 predictions instead of scores: this is why the
# curve is drawn badly. Kept on purpose as the counter-example.
fpr, tpr, _ = roc(y_test, y_pred_logreg)
a = auc(y_test, y_pred_logreg)

fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(fpr, tpr)
ax.plot(fpr, fpr, 'r--')   # diagonal reference line

ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('Binary ROC CURVE - AUC={:.3f}'.format(a))

plt.show();

In [None]:
# Keep only the second probability column (index 1) for every test row.
proba_matrix = logreg.predict_proba(X_test)
y_prob = proba_matrix[:, 1]

y_prob[:5]

In [None]:
# Full probability rows for the first three test samples.
logreg.predict_proba(X_test)[:3]

In [None]:
# Correctly drawn ROC curve: built from probabilities, not from hard labels.
fpr, tpr, _ = roc(y_test, y_prob)
a = auc(y_test, y_prob)

fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(fpr, tpr)
ax.plot(fpr, fpr, 'r--')   # diagonal reference line

ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('Binary ROC CURVE - AUC={:.3f}'.format(a))

plt.show();


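**Kappa de Cohen: mide el acuerdo entre las predicciones y las etiquetas reales descontando el acuerdo esperado por azar (0 ≈ clasificador aleatorio, 1 = acuerdo perfecto). No es una probabilidad.**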

https://es.wikipedia.org/wiki/Coeficiente_kappa_de_Cohen

In [None]:
from sklearn.metrics import cohen_kappa_score as kappa

In [None]:
# Cohen's kappa between the true labels and the logistic regression predictions.
kappa(y_test, y_pred_logreg)

In [None]:
# Multiclass

In [None]:
from sklearn.datasets import load_wine

# Load the dataset once and unpack features and labels together, instead of
# calling the loader separately for each of them.
X, y = load_wine(return_X_y=True)

# Fixed random_state so the split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Dataset description, split into one list item per line.
load_wine()['DESCR'].split('\n')

In [None]:
# Random forest on the multiclass data; seeded so reruns build the same forest.
rfc = RFC(random_state=42)

rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)              # predicted class labels

y_prob_rfc = rfc.predict_proba(X_test)    # predicted probabilities

In [None]:
from sklearn.svm import SVC

# probability=True enables predict_proba; the probability estimates rely on an
# internal randomised procedure, so random_state is fixed to make them repeatable.
svc = SVC(probability=True, random_state=42)

svc.fit(X_train, y_train)

y_prob = svc.predict_proba(X_test)   # predicted probabilities

In [None]:
%pip install scikit-plot

In [None]:
import scikitplot as skplt

# ROC plot for the random forest, from its predicted probabilities.
skplt.metrics.plot_roc(y_test, y_prob_rfc, figsize=(15, 8));

In [None]:
# Same ROC plot using the probabilities stored in y_prob.
skplt.metrics.plot_roc(y_test, y_prob, figsize=(15, 8));