# Supervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of supervised learning model evaluation.

In [127]:
import pandas as pd

## Regression Model Evaluation

In [128]:
from sklearn.datasets import load_boston

data = load_boston()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.DataFrame(data["target"], columns=['MEDV'])

data = pd.concat([X, y], axis=1)

In [129]:
data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## 1. Split this data set into training (80%) and testing (20%) sets.

The `MEDV` field represents the median value of owner-occupied homes (in $1000's) and is the target variable that we will want to predict.

In [130]:
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix

X_train1, X_test1, y_train1, y_test1 = tts(X, y, test_size=0.2)

## 2. Train a `LinearRegression` model on this data set and generate predictions on both the training and the testing set.

In [131]:
lr = LinearRegression()

In [132]:
lr.fit(X_train1, y_train1)

LinearRegression()

In [133]:
y_pred1 = lr.predict(X_test1)
y_train_pred1 = lr.predict(X_train1)

## 3. Calculate and print R-squared for both the training and the testing set.

In [134]:
from sklearn import metrics

print('Test data - R2 - Coeficiente de Determinacion', metrics.r2_score(y_test1, y_pred1))
print('Train data - R2 - Coeficiente de Determinacion', metrics.r2_score(y_train1, y_train_pred1))

Test data - R2 - Coeficiente de Determinacion 0.7325898242794816
Train data - R2 - Coeficiente de Determinacion 0.7400343872343246


## 4. Calculate and print mean squared error for both the training and the testing set.

In [135]:
print('Test data - MSE - Error Cuadratico Medio', metrics.mean_squared_error(y_test1, y_pred1))
print('Train data - MSE - Error Cuadratico Medio', metrics.mean_squared_error(y_train1, y_train_pred1))

Test data - MSE - Error Cuadratico Medio 24.572539262297905
Train data - MSE - Error Cuadratico Medio 21.40325411639266


## 5. Calculate and print mean absolute error for both the training and the testing set.

In [136]:
print('Test data - MAE - Error Medio Absoluto', metrics.mean_absolute_error(y_test1, y_pred1))
print('Train data - MAE - Error Medio Absoluto', metrics.mean_absolute_error(y_train1, y_train_pred1))

Test data - MAE - Error Medio Absoluto 3.4889936509961474
Train data - MAE - Error Medio Absoluto 3.2275461760991093


## Classification Model Evaluation

In [137]:
from sklearn.datasets import load_iris


data = load_iris()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.DataFrame(data["target"], columns=["class"])

data = pd.concat([X, y], axis=1)

In [138]:
X.shape

(150, 4)

In [139]:
# I had to corrected this in order to proceed:

y = y["class"]

In [140]:
data["class"].value_counts()

0    50
1    50
2    50
Name: class, dtype: int64

## 6. Split this data set into training (80%) and testing (20%) sets.

The `class` field represents the type of flower and is the target variable that we will want to predict.

In [141]:
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

X_train2, X_test2, y_train2, y_test2 = tts(X, y, test_size=0.2)

## 7. Train a `LogisticRegression` model on this data set and generate predictions on both the training and the testing set.

In [142]:
log = LogisticRegression(max_iter=10000)

In [143]:
X_train2.shape

(120, 4)

In [144]:
# Is hugely important to give to the prediction variable or variable target (y) a Pandas Series!

y_train2.shape

(120,)

In [145]:
log.fit(
    X=X_train2, 
    y=y_train2
)

LogisticRegression(max_iter=10000)

#### Let's make new dataframes with the contrast of the prediction and the real value:

In [146]:
test2 = pd.DataFrame(y_test2)

In [125]:
test2.sample(5)

Unnamed: 0,class
67,1
101,2
83,1
14,0
124,2


In [147]:
test2["prediction"] = log.predict(X_test2)

In [149]:
test2.sample(10)

Unnamed: 0,class,prediction
51,1,1
38,0,0
77,1,2
88,1,1
74,1,1
15,0,0
6,0,0
124,2,2
12,0,0
147,2,2


In [150]:
train2 = pd.DataFrame(y_train2)
train2["prediction"] = log.predict(X_train2)

## 8. Calculate and print the accuracy score for both the training and the testing set.

In [153]:
test2["correct"] = (test2["class"] == test2["prediction"])

In [157]:
accuracy_test2 = test2.correct.sum() / test2.shape[0]
print(f"The accuracy of the test dataframe is: {accuracy_test2.round(3)}")

The accuracy of the test dataframe is: 0.967


In [158]:
train2["correct"] = (train2["class"] == train2["prediction"])

accuracy_train2 = train2.correct.sum() / train2.shape[0]
print(f"The accuracy of the test dataframe is: {accuracy_train2.round(3)}")

The accuracy of the test dataframe is: 0.975


## 9. Calculate and print the balanced accuracy score for both the training and the testing set.

In [175]:
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score

In [170]:
balanced_precision_test = balanced_accuracy_score(
    y_true=test2["class"],
    y_pred=test2.prediction
)
print(f"The balanced precision from the test dataframe is: {precision_test}")

The balanced precision from the test dataframe is: 0.9666666666666667


In [171]:
balanced_precision_train = balanced_accuracy_score(
    y_true=train2["class"],
    y_pred=train2.prediction
)
print(f"The balanced precision from the train dataframe is: {precision_train}")

The balanced precision from the train dataframe is: 0.975


## 10. Calculate and print the precision score for both the training and the testing set.

In [169]:
precision_test = precision_score(
    average="micro",
    y_true=test2["class"],
    y_pred=test2.prediction
)
print(f"The precision from the test dataframe is: {precision_test}")

The precision from the test dataframe is: 0.9666666666666667


In [168]:
precision_train = precision_score(
    average="micro",
    y_true=train2["class"],
    y_pred=train2.prediction
)
print(f"The precision from the train dataframe is: {precision_train}")

The precision from the train dataframe is: 0.975


## 11. Calculate and print the recall score for both the training and the testing set.

In [173]:
recall_test = recall_score(
    average="micro",
    y_true=test2["class"],
    y_pred=test2.prediction
)
print(f"The recall from the test dataframe is: {recall_test}")

The recall from the test dataframe is: 0.9666666666666667


In [174]:
recall_train = recall_score(
    average="micro",
    y_true=train2["class"],
    y_pred=train2.prediction
)
print(f"The recall from the train dataframe is: {recall_train}")

The recall from the train dataframe is: 0.975


## 12. Calculate and print the F1 score for both the training and the testing set.

In [177]:
f1_test = f1_score(
    average="micro",
    y_true=test2["class"],
    y_pred=test2.prediction
)
print(f"The f1 score from the test dataframe is: {f1_test}")

The f1 score from the test dataframe is: 0.9666666666666667


In [178]:
f1_train = f1_score(
    average="micro",
    y_true=train2["class"],
    y_pred=train2.prediction
)
print(f"The f1 score from the train dataframe is: {f1_train}")

The f1 score from the train dataframe is: 0.975


## 13. Generate confusion matrices for both the training and the testing set.

In [184]:
from sklearn.metrics import confusion_matrix

#### Confusion matrix of the test dataset:

In [186]:
pd.crosstab(
    test2["class"],
    test2.prediction
)

prediction,0,1,2
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,11,0,0
1,0,13,1
2,0,0,5


#### Confusion matrix of the train dataset:

In [185]:
pd.crosstab(
    train2["class"],
    train2.prediction
)

prediction,0,1,2
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,39,0,0
1,0,34,2
2,0,1,44


## Bonus: For each of the data sets in this lab, try training with some of the other models you have learned about, recalculate the evaluation metrics, and compare to determine which models perform best on each data set.