https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb


# Tutorial 3: End to End Machine Learning  

## Setup

In [1]:
import sys
assert sys.version_info >= (3, 5)
# Python ≥3.5 is required

In [2]:
# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

In [3]:
# Common imports
import numpy as np
import os
import tarfile
import urllib
import pandas as pd

In [4]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

In [5]:
import pandas as pd

# Download the Data

In [6]:
url= "brain_stroke_in_numeric_without_useless_data.csv"
brain = pd.read_csv(url)
brain = brain.drop(brain.columns[0], axis=1)

## A Quick Look at the Data Structure

In [7]:
brain.head(100) 

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,228.69,36.6,2,1
1,0,80.0,0,1,1,105.92,32.5,1,1
2,1,49.0,0,0,1,171.23,34.4,3,1
3,1,79.0,1,0,1,174.12,24.0,1,1
4,0,81.0,0,0,1,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...
95,0,71.0,1,1,0,216.94,30.9,1,1
96,0,61.0,1,0,1,76.11,27.3,3,1
97,0,74.0,0,0,1,72.96,31.3,3,1
98,1,38.0,0,0,0,82.28,24.0,2,1


Each row represents one district.

In [8]:
#This method prints information about a DataFrame including the dtype and columns, non-null values and memory usage.
brain.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   int64  
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   int64  
 5   avg_glucose_level  4981 non-null   float64
 6   bmi                4981 non-null   float64
 7   smoking_status     4981 non-null   int64  
 8   stroke             4981 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 350.4 KB


In [9]:
#This method shows a statistical summary of the numerical attributes 
brain.describe()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,0.583618,43.419859,0.096165,0.05521,0.658502,105.943562,28.498173,1.184501,0.049789
std,0.493008,22.662755,0.294848,0.228412,0.47426,45.075373,6.790464,1.031769,0.217531
min,0.0,0.08,0.0,0.0,0.0,55.12,14.0,0.0,0.0
25%,0.0,25.0,0.0,0.0,0.0,77.23,23.7,0.0,0.0
50%,1.0,45.0,0.0,0.0,1.0,91.85,28.1,1.0,0.0
75%,1.0,61.0,0.0,0.0,1.0,113.86,32.6,2.0,0.0
max,1.0,82.0,1.0,1.0,1.0,271.74,48.9,3.0,1.0


# Prepare the Data for Machine Learning Algorithms

In [10]:
#Provides train/test indices to split data in train/test sets.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [11]:
for train_index, test_index in split.split(brain, brain["stroke"]):
    strat_train_set = brain.loc[train_index]
    strat_test_set = brain.loc[test_index]

In [12]:
# def stroke_proportions(data):
#     return data["stroke"].value_counts() / len(data)

# train_set, test_set = train_test_split(brain, test_size=0.2, random_state=42)

# compare_props = pd.DataFrame({
#     "Overall": stroke_proportions(brain),
#     "Stratified": stroke_proportions(strat_test_set),
#     "Random": stroke_proportions(test_set),
# }).sort_index()
# compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
# compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

In [13]:
# compare_props

In [14]:
brain = strat_train_set.drop("stroke", axis=1)
brain_labels = strat_train_set["stroke"].copy()

## Custom Transformer

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

# column index
age_ix, avg_glucose_level_ix, bmi_ix = 1, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_avg_glucose_level_per_age = True): # no *args or **kargs
        self.add_avg_glucose_level_per_age = add_avg_glucose_level_per_age
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        bmi_per_age =  X[:, bmi_ix] /X[:, age_ix]
        bmi_per_avg_glucose_level =X[:, bmi_ix] / X[:, avg_glucose_level_ix]
        if self.add_avg_glucose_level_per_age:
            avg_glucose_level_per_age = X[:, avg_glucose_level_ix] / X[:, age_ix]
            return np.c_[X, bmi_per_age, bmi_per_avg_glucose_level ,
                         avg_glucose_level_per_age]
        else:
            return np.c_[X,bmi_per_age,bmi_per_avg_glucose_level]

attr_adder = CombinedAttributesAdder(add_avg_glucose_level_per_age=False)
brain_extra_attribs = attr_adder.transform(brain.values)

In [16]:
brain_extra_attribs = pd.DataFrame(
    brain_extra_attribs,
    columns=list(brain.columns)+["age_per_bmi", "avg_glucose_level_per_bmi"],
    index=brain.index)
brain_extra_attribs.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,smoking_status,age_per_bmi,avg_glucose_level_per_bmi
4428,1.0,24.0,0.0,0.0,0.0,187.99,24.9,3.0,1.0375,0.132454
1135,0.0,3.0,0.0,0.0,0.0,86.38,22.8,0.0,7.6,0.26395
2417,1.0,69.0,0.0,0.0,1.0,111.48,37.0,3.0,0.536232,0.331898
1173,1.0,46.0,0.0,0.0,1.0,127.75,30.5,1.0,0.663043,0.238748
3696,0.0,52.0,1.0,0.0,1.0,100.71,37.0,1.0,0.711538,0.367392


In [17]:
brain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3984 entries, 4428 to 2029
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             3984 non-null   int64  
 1   age                3984 non-null   float64
 2   hypertension       3984 non-null   int64  
 3   heart_disease      3984 non-null   int64  
 4   ever_married       3984 non-null   int64  
 5   avg_glucose_level  3984 non-null   float64
 6   bmi                3984 non-null   float64
 7   smoking_status     3984 non-null   int64  
dtypes: float64(3), int64(5)
memory usage: 280.1 KB


## Transformation Pipelines

`Pipeline` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html">Documentation Link</a>

There are many data transformation steps that need to be executed in the right order. Fortunately, Scikit-Learn provides the Pipeline class to help with such sequences of transformations.

`StandardScaler` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html">Documentation Link</a>

Standardization used in numerical data preparation, first it subtracts the mean value (so standardized values always have a zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance.



In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")), # Dealing with missing values
        ('attribs_adder', CombinedAttributesAdder()), # New attribute
        ('std_scaler', StandardScaler()),            # Attribute normalization
    ])

brain_num_tr = num_pipeline.fit_transform(brain)

In [19]:
brain_num_tr

array([[ 0.84602735, -0.8668014 , -0.32894656, ..., -0.10586901,
        -1.46635264,  0.01649148],
       [-1.18199489, -1.79493002, -0.32894656, ...,  1.13849268,
        -0.33709707,  0.5894073 ],
       [ 0.84602735,  1.12204565, -0.32894656, ..., -0.20091797,
         0.24642413, -0.15344642],
       ...,
       [ 0.84602735,  0.41490003, -0.32894656, ..., -0.19991726,
         1.45192404, -0.16626702],
       [ 0.84602735,  0.01713062, -0.32894656, ..., -0.21080479,
        -0.48057091, -0.14409019],
       [-1.18199489,  0.06132722,  3.04000744, ..., -0.13531232,
         1.166305  , -0.14267977]])

So far, we have handled the categorical columns and the numerical columns separately. It would be more convenient to have a single transformer able to handle all columns, applying the appropriate transformations to each column.


`ColumnTransformer` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html">Documentation Link</a>


In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

num_attribs = list(brain)
cat_attribs = ["bmi"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),#  Apply the numerical transformation to the all numeric columns
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs), # Apply the One Hot Encoding to the all categorical columns
    ])

brain_prepared = full_pipeline.fit_transform(brain) #Now fit_transform applies all the transformation to the data 

In [21]:
brain_prepared

<3984x348 sparse matrix of type '<class 'numpy.float64'>'
	with 47808 stored elements in Compressed Sparse Row format>

In [22]:
brain_prepared.shape

(3984, 348)

## Naive Bayes

- linear Regression
- Multinomial Naive Bayes
- Gaussian Naive Bayes
- Complement Naive Bayes
- Bernoulli Naive Bayes
- Categorical Naive Bayes


In [23]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(brain_prepared, brain_labels)

In [24]:
# let's try the full preprocessing pipeline on a few training instances
some_data = brain.iloc[:5]
some_labels = brain_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [ 0.00061495 -0.06818187  0.05534277  0.04241244  0.06122604]


In [25]:
### Multinomial Naive Bayes

In [26]:
X = brain.iloc[:,0:8]

In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, brain_labels, test_size=0.5, random_state=0)

multinomialNB = MultinomialNB(alpha=1.0, class_prior=[0.4, 0.6])
y_pred = multinomialNB.fit(X_train, y_train).predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6445783132530121


In [28]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import f1_score

In [29]:
y_train_pred = cross_val_predict(multinomialNB, X_train, y_train, cv=3)
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           0       0.98      0.62      0.76      1914
           1       0.07      0.71      0.13        78

    accuracy                           0.62      1992
   macro avg       0.53      0.66      0.44      1992
weighted avg       0.95      0.62      0.74      1992



In [30]:
print(precision_score(y_train, y_train_pred, average=None))

[0.98099174 0.07033248]


In [31]:
print(recall_score(y_train, y_train_pred, average=None))

[0.62016719 0.70512821]


In [32]:
print(f1_score(y_train, y_train_pred, average=None))

[0.75992318 0.12790698]


In [33]:
### Gaussian Naive Bayes

In [34]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# print("Number of mislabeled points out of a total %d points : %d"
#  % (X_test.shape[0], (y_test != y_pred).sum()))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.873995983935743


In [35]:
y_train_pred = cross_val_predict(gnb, X_train, y_train, cv=3)
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           0       0.97      0.89      0.93      1914
           1       0.09      0.27      0.14        78

    accuracy                           0.87      1992
   macro avg       0.53      0.58      0.53      1992
weighted avg       0.93      0.87      0.90      1992



In [36]:
print(precision_score(y_train, y_train_pred, average=None))

[0.96777841 0.0941704 ]


In [37]:
print(recall_score(y_train, y_train_pred, average=None))

[0.89446186 0.26923077]


In [38]:
print(f1_score(y_train, y_train_pred, average=None))

[0.92967689 0.13953488]


In [39]:
### Complement Naive Bayes

In [40]:
from sklearn.naive_bayes import ComplementNB

complement = ComplementNB()
y_pred = complement.fit(X_train, y_train).predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6817269076305221


In [41]:
### Bernoulli Naive Bayes

In [42]:
from sklearn.naive_bayes import BernoulliNB

bernoulli = BernoulliNB()
y_pred = bernoulli.fit(X_train, y_train).predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9397590361445783


In [73]:
y_train_pred = cross_val_predict(bernoulli, X_train, y_train, cv=3)
print(classification_report(y_train,y_train_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1914
           1       1.00      0.00      0.00        78

    accuracy                           0.96      1992
   macro avg       0.98      0.50      0.49      1992
weighted avg       0.96      0.96      0.94      1992



In [75]:
print(precision_score(y_train, y_train_pred, average=None, zero_division=0))

[0.96084337 1.        ]


In [45]:
print(recall_score(y_train, y_train_pred, average=None))

[1. 0.]


In [46]:
print(f1_score(y_train, y_train_pred, average=None))

[0.98003072 0.        ]


In [47]:
### Categorical Naive Bayes

In [48]:
from sklearn.naive_bayes import CategoricalNB

categorical = CategoricalNB()
y_pred = categorical.fit(X_train, y_train).predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9352409638554217


In [49]:
y_train_pred = cross_val_predict(categorical, X_train, y_train, cv=3)
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1914
           1       0.16      0.06      0.09        78

    accuracy                           0.95      1992
   macro avg       0.56      0.53      0.53      1992
weighted avg       0.93      0.95      0.94      1992



In [50]:
print(precision_score(y_train, y_train_pred, average=None))

[0.96277409 0.16129032]


In [51]:
print(recall_score(y_train, y_train_pred, average=None))

[0.98641588 0.06410256]


In [52]:
print(f1_score(y_train, y_train_pred, average=None))

[0.97445161 0.09174312]


### Mean Squared Error


`mean_squared_error` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html">Documentation Link</a>

In [53]:
from sklearn.metrics import mean_squared_error

brain_predictions = lin_reg.predict(brain_prepared)
lin_mse = mean_squared_error(brain_labels, brain_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.19999149653164328

### Mean Absolute Error
`mean_absolute_error` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html">Documentation Link</a>

In [54]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(brain_labels, brain_predictions)
lin_mae

0.09949333811622021

## Evaluation Using Cross-Validation
`cross_val_score` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html">Documentation Link</a>

In [55]:
from sklearn.model_selection import cross_val_score
lin_scores = cross_val_score(lin_reg, brain_prepared, brain_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
display_scores(lin_rmse_scores)

Scores: [0.22991578 0.20961226 0.22971554 0.20512575 0.23341837 0.21552402
 0.21804296 0.22046642 0.23787548 0.19625421]
Mean: 0.21959507846669157
Standard deviation: 0.01267972119114614


# Evaluate the model with Test Set

In [56]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(brain_prepared, brain_labels)

In [57]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(brain_prepared, brain_labels)

In [58]:
from sklearn.model_selection import GridSearchCV
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("stroke", axis=1)
y_test = strat_test_set["stroke"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
range(len(brain))


range(0, 3984)

In [59]:
final_rmse

0.21323896692558544