<a href="https://colab.research.google.com/github/LochanaBandara03/ML_tutorial/blob/main/scikit_learn_cover.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Standard Library Imports

In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


We'll use 2 datasets for demonstration purposes.

1. heart_disease - a classification dataset (predicting whether someone has heart disease or not)
2. boston_df - a regression dataset (predicting the median house prices of cities in Boston)

In [16]:
#Classification data
heart_disease = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv")

#Regression data
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing() #load as dictionary

#Convert dictionary as dataframe
boston_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
boston_df["target"] = pd.Series(housing["target"])
boston_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


##1. Get the data ready

In [17]:
#split the data into X(features) and y(labels)
X = heart_disease.drop("target", axis=1) #use all columns except target
y = heart_disease["target"] #We want predict y using X

In [18]:
#Split the data into training set and test sets
from sklearn.model_selection import train_test_split

#example use case
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)


##2.Pick a model or estimator(suitable for the problem)

In [19]:
#Random Forest Classifier (for classification problems)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [20]:
#Random forest regressir (for regression problem)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

##3. Fit the model to the data and prediction

In [21]:
#All model/estimators have fit() function built-in
clf.fit(X_train, y_train)

#Then can make predictions using predict()
y_preds = clf.predict(X_test)

#Also make predictions with probabilities - classification problems
y_probs = clf.predict_proba(X_test)

#Veiw predictions and probabilites
y_preds, y_probs


(array([0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
        1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([[0.55, 0.45],
        [0.38, 0.62],
        [0.1 , 0.9 ],
        [0.16, 0.84],
        [0.01, 0.99],
        [0.2 , 0.8 ],
        [0.83, 0.17],
        [0.88, 0.12],
        [0.22, 0.78],
        [0.43, 0.57],
        [0.32, 0.68],
        [0.43, 0.57],
        [0.85, 0.15],
        [0.59, 0.41],
        [0.6 , 0.4 ],
        [0.68, 0.32],
        [0.96, 0.04],
        [0.03, 0.97],
        [0.9 , 0.1 ],
        [0.93, 0.07],
        [0.78, 0.22],
        [0.96, 0.04],
        [0.34, 0.66],
        [0.62, 0.38],
        [0.7 , 0.3 ],
        [0.75, 0.25],
        [0.78, 0.22],
        [0.08, 0.92],
        [0.95, 0.05],
        [0.4 , 0.6 ],
        [0.99, 0.01],
        [0.89, 0.11],
        [0.44, 0.56],
        [0.6 , 0.4 ],
        [0.99, 0.01],
        [0.96, 0.04],

##4. Evalute the mode

In [22]:
#All model/estimators have score() function built in
clf.score(X_test, y_test)

0.9016393442622951

In [23]:
#Evaluating a model using cross-validation
from sklearn.model_selection import cross_val_score

#Scoring= None means default score() metric is used
print(cross_val_score(estimator=clf,
                      X=X,
                      y=y,
                      cv=5, #use 5-fold cross-validation
                      scoring=None))

#Evaluate a model with a different scoring method
print(cross_val_score(estimator=clf,
                      X=X,
                      y=y,
                      cv=5, # use 5-fold cross validation
                      scoring="precision"))

[0.81967213 0.8852459  0.78688525 0.8        0.75      ]
[0.81081081 0.90625    0.83870968 0.81818182 0.76315789]


In [26]:
#Difference classification metrics

#Accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_preds))

#Reciever Operatinng Characteristics (ROC Curve)/Area under curve(AUC)
from sklearn.metrics import roc_curve, roc_auc_score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_probs[:,1])
print(roc_auc_score(y_test,y_preds))

#confusion matrix
from  sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_preds))

#classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))


0.9016393442622951
0.9026881720430109
[[26  5]
 [ 1 29]]
              precision    recall  f1-score   support

           0       0.96      0.84      0.90        31
           1       0.85      0.97      0.91        30

    accuracy                           0.90        61
   macro avg       0.91      0.90      0.90        61
weighted avg       0.91      0.90      0.90        61



In [28]:
#Difference regression metrics

#Make predictions first
X = boston_df.drop("target", axis=1)
y = boston_df["target"]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train,y_train)
y_preds = model.predict(X_test)

#R squared or coefficient of determination
from sklearn.metrics import r2_score
print(r2_score(y_test, y_preds))

#Mean absolute error (MAE)
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_preds))

#Mean square error (MSE)
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_preds))

0.8117221346074187
0.3231833537790699
0.24610767502905


##5. Improve through experimentation


In [29]:
#How to find a model's hyperparameters
clf = RandomForestClassifier()
clf.get_params() #List of adjustable hyperparamters

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [30]:
#Adjusting hyperparameters by hand

#Split data into X(features) and y(labels)
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

#Split data into train and test sets
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X,y)

#Instantiate two models with different settings
clf_1 = RandomForestClassifier(n_estimators=100)
clf_2 = RandomForestClassifier(n_estimators=200)

#Fit both models
clf_1.fit(X_train,y_train)
clf_2.fit(X_train,y_train)

#Evaluate both models on test data
print(clf_1.score(X_test, y_test))
print(clf_2.score(X_test, y_test))

0.8289473684210527
0.8421052631578947


In [31]:
#Adjusting hyperparameters computationally (recommend)

from sklearn.model_selection import RandomizedSearchCV

#Define a grid of hyperparameters
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["auto", "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

#Split into train and test sets
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

#Set n_jobs to 1 to use all cores
clf = RandomForestClassifier(n_jobs=1)

#setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                             param_distributions=grid,
                             n_iter=10, #Try 10 models total
                             cv=5, #5-fold cross-validation
                             verbose=2) #print out results

#Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train);

#Find best hyperparameters
print(rs_clf.best_params_)

#Scoring automatically using best hyperparameters
rs_clf.score(X_test, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   5.2s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   2.7s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   1.8s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   1.8s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   1.8s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

0.8688524590163934

##6. Save and reload your trained model

In [32]:
#Saving model with pickle
import pickle

#Save an existing model to file
pickle.dump(rs_clf, open("rs_random_forest_model_1.pkl", 'wb'))

In [33]:
#Load s saved pickle model
Loaded_pickle_model = pickle.load(open("rs_random_forest_model_1.pkl", "rb"))

#Evaluate loaded model
Loaded_pickle_model.score(X_test, y_test)

0.8688524590163934

In [34]:
#Saving a model with joblib
from joblib import dump, load

#Save a model to file
dump(rs_clf, filename="gs_random_forest_model_1.joblib")

['gs_random_forest_model_1.joblib']

In [35]:
#import a saved joblib model
loaded_joblib_model= load(filename="gs_random_forest_model_1.joblib")

In [36]:
#Evaluate joblib predictions
loaded_joblib_model.score(X_test, y_test)

0.8688524590163934