<a href="https://colab.research.google.com/github/LochanaBandara03/ML_tutorial/blob/main/scikit_learn_cover.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Standard Library Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


We'll use 2 datasets for demonstration purposes.

1. heart_disease - a classification dataset (predicting whether someone has heart disease or not)
2. boston_df - a regression dataset (the California housing data, predicting the median house value of districts in California; the variable keeps the name `boston_df`)

In [3]:
# Classification data: the heart-disease CSV, read straight from its raw GitHub URL
heart_disease = pd.read_csv(
    "https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv"
)

# Regression data: the California housing dataset, returned as a dict-like object
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

# Turn the feature matrix into a DataFrame and attach the target as an extra column
boston_df = pd.DataFrame(data=housing.data, columns=housing.feature_names).assign(
    target=pd.Series(housing.target)
)
boston_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


## 1. Get the data ready

In [4]:
# Separate the labels (y) from the features (X); the goal is to predict y from X
y = heart_disease["target"]  # the column to predict
X = heart_disease.drop(columns="target")  # every other column is a feature

In [6]:
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split

# Example use case: hold out 20% of the rows for testing.
# random_state pins the shuffle so that re-running the notebook reproduces
# the same split, which keeps the scores computed later comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## 2. Pick a model or estimator (suitable for the problem)

In [7]:
# Random forest classifier (for classification problems)
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the forest's internal randomness so that fitting the
# same data twice gives the same model and therefore the same scores.
clf = RandomForestClassifier(random_state=42)

In [8]:
# Random forest regressor (for regression problems)
from sklearn.ensemble import RandomForestRegressor

# random_state fixes the forest's internal randomness so that results are
# reproducible when the notebook is re-run.
model = RandomForestRegressor(random_state=42)

## 3. Fit the model to the data and make predictions

In [9]:
# Every model/estimator comes with a built-in fit(); train on the training split
clf.fit(X=X_train, y=y_train)

# predict() returns the predicted class label for each test row
y_preds = clf.predict(X=X_test)

# predict_proba() returns per-class probabilities (classification problems only)
y_probs = clf.predict_proba(X=X_test)

# View the predictions together with their probabilities
(y_preds, y_probs)


(array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0]),
 array([[0.16, 0.84],
        [0.08, 0.92],
        [0.77, 0.23],
        [0.24, 0.76],
        [0.93, 0.07],
        [0.98, 0.02],
        [0.38, 0.62],
        [0.63, 0.37],
        [0.61, 0.39],
        [0.94, 0.06],
        [0.25, 0.75],
        [0.4 , 0.6 ],
        [0.03, 0.97],
        [0.04, 0.96],
        [0.94, 0.06],
        [0.22, 0.78],
        [0.49, 0.51],
        [0.87, 0.13],
        [0.94, 0.06],
        [0.12, 0.88],
        [0.31, 0.69],
        [0.31, 0.69],
        [0.12, 0.88],
        [0.97, 0.03],
        [0.94, 0.06],
        [0.22, 0.78],
        [0.32, 0.68],
        [0.3 , 0.7 ],
        [0.88, 0.12],
        [0.26, 0.74],
        [0.42, 0.58],
        [0.02, 0.98],
        [0.25, 0.75],
        [0.95, 0.05],
        [0.31, 0.69],
        [0.72, 0.28],

## 4. Evaluate the model

In [10]:
# Every model/estimator also has a built-in score(); evaluate on the held-out test split
clf.score(X=X_test, y=y_test)

0.819672131147541

In [14]:
# Evaluate the model with 5-fold cross-validation
from sklearn.model_selection import cross_val_score

# First pass: scoring=None means the estimator's default score() metric is used.
# Second pass: the same call, scored with precision instead.
for scoring in (None, "precision"):
    print(cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring=scoring))

[0.85245902 0.90163934 0.81967213 0.8        0.73333333]
[0.82857143 0.93548387 0.83870968 0.8        0.74358974]


In [None]:
#Different classification metrics

#Accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_preds))

#Receiver Operating Characteristic (ROC)