In [1]:
# This notebook is meant for ML breaking down into details 

In [2]:
# Commonly imported ML libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

## 1. Getting our data ready to be used with machine learning 

Three main things we have to do:
    1. Split the data into features and labels (usually `X` & `y`)
    2. Filling (also called imputing) or disregarding missing values
    3. Converting non-numerical values to numerical values (also  called **feature encoding** )

In [3]:
# importing data
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Spiliting the data into features
X = heart_disease.drop("target", axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [5]:
# Spiling the data into label 
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [6]:
# Spiling the data training and test sets 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [8]:
X.shape

(303, 13)

In [9]:
len(heart_disease)

303

# Data Science Quick Tip: Clean, Transform, Reduce
- more data does not mean it is necessry good, we want useful data
- in the following we will do 
  Clean Data -> Transform Data -> Reduce Data

1. Clean Data: Remove, replace data because sometime data is missing. We usually remove row or column when a feild is empty or fill it with average. Or we will remove outlier 

2. Transform Data: Make the data that computer can understand. Transform data into numbers. For example we convert color into RGB for computer to understand

3. Reduce Data (Dimensionality reduction/ column reduction): Everything cost money, the more date the more CUP, energy and time to run the data. We can have many colunm and find the one that is relevant. 

### 1.1 Make sure it's all numerical (we will use different data set)



In [10]:
car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [11]:
len(car_sales)

1000

In [12]:
# Split into X/y 
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

# Split into training and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
# Build machine learning model 
# Regressor allows us to predict number classifier is for 1/0
# Computer does not understand what car brands we need need to convert 
#them to a number
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

ValueError: could not convert string to float: 'BMW'

In [14]:
# We are going to treat door number as category
car_sales["Doors"].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

### Note
1. ramainder="passthrough" will make sure that what ever is not specificed in the ColumnTransformer will be keeped (by default it is "drop"). For instance, if we use drop odometer will be drop because it is not being specified!!!
2. in ColumnTransformer first param is a list of tuple, 

first is name; 

second is transformer; 

third is column to be transferred

In [15]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_feature = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_feature)],
                                  remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [16]:
(transformed_X).shape

(1000, 13)

In [17]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


### One hot encoding explain

![](oneHotEncoding.png)

In [18]:
# Dummies is a function from panda to one hot encode 
# This does not work for numerical column
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [19]:
# Let's refit the model 
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(transformed_X,
                                                   y,
                                                   test_size=0.2)

model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [20]:
model.score(X_test, y_test)

0.3235867221569877

### 1.2 What is there were missing values? 
1. Fill them with some value (also known as imputation).
2. Remove the samples with missing data altogether. 

In [21]:
# Import car sales missing data 
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [22]:
# To show how many missing values in each column
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [23]:
# Ceate X & y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

### Before we cna conver our data to number we need to deal with NAN data
### Option 1: Fill missing data with Pandas

In [24]:
car_sales_missing["Doors"].value_counts()

4.0    811
5.0     75
3.0     64
Name: Doors, dtype: int64

In [25]:
# Fill the "Make" column 
car_sales_missing["Make"].fillna("missing", inplace=True)

# Fill the "Colour" column
car_sales_missing["Colour"].fillna("missing", inplace=True)

# Fill the "Odometer (KM)" column
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace=True)

# Fill the "Doors" column
car_sales_missing["Doors"].fillna(4, inplace=True)

In [26]:
# Check our dataframe again 
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

#### Because it is always hard to predict something that doesnt have label. Thus in thus case we are going to to remove the column wit missing price (price is the label over here)

In [27]:
# Remove rows with missing Price value
car_sales_missing.dropna(inplace=True)

In [28]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [29]:
len(car_sales_missing)

950

In [30]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [31]:
# Let's try and convert our data to number 
# Turn the category  into numbers
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder();
transformer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                   remainder="passthrough")

transformed_X = transformer.fit_transform(car_sales_missing)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [32]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35431.0,15323.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0,19943.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,84714.0,28343.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,154365.0,13434.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0,14043.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0,32042.0
946,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,155144.0,5716.0
947,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0,31570.0
948,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,215883.0,4001.0


### Option 2: Fill missing values with scikit-learn

**Feature Scaling** is needed when there is wide-ranging variables. For example when the valule is from 6000 to 345,000. We have 2 main types of feature scaling.
1. **Normalzation**: rescale the numerical value to 0,1. For instance, scale lower value to 0 and highest value being close to 1.
2. **Standardization**: Subtracts the mean value from all of the features, this will also turn mean to 0. It then scales the features to unit variance by dividing the feature by the standard deviation. 

!!! Note !!!
- feature scaling usually isnt required for target variable
- feature scaling is usually not required with tree-based models(Random Forest) since they can handle varying features. 

In [33]:
car_sale_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sale_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [34]:
car_sale_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [35]:
# To drop a specific column with NA (no label)
car_sale_missing.dropna(subset=["Price"], inplace=True)
car_sale_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [36]:
# Split into X & y
X = car_sale_missing.drop("Price", axis=1)
y = car_sale_missing["Price"]

# Split data into train and test 
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

**Note** : We use `fit_transform()` on the training data and `transform()` on the testing data. In essence, we learn the patterns in the traning set and transform it via imputation (fit, then transform). Then we take those same patterns and fill the test set (transform only).

In [37]:
# Fill missing values with Scikit-Learn
# Filling missing data is also known as imputation
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical value with "missing" & numerical value with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns 
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("car_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])

# Transform the data 
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

# Check filled X_train
filled_X_train


array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], dtype=object)

In [38]:
# Get out transformed data array's back into Dataframe
car_sales_filled_train = pd.DataFrame(filled_X_train,
                                 columns=["Make", "Colour", "Doors",
                                         "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test,
                                    columns=["Make", "Colour", "Doors",
                                            "Odometer (KM)"])


# Check missing data in training set 
car_sales_filled_train.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [39]:
# Now let's one hot encode the features with the same code as before 
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

# Fill train and test values separately 
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.transform(car_sales_filled_test)

# Check transformed and filled X_train
transformed_X_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 7.19340e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62665e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.28440e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]])

In [40]:
# Now we've got our data as numbers and filled (no missing values)
# Let's fit a model 
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split


model = RandomForestRegressor()
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)


0.21229043336119102

Recap: 
- Most datasets we come across won't be in a form ready to immediatel start using them with ML models. And some may take more preparation than other to get ready to use.
- For most machine learning models, your data has to be numerical. This will involve coverting whatever you're working with into numbers. This process is often referred to as **feature engineering** or **feature encoding**. 
- Some ML models are aren't compatible with missing data. The process of filling missing data is referred as **data imputation**

# Check list 
1. Getting Data Ready (done)
2. Choose the right estimator/algorithm for our problems 
3. Fit the model/algorithm and use it to make prediction on our data
4. Evaluating a model 
5. Improve a model 
6. Save and load a trained model
7. Putting it all together!

## 2. Choosing the right estimator/algorithm for our problem

Scilit-Learn uses estimator as antoher term for machine learning model or algorithm.

- Classification - predicting whether a sample is one thing or another 
- Regression - predicting a number

Step 1 - Check the Sciki-Learn ML map. search sklearn ML map



### 2.1 Picking a ML model for a regression problem 

In [41]:
# Import Boston housing dataset 
from sklearn.datasets import load_boston
boston = load_boston()
boston;

In [42]:
boston_df = pd.DataFrame(boston["data"], columns=boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [43]:
# How many samples? 
len(boston_df)

506

In [44]:
# Let's try the Ridge Regression model 
from sklearn.linear_model import Ridge 

# Setup random seed
np.random.seed(42)

# Create the data 
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split into train and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Ridge model 
model = Ridge()
model.fit(X_train, y_train)

# Check the score of the Ridge model on test data
model.score(X_test, y_test)

0.6662221670168522

How do we improve the score? 

What if Ridge was not working?

Let's refer to sklearn ML map: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [45]:
# Let's try the Random Forst Regressor 
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(42)

# Create the data 
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split the date 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Random Forest Regressor 
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Evaluate the Random Forest Regressor 
rf.score(X_test, y_test)

0.873969014117403

# Check the Ridge model again 
model.score(X_test, y_test)

## 2.2 Choosing and estimator for a classification problem 
Let's go to the map: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### Note: Decision Tree
- Telling computer to create "if else" by itself with the data provided

In [46]:
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [47]:
len(heart_disease)

303

### Note: SVC 
- SVC = Support Vector Classifier 
- SVM = Support Vector Machine 
- Supurvised = with data 
Consolting the map and it says try `LinearSVC` 

In [48]:
# Import the LinearSVC estimator class
from sklearn.svm import LinearSVC

# Setup random seed 
np.random.seed(42)

# Make the data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate LinearSVC
clf = LinearSVC(max_iter=10000)
clf.fit(X_train, y_train)

# Evaluate the LinearSVC
clf.score(X_test, y_test)



0.47540983606557374

In [49]:
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

- Our label is binary classification but the model is retrning the mean score
- Look at sklearn Map and try Ensemble 

In [50]:
# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier 

# Setup random seed 
np.random.seed(42)

# Make the data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate LinearSVC
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate the Random Forest Classifier
clf.score(X_test, y_test)

0.8524590163934426

Tidbit/tip: 
    1. if you have a structured data (a.k.a table data or dataframe), use  ensemble method such as RandomForest 
    2. If you have unstrauctured data (images or audio), use deep learning or transfer learning
    
Essence of ML: finding patterns in data & use the patterns to make predictions on future data (data a model has not seen before). 

# Check list 
1. Getting Data Ready (done)
2. Choose the right estimator/algorithm for our problems (done)
    - Linear Classification 
    - RandomForestClassifier (ensemble)
3. Fit the model/algorithm and use it to make prediction on our data
4. Evaluating a model 
5. Improve a model 
6. Save and load a trained model
7. Putting it all together!

## 3 Fit the model/algorithm on our data and use it to make prediction

### 3.1 Fitting the model into the data

Different names for:
* `X` = features, features variables, data
* `y` = lebels, targets, target variables

Note: 
- ML model take the data (fit) to figure out how those patterns relate to the label

In [51]:
# Import the RandomForestClassifier estimator class 
from sklearn.ensemble import RandomForestClassifier

# Setup random seed 
np.random.seed(42)

# Make the data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Random Forest Classifier
clf = RandomForestClassifier()

# Fix the model to the data (training the ML model)
clf.fit(X_train, y_train)

# Evaluate the Random Forest Classifier (use the patterns the model has learn)
clf.score(X_test, y_test)

0.8524590163934426

In [52]:
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [53]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

### 3.2 Make predictions using a ML model

2 ways to make predictions:
1. `predict()` - return single label
2. `predict_proba()` - return with probability

In [54]:
X_test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2


In [55]:
# Use a trained model to make predictions
clf.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [56]:
np.array(y_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [57]:
# Compare predictions to trust labels to evaluate the model 
y_preds = clf.predict(X_test)
np.mean(y_preds == y_test)

0.8524590163934426

In [58]:
clf.score(X_test, y_test)

0.8524590163934426

In [59]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.8524590163934426

Make prediction with `predict_proba()`


In [60]:
# predict_proba() return probabilities of a classification label
clf.predict_proba(X_test[:5])

array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82]])

In [61]:
# Let's predict() on the same data...
clf.predict(X_test[:5])

array([0, 1, 1, 0, 1])

In [62]:
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

`predict()` can also be used for regression models

In [64]:
from sklearn.ensemble import RandomForestRegressor 

np.random.seed(42)

# Create the dats 
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split into training and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit model 
model = RandomForestRegressor().fit(X_train, y_train)

# Make prediction 
y_preds = model.predict(X_test)


In [66]:
y_preds[:10]

array([23.002, 30.826, 16.734, 23.467, 16.853, 21.725, 19.232, 15.239,
       21.067, 20.738])

In [67]:
np.array(y_test[:10])

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])

In [68]:
# Compare the predictions to the truth (mean absolute error)
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

2.1226372549019623