In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1. Getting our data ready

Three main things we have to do:
1. Split the data into features and labels (usually 'x' and 'y') 
2. Filling(also called inputting) or disregarding missing values 
3. Converting non-numerical values to numerical values (also called feature encoding)
    

In [2]:
heart_disease = pd.read_csv('../data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
x = heart_disease.drop("target", axis=1) # remove the label which we used for output
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [4]:
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [5]:
#Split the data into training and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [6]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

### 1.1 Make sure it's all numerical

In [7]:
car_sales = pd.read_csv('../data/car-sales-extended.csv')
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [8]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [9]:
#Split the data
x = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

#Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)


In [10]:
car_sales["Doors"].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [11]:
#Build machine learning model
from sklearn.ensemble import RandomForestRegressor  #This is not classification problem but predicting number

model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

ValueError: could not convert string to float: 'Toyota'

In [None]:
#Turn the categoried into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough") #passthrough means dont do anything on other columns except catrgorical features
transformed_x = transformer.fit_transform(x)
transformed_x
#It does one hot enconding which is mentioned in the notebook


In [None]:
pd.DataFrame(transformed_x)

In [None]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies #Doors is number so it didn't apply encoding in it
#just like one hot encoding which we did above

In [None]:
#Let's refit the model
np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

model.fit(x_train, y_train);

In [None]:
model.score(x_test, y_test)

## 1.2 What if there were missing values?
1. Fill them with some values (also known as imputation)
2. Remove the samples with missing data altogether

In [None]:
car_sales_missing = pd.read_csv("../data/car-sales-extended-missing-data.csv");
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
#create x and y
x = car_sales_missing.drop("Price")
y=car_sales_missing["Price"]

#### Option1 fill missing data with Pandas


In [None]:
#Fill the make column
car_sales_missing["Make"] = car_sales_missing["Make"].fillna("missing")

car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna("missing")

car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean())

car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)

In [None]:
car_sales_missing.isna().sum()

In [None]:
#Remove rows with missing price values
car_sales_missing.dropna(inplace=True)

In [None]:
car_sales_missing.isna().sum()

In [None]:
len(car_sales_missing)

In [None]:
x = car_sales_missing.drop("Price", axis=1)
y=car_sales_missing["Price"]


In [None]:
#Let's try and convert our data to numbers
#Turn the categoried into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough") #passthrough means dont do anything on other columns except catrgorical features
transformed_x = transformer.fit_transform(car_sales_missing)
pd.DataFrame(transformed_x)
#It does one hot enconding which is mentioned in the notebook



In [None]:
### Option2 : fill missing with sklearn 
car_sales_missing = pd.read_csv("../data/car-sales-extended-missing-data.csv")
car_sales_missing

In [None]:
#Drop rows with no labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing

In [None]:
#Split into x and y
x = car_sales_missing.drop("Price",axis=1)
y = car_sales_missing["Price"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)


In [None]:
#imputer help us in filling missing values

# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features)
])

# Fill train and test values separately
filled_X_train = imputer.fit_transform(x_train) # fit_transform imputes the missing values from the training set and fills them simultaneously
filled_X_test = imputer.transform(x_test) # tranform takes the imputing missing values from the training set and fills the test set with them

# Check filled X_train
filled_X_train

# Fill categorical values with 'missing' & numerical values with mean

In [None]:
# Get our transformed data array's back into DataFrame's
car_sales_filled_train = pd.DataFrame(filled_X_train, 
                                      columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test, 
                                     columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test

In [None]:
car_sales_filled_test.isna().sum()

In [None]:
#Let's try and convert our data to numbers
#Turn the categoried into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough") #passthrough means dont do anything on other columns except catrgorical features
transformed_x_train = transformer.fit_transform(car_sales_filled_train)
transformed_x_test = transformer.fit_transform(car_sales_filled_test)

#It does one hot enconding which is mentioned in the notebook

transformed_x_train.toarray()

In [None]:
##Wonderful! Now we've filled and transformed our data, ensuring the training and test sets have been kept separate. Let's fit a model to the training set and evaluate it on the test set.
# Now we've transformed X, let's see if we can fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

# Setup model
model = RandomForestRegressor(n_estimators=20)

# Make sure to use transformed (filled and one-hot encoded X data)
model.fit(transformed_x_train, y_train)
model.score(transformed_x_test, y_test)

## 2. Choosing the right estimator/algorithm for our problem

Scikit-Learn uses estimator as another term for machine learning model or algorithm

1. Classification - predicting whether a sample is one thing or another
2. Regression - predicting a number

### 2.1 Picking a machine learning model for a regression problem

In [12]:
#import Boston Housing datasets

from sklearn.datasets import load_boston
boston = load_boston()
boston


{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [18]:
#boston is a dictionary
boston_df = pd.DataFrame(boston["data"], columns=boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])
boston_df.head()
#search boston housing dataset in sklearn

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [19]:
len(boston_df)

506

Reference to the map http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [22]:
#Lets try the Ridge regression model (We are following sklearn map which snap is saved in images folder)
from sklearn.linear_model import Ridge

#Setup random seed
np.random.seed(42)

#Create the data
x = boston_df.drop("target", axis=1)
y = boston_df["target"]

#Split the data
x_train, x_test,y_train, y_test = train_test_split(x,y, test_size=0.2)

#Instantiate Ridge model
model = Ridge()
model.fit(x_train, y_train)

model.score(x_test, y_test)

0.6662221670168524

In [24]:
#How do we increase the score?
#What if Ridge Regression not working?
#Ans: We should go for Ensemble method which is Random Forest method

In [27]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

#Create the data
x = boston_df.drop("target", axis=1)
y = boston_df["target"]

#Split the data
x_train, x_test,y_train, y_test = train_test_split(x,y, test_size=0.2)

#Instantiate Ridge model
model = RandomForestRegressor()
model.fit(x_train, y_train)

model.score(x_test, y_test)


0.8654448653350507

In [29]:
x_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
477,15.02340,0.0,18.10,0.0,0.6140,5.304,97.3,2.1007,24.0,666.0,20.2,349.48,24.91
15,0.62739,0.0,8.14,0.0,0.5380,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47
332,0.03466,35.0,6.06,0.0,0.4379,6.031,23.3,6.6407,1.0,304.0,16.9,362.25,7.83
423,7.05042,0.0,18.10,0.0,0.6140,6.103,85.1,2.0218,24.0,666.0,20.2,2.52,23.29
19,0.72580,0.0,8.14,0.0,0.5380,5.727,69.5,3.7965,4.0,307.0,21.0,390.95,11.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.17120,0.0,8.56,0.0,0.5200,5.836,91.9,2.2110,5.0,384.0,20.9,395.67,18.66
270,0.29916,20.0,6.96,0.0,0.4640,5.856,42.1,4.4290,3.0,223.0,18.6,388.65,13.00
348,0.01501,80.0,2.01,0.0,0.4350,6.635,29.7,8.3440,4.0,280.0,17.0,390.94,5.99
435,11.16040,0.0,18.10,0.0,0.7400,6.629,94.6,2.1247,24.0,666.0,20.2,109.85,23.27


In [34]:
pr = np.array([0.06,2.18,0.0,8.56,0.1,0.32,5.836,24.1,2.71,22.0,664.0,20.1,102.3])
pr2 = pr.reshape(1,-1)
model.predict(pr.reshape(1,-1))

array([19.259])

In [48]:
model.predict_proba(x_test[:5]) 

AttributeError: 'RandomForestRegressor' object has no attribute 'predict_proba'

### 2.2 Choosing the estimator for Classification problem 

In [28]:
heart_disease = pd.read_csv('../data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Consulting the map and it says try LinearSVC

In [38]:
from sklearn.svm import LinearSVC

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

#Split the data
x_train, x_test,y_train, y_test = train_test_split(x,y, test_size=0.2)

#instantiate LinearSVC
clf = LinearSVC(max_iter=10000)
clf.fit(x_train, y_train)

#Evaluate SVC
clf.score(x_test, y_test)




0.8688524590163934

In [44]:
check = np.array([12,1,0,130,200,0,0,174,1,0.8,0,0,1])
clf.predict(check.reshape(1,-1))

array([1])

In [37]:
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [39]:
#Now we used RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

#Split the data
x_train, x_test,y_train, y_test = train_test_split(x,y, test_size=0.2)

#instantiate LinearSVC
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

#Evaluate SVC
clf.score(x_test, y_test)


0.8524590163934426

Tidbit: 
1. If you have structured data, used ensemble method
2. If you have unstructured data(images or voice data), used deep learning or transfer learning model

In [40]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [43]:
check = np.array([12,1,0,130,200,0,0,174,1,0.8,0,0,1])
clf.predict(check.reshape(1,-1))

array([1])

## 3. Fit the model/algo on our data and use it to make predictions

### 3.1 Fitting the model to the data

* `x` = features, features variables,data
* `y` = labels, targets, target variables

In [None]:
#Now we used RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

#Split the data
x_train, x_test,y_train, y_test = train_test_split(x,y, test_size=0.2)

#instantiate LinearSVC
clf = RandomForestClassifier()

#Fit the model to the data
clf.fit(x_train, y_train)

#Evaluate SVC (use th pattern the model has learn)
clf.score(x_test, y_test) 


### 3.2 make predictions with predict_proba()

In [46]:
#predict_proba() returns probabilities of a classification

clf.predict_proba(x_test[:5])


array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82]])

In [47]:
#Let's predict on the same data
clf.predict(x_test[:5])

array([0, 1, 1, 0, 1])

`predict()` can also be used for regression models

In [52]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

x = boston_df.drop("target", axis=1)
y = boston_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

model = RandomForestRegressor().fit(x_train, y_train)

#make predictions
y_preds = model.predict(x_test)
y_preds[:10] #predictions

array([23.081, 30.574, 16.759, 23.46 , 16.893, 21.644, 19.113, 15.334,
       21.14 , 20.639])

In [53]:
np.array(y_test[:10]) #actual labels

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])

In [54]:
#Compare the predictions to the truth

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)


2.136382352941176

In [56]:
#The above results tells us that on avg every predictions has an error of +- 2 
#The predicted result has value which is above or below 2 from it's range

## 4. Evaluating a machine learning model

* Sklearn has 3 different metric function to evaluate the model

    1. Estimator `scoring` method
    2. The `scoring` parameter
    3. Problem-specific metric functions
    
 ### Evaluating a model with `score` method

In [58]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_test, x_train, y_test, y_train = train_test_split(x,y,test_size=0.2)

clf = RandomForestClassifier()
clf.fit(x_train, y_train)


RandomForestClassifier()

In [59]:
clf.score(x_train, y_train)

1.0

In [61]:
clf.score(x_test, y_test)

0.7892561983471075

Let's do the same but for regression model

In [62]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

x = boston_df.drop("target", axis=1)
y = boston_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

model = RandomForestRegressor().fit(x_train, y_train)

# #make predictions
# y_preds = model.predict(x_test)
# y_preds[:10] #predictions

In [64]:
model.score(x_test, y_test) #both models have different method to score the predictions

0.8654448653350507

### 4.2 Evaluating a model using scoring parameter