# Introduction to Scikit-Learn (sklearn)

### An end-to-end Scikit-Learn workflow
### Getting the data ready
### Choose the right estimator/algorithm for our problems
### Fit the model algorithm and use it to make predictions          on our data
### Evaluating a model
### Improving the model
### Save and load a trained model
### Putting it all together!

# 0. Scikit-Learn Workflow

In [1]:
# 1. Get the data ready
import pandas as pd
heart_disease = pd.read_csv("heart_disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [24]:
import sklearn
import numpy as np

In [25]:
# feautres matrix
x = heart_disease.drop("target", axis = 1)

# labels
y = heart_disease["target"]

In [26]:
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [27]:
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [28]:
# Choosing the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

# keep the default hyperparameters
clf.get_params # to see the hyperparameters

<bound method BaseEstimator.get_params of RandomForestClassifier()>

In [29]:
# 3. Fit the model to the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [30]:
clf.fit(X_train, y_train);

In [31]:
import numpy as np

In [32]:
# making a prediction
y_label = clf.predict(np.array([0, 2, 3, 4]))

ValueError: Expected 2D array, got 1D array instead:
array=[0. 2. 3. 4.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
y_preds = clf.predict(X_test)
y_preds

In [None]:
y_test

In [None]:
# 4. Evaluate the model
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

In [None]:
confusion_matrix(y_test, y_preds)

In [None]:
accuracy_score(y_test, y_preds)

In [None]:
# 5. Improve the model
# trying different amount of n_estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators = i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

In [None]:
# Save the model and load it

In [None]:
import pickle
pickle.dump(clf, open("random_forest_mode_1.pkl","wb"))

In [None]:
#import the model
loaded_model = pickle.load(open("random_forest_mode_1.pkl","rb"))
loaded_model.score(X_test, y_test)

#### 

#### 

#### 

#### 

 ### 1. Getting the data ready to be used with machine learning
 Three main things
     1. Split the data into features and labels ( usually X and y).
     2. Filling ( also called imputing) or disregarding missing values.
     3. Converting non-numerical values to numerical values ( also called
        feature encoding).

In [None]:
heart_disease.head(6)

In [None]:
X = heart_disease.drop("target", axis = 1) # age, sex, cp etc 

In [None]:
y = heart_disease["target"]
y.head(6)

In [None]:
X.head(6)

In [None]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
X.shape

In [None]:
len(heart_disease)

In [None]:
X.shape[0] * 0.4

In [None]:
242 + 61

### 1.1 Make sure it's all numerical

In [None]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head(6)

In [None]:
car_sales.dtypes

In [None]:
# Split the data into x and y
X = car_sales.drop("Price", axis = 1)
y = car_sales["Price"]

# Split into training and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)



In [None]:
# Building machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# ValueError: could not convert string to float: 'Honda'
# Lets turn the categories into numbers ( integers)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder = "passthrough")

transformed_X = transformer.fit_transform(car_sales_filled)
transformed_X

In [None]:
X.head()

In [None]:
pd.DataFrame(transformed_X)

In [None]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

In [None]:
# Refitting the model
np.random.seed(500)
X_train, X_test, y_train, y_test = train_test_split(transformed_X,
                                                   y,
                                                   test_size = 0.2)

model.fit(X_train, y_train)

In [None]:
X.head()

In [None]:
model.score(X_test, y_test)

### 1.2 What if there were missing values?

    1. Fill them with some value ( imputation).
    2. Remove the samples with missing data altogether.
    

In [None]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Creating X and y
X = car_sales_missing.drop("Price", axis = 1) # what we are trying to predict
y = car_sales_missing["Price"]

In [None]:
# Converting our data to numbers
# ValueError: could not convert string to float: 'Honda'
# Lets turn the categories into numbers ( integers)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder = "passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X


###  Option 1: Fill missing data with Pandas

In [None]:
# Fill the "Make" column
car_sales_missing["Make"].fillna("missing", inplace = True)

# colour
car_sales_missing["Colour"].fillna("missing", inplace = True)

car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace = True)

car_sales_missing["Doors"].fillna(4, inplace = True)

In [None]:
car_sales_missing.isna().sum()

In [None]:
car_sales_missing.isna().sum()

In [None]:
len(car_sales_missing)

In [None]:
X = car_sales_missing.drop("Price", axis = 1)
y = car_sales_missing["Price"]

### Option 2: Fill missing values with Scikit-Learn

In [None]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
car_sales_missing.dropna(subset = ["Price"], inplace = True)
car_sales_missing.isna().sum()

In [None]:
# splitting into x & y
X = car_sales_missing.drop("Price", axis = 1)
y = car_sales_missing["Price"]

In [None]:
X.isna().sum()

In [None]:
# fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical
# values with mean

cat_imputer = SimpleImputer(strategy = "constant", fill_value = "missing")
door_imputer = SimpleImputer(strategy = "constant", fill_value = 4)
num_imputer = SimpleImputer(strategy = "mean")


# Define columns

cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)

imputer = ColumnTransformer ([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)  
])

# Transform the data
filled_X = imputer.fit_transform(X)
filled_X

In [None]:
car_sales_filled = pd.DataFrame(filled_X,
                                columns = ["Make", "Colour", "Door", "Odometer (KM)"])

car_sales_filled.head()

In [None]:
car_sales_filled.isna().sum()

In [None]:
car_sales_filled.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Door"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder = "passthrough")

transformed_X = transformer.fit_transform(car_sales_filled)
transformed_X

In [None]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X,
                                                   y,
                                                   test_size = 0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

# 2. Choosing the right estimator/algorithm for our problem

    *Sckit-Learn uses estimator as another term for machine learning mode      or algorithm.

 *Classification - predicting whether a sample is one things or not
 *Regression     - predicting a number
 
     Step 1 - Check the Scikit-Learn machine learning map.

### 2.1 Picking a machine learning mode for a regression problem

In [55]:
# Import Boston housing dataset
from sklearn.datasets import load_boston
boston = load_boston()
boston;

In [56]:
boston_df = pd.DataFrame(boston["data"], columns = boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])
boston_df.head(6)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7


# How many samples?

In [57]:
len(boston_df)

506

In [2]:
# Lets try the Ridge Regression Model
from sklearn.linear_model import Ridge
import numpy as np
#random seed
np.random.seed(42)

# creating the data
X = boston_df.drop("target", axis = 1)
y = boston_df["target"]

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# instantiate ridge model

model = Ridge()
model.fit(X_train, y_train)

# Check the score of the Ridge model on test data
model.score(X_test, y_test)

NameError: name 'boston_df' is not defined

In [59]:
# How do we improve this score?

# Random Forest Regressor

from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = boston_df.drop("target", axis = 1)
y = boston_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

rf = RandomForestRegressor(n_estimators=100) ### Hyperparameter
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.8654448653350507

In [60]:
# Check the Ridge model again
model.score(X_test, y_test)

0.6662221670168522

In [61]:
### 2.2 Choosing and estimator for a classification problem
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head(6)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1


In [62]:
# Linear SVC
from sklearn.svm import LinearSVC

# setting up a random seed
np.random.seed(42)

# making the date
X = heart_disease.drop("target", axis = 1)
y = heart_disease["target"]

#splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# instantiate LinearSVC
clf = LinearSVC(max_iter = 1000)
clf.fit(X_train, y_train)

# Evaluate the Linear SVC
clf.score(X_test, y_test)



0.8688524590163934

In [63]:
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [64]:
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [65]:
# RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# setting up a random seed
np.random.seed(42)

# making the date
X = heart_disease.drop("target", axis = 1)
y = heart_disease["target"]

#splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_train, y_train)

# Evaluate the Linear SVC
clf.score(X_test, y_test)

0.8524590163934426

Tidbit:
    1. If you have structured data, use ensemble methods
    2. If you have untstructured data, use deep learning or
        transfer learning.
    

In [66]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [67]:
### Fit the model/algorithm on our data and use it to make predictions.
    3.1 Fitting the model to the data
    X = features, features variables, data
    y = labels, targets variables

IndentationError: unexpected indent (<ipython-input-67-ee1c22ecb2f7>, line 2)

In [68]:
# RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# setting up a random seed
np.random.seed(42)

# making the date
X = heart_disease.drop("target", axis = 1)
y = heart_disease["target"]

#splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# instantiate RandomForestClassifier 
clf = RandomForestClassifier(n_estimators = 100)

#fit the model to the data(training the machine learning model)
clf.fit(X_train, y_train)

# Evaluate the RandomForestclassifier by using the patterns that model has learned)
clf.score(X_test, y_test)

0.8524590163934426

In [69]:
X.head(6)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1


In [70]:
y.tail(6)

297    0
298    0
299    0
300    0
301    0
302    0
Name: target, dtype: int64

In [71]:
### 3.2 Make prediction using a machine learning model
# now that the models has learned  from the data.
how can we use what its learned to make some predictions on the data the model has not seen yet

    1. predict()
    2. predict_proba()

SyntaxError: invalid syntax (<ipython-input-71-29b7f6590cec>, line 3)

In [89]:
# use a trained model to make predictions

clf.predict(np.array([1, 7, 8, 3, 4]))


ValueError: Expected 2D array, got 1D array instead:
array=[1. 7. 8. 3. 4.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [73]:
X_test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2


In [74]:
clf.predict(X_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [75]:
np.array(y_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [76]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(X_test)
np.mean(y_preds == y_test)

0.8524590163934426

In [77]:
clf.score(X_test, y_test)

0.8524590163934426

In [78]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)
# accuracy classification score

0.8524590163934426

# Make predictions with predict_proba()

In [79]:
probaX = clf.predict_proba(X_test)

In [80]:
clf.predict(X_test[:5])

array([0, 1, 1, 0, 1], dtype=int64)

    25 < 74 so its 0 in predict
    94 > 6 so its 1 in predict

In [81]:
X_test[:5]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2


In [82]:
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [83]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [84]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)


# Creating the data
X = boston_df.drop("target", axis = 1)
y = boston_df["target"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Instantiate and fit the model
model = RandomForestRegressor(n_estimators = 100).fit(X_train, y_train)

y_preds = model.predict(X_test)

In [85]:
y_preds[:10]

array([23.081, 30.574, 16.759, 23.46 , 16.893, 21.644, 19.113, 15.334,
       21.14 , 20.639])

In [86]:
np.array(y_test[:10])

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])

In [87]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

2.136382352941176