# This noteboook contains the functions of sklearn

What we are going to cover:
0. End to end sklearn workflow
1. Getting the data ready
2. Choosing the right estimator/algo for our problem
3. Fit the model on the data and make predictions
4. Evaluate the model
5. Improve the model
6. Save and train the model
8. Putting it all together

# 0. End to end sklearn workflow

In [57]:
what_we_are_going_to_do = [
'0. End to end sklearn workflow',
'1. Getting the data ready',
'2. Choosing the right estimator/algo for our problem',
'3. Fit the model on the data and make predictions',
'4. Evaluate the model',
'5. Improve the model',
'6. Save and train the model',
'8. Putting it all together',
]


In [1]:
# 1. Getting the data ready
import pandas as pd
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [6]:
# Create X label
X = heart_disease.drop("target" , axis = 1)
X
# Create Y label
Y = heart_disease['target']
Y

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [44]:
# 2. Choosing the right model
from sklearn.ensemble import RandomForestClassifier


clrf = RandomForestClassifier()
# We'll keep the default parameters
clrf.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [45]:
#3. Fit the model to the training data
from sklearn.model_selection import train_test_split

x_train , x_test , y_train , y_test = train_test_split(X , Y , test_size=0.2)

In [46]:
clrf.fit(x_train , y_train)

In [47]:
# Make Prediction
y_preds = clrf.predict(x_test)
y_preds

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)

In [48]:
# 4. Evaluate the model
clrf.score(x_train , y_train)

1.0

In [49]:
clrf.score(x_test , y_test)

0.7540983606557377

In [50]:
from sklearn.metrics import confusion_matrix , accuracy_score, classification_report

print(classification_report(y_test , y_preds))

              precision    recall  f1-score   support

           0       0.79      0.71      0.75        31
           1       0.73      0.80      0.76        30

    accuracy                           0.75        61
   macro avg       0.76      0.75      0.75        61
weighted avg       0.76      0.75      0.75        61



In [51]:
confusion_matrix(y_test , y_preds)

array([[22,  9],
       [ 6, 24]], dtype=int64)

In [52]:
accuracy_score(y_test , y_preds)

0.7540983606557377

In [53]:
# 5. Improve the model
import numpy as np
# Try different nestimators (hyperparamter)

np.random.seed(0)

for i in range(10 , 100 , 10):
    print(f"Trying with {i} estimators...")
    clrf = RandomForestClassifier(n_estimators=i).fit(x_train , y_train)
    print(f"Model with accuracy: {clrf.score(x_test , y_test)}")

Trying with 10 estimators...
Model with accuracy: 0.7540983606557377
Trying with 20 estimators...
Model with accuracy: 0.8032786885245902
Trying with 30 estimators...
Model with accuracy: 0.819672131147541
Trying with 40 estimators...
Model with accuracy: 0.8032786885245902
Trying with 50 estimators...
Model with accuracy: 0.819672131147541
Trying with 60 estimators...
Model with accuracy: 0.8032786885245902
Trying with 70 estimators...
Model with accuracy: 0.7704918032786885
Trying with 80 estimators...
Model with accuracy: 0.8032786885245902
Trying with 90 estimators...
Model with accuracy: 0.819672131147541


In [54]:
# 6.  Save a model and load it

import pickle
pickle.dump(clrf , open("random-forest-model1.pk1" , "wb"))

In [56]:
load_model = pickle.load(open("random-forest-model1.pk1","rb"))
load_model.score(x_test , y_test)

0.819672131147541

In [58]:
what_we_are_going_to_do

['0. End to end sklearn workflow',
 '1. Getting the data ready',
 '2. Choosing the right estimator/algo for our problem',
 '3. Fit the model on the data and make predictions',
 '4. Evaluate the model',
 '5. Improve the model',
 '6. Save and train the model',
 '8. Putting it all together']

--
# 1. Getting Data Ready
This part typically contains three steps:
1. Split the data into features and labels (X and Y)
2. Filling (also called imputing) or disregarding the missing value
3. Converting non numerical values to numerical (feature encoding)

In [2]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
X = heart_disease.drop("target" , axis = 1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [4]:
Y = heart_disease['target']
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

x_train , x_test , y_train , y_test = train_test_split(X , Y , test_size=0.2)

## 1. Make sure data is numerical

In [27]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head(20)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
5,Honda,Red,42652,4,23883
6,Toyota,Blue,163453,4,8473
7,Honda,White,43120,4,20306
8,Nissan,White,130538,4,9374
9,Honda,Blue,51029,4,26683


In [28]:
len(car_sales)

1000

In [29]:
# Split into X and Y
X = car_sales.drop("Price" , axis = 1)
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [30]:
Y = car_sales["Price"]
Y.head()

0    15323
1    19943
2    28343
3    13434
4    14043
Name: Price, dtype: int64

In [31]:
x_train , x_test , y_train , y_test = train_test_split(X , Y , test_size = 0.2)

In [32]:
# Build ML model
from sklearn.ensemble import RandomForestRegressor

In [33]:
model = RandomForestRegressor()
model.fit(x_train , y_train)
model.predict(x_test)

ValueError: could not convert string to float: 'Honda'

In [34]:
# Turn categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make" , "Colour" , "Doors"] # Doors because it has also only 3 values
one_hot =  OneHotEncoder()
transformer = ColumnTransformer([("onehot" , one_hot , categorical_features)] , remainder = "passthrough")

transformed_x = transformer.fit_transform(X)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [46]:
X

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
995,Toyota,Black,35820,4
996,Nissan,White,155144,3
997,Nissan,Blue,66604,4
998,Honda,White,215883,4


In [45]:
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [48]:
pd.get_dummies(car_sales[["Make" , "Colour" , "Doors"]])

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [42]:
import numpy as np
np.random.seed(42)
x_train , x_test , y_train , y_test = train_test_split(transformed_x , Y , test_size = 0.2)

In [43]:
model.fit(x_train , y_train)

In [44]:
model.score(x_test , y_test)

0.3235867221569877

### 1.2 Data with missing values
1. Fill with some value also called as imputation
2. Remove data with missing data

In [58]:
car_sales_missing_data = pd.read_csv("car-sales-extended-missing-data.csv")

In [59]:
car_sales_missing_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [60]:
X = car_sales_missing_data.drop("Price" , axis=1)
Y = car_sales_missing_data["Price"]

In [63]:
# Turn categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make" , "Colour" , "Doors"] # Doors because it has also only 3 values
one_hot =  OneHotEncoder()
transformer = ColumnTransformer([("onehot" , one_hot , categorical_features)] , remainder = "passthrough")

transformed_x = transformer.fit_transform(car_sales_missing_data)
transformed_x

<1000x17 sparse matrix of type '<class 'numpy.float64'>'
	with 5000 stored elements in Compressed Sparse Row format>

In [62]:
car_sales_missing_data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [64]:
car_sales_missing_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

## Option 1: Fill missing values using Pandas

In [66]:
# Filling missing values depending on the type of the feature
car_sales_missing_data["Make"].fillna("missing" , inplace=True)
car_sales_missing_data["Colour"].fillna("missing" , inplace=True)
car_sales_missing_data["Odometer (KM)"].fillna(car_sales_missing_data["Odometer (KM)"].mean() , inplace=True)
car_sales_missing_data["Doors"].fillna(4 , inplace=True)

In [67]:
car_sales_missing_data.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [68]:
car_sales_missing_data.dropna(inplace=True)

In [69]:
car_sales_missing_data.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [70]:
len(car_sales_missing_data)

950

In [71]:
# Turn categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make" , "Colour" , "Doors"] # Doors because it has also only 3 values
one_hot =  OneHotEncoder()
transformer = ColumnTransformer([("onehot" , one_hot , categorical_features)] , remainder = "passthrough")

transformed_x = transformer.fit_transform(car_sales_missing_data)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

## Option 2: Filing missing value with scikit learn

In [72]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

car_sales_missing_data = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [73]:
# Dropiing the rows with no price label.
car_sales_missing_data.dropna(subset = "Price" , inplace=True)
car_sales_missing_data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [74]:
# Splitting data into X and Y labels
X = car_sales_missing_data.drop("Price" , axis=1)
Y = car_sales_missing_data["Price"]

In [75]:
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [76]:
# Define the imputers and transform the X data
cat_imputer = SimpleImputer(strategy="constant" , fill_value="missing")
door_imputer = SimpleImputer(strategy="constant" , fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# DEfining the categories
cat_col = ["Colour" , "Make"]
num_col = ["Odometer (KM)"]
door_col = ["Doors"]

transformer = ColumnTransformer([
    ("cat_imputer" , cat_imputer , cat_col),
    ("num_imputer" , num_imputer , num_col),
    ("door_imputer" , door_imputer , door_col),
])
filled_X = transformer.fit_transform(X)
filled_X

array([['White', 'Honda', 35431.0, 4.0],
       ['Blue', 'BMW', 192714.0, 5.0],
       ['White', 'Honda', 84714.0, 4.0],
       ...,
       ['Blue', 'Nissan', 66604.0, 4.0],
       ['White', 'Honda', 215883.0, 4.0],
       ['Blue', 'Toyota', 248360.0, 4.0]], dtype=object)

In [78]:
car_sales_filled = pd.DataFrame(filled_X , columns=["Colour","Make" , "Odometer (KM)" , "Doors"])
car_sales_filled

Unnamed: 0,Colour,Make,Odometer (KM),Doors
0,White,Honda,35431.0,4.0
1,Blue,BMW,192714.0,5.0
2,White,Honda,84714.0,4.0
3,White,Toyota,154365.0,4.0
4,Blue,Nissan,181577.0,3.0
...,...,...,...,...
945,Black,Toyota,35820.0,4.0
946,White,missing,155144.0,3.0
947,Blue,Nissan,66604.0,4.0
948,White,Honda,215883.0,4.0


In [79]:
# Turn categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make" , "Colour" , "Doors"] # Doors because it has also only 3 values
one_hot =  OneHotEncoder()
transformer = ColumnTransformer([("onehot" , one_hot , categorical_features)] , remainder = "passthrough")

transformed_x = transformer.fit_transform(car_sales_filled)
transformed_x

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [80]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)
x_train , x_test , y_train , y_test = train_test_split(transformed_x , Y , test_size = 0.2)

In [81]:
model = RandomForestRegressor()
model.fit(x_train , y_train)
model.score(x_test , y_test)

0.21990196728583944

# 2. Choosing the right estimator
https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In sklearn -> estimator = model or algorithm

In [83]:
#Load the boston data set
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [84]:
housing_df = pd.DataFrame(housing["data"] , columns=housing["feature_names"])
housing_df["target"]=housing["target"]
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [85]:
len(housing_df)

20640

In [86]:
X = housing_df.drop("target" ,axis=1)
Y = housing_df["target"]

In [89]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split



np.random.seed(42)
x_train , x_test , y_train , y_test = train_test_split(X , Y , test_size = 0.2)

model = Ridge()
model.fit(x_train , y_train)
model.score(x_test , y_test)

0.5758549611440126

# How to improve the score?
# What if the Ridge was not working?

In [91]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.804420993130102

# 2.1 Choosing the right estimator for a classification problem

In [92]:
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [93]:
len(heart_disease)

303

##### Consulted the map which tells to use LinearSVC

In [109]:
from sklearn.svm import LinearSVC

model1 = LinearSVC(max_iter=10000)

X = heart_disease.drop("target" , axis=1)
Y = heart_disease['target']
np.random.seed(42)
x_train , x_test , y_train , y_test = train_test_split(X , Y , test_size = 0.2)

model1.fit(x_train , y_train)
model1.score(x_test , y_test)




0.8688524590163934

In [110]:
heart_disease["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [111]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.8524590163934426

## 3. Fit the model on our data and make predictions
### 3.1 Fitting the model

In [112]:
from sklearn.svm import LinearSVC

model1 = LinearSVC(max_iter=10000)

X = heart_disease.drop("target" , axis=1)
Y = heart_disease['target']
np.random.seed(42)
x_train , x_test , y_train , y_test = train_test_split(X , Y , test_size = 0.2)
# Fit the model. It finds patterns in the training data and compare it with the y_train in case of supervised learning
model1.fit(x_train , y_train)
model1.score(x_test , y_test)




0.8688524590163934

### 3.2 Making predications

Using `predict` and `predict_proba()` function.
