### Decision Trees (CLASSIFIER AND REGRESSOR)

### DECISION TREE REGRESSOR

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Directory that holds the CSV files. It defaults to the location the notebook
# was originally written against; set the ML_NOTES_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "ML_NOTES_DATA_DIR",
    r'C:\Users\hp\Desktop\New folder\machine-learning-notes\data',
))

# Load the insurance data set and preview its first rows.
data = pd.read_csv(DATA_DIR / 'insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# preprocess the data.

# Separate the predictors from the regression target ('charges').
X = data.drop(columns=['charges'])
y = data['charges']


# encode the data
# Pick the text-typed columns from X itself rather than from `data`, so the
# target column can never end up in the list of feature columns to encode.
cat_cols = list(X.select_dtypes(include="object").columns)

# Fit a fresh LabelEncoder per column and keep each one. Re-fitting a single
# shared encoder replaces its learned classes on every iteration, so only the
# mapping of the last column could be inspected or inverted afterwards.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder is the feature-side equivalent if this is ever revisited.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])
    encoders[col] = encoder

X.head()
    

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.77,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.88,0,0,1


In [4]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

# Standardise the features: the statistics are learned from the training rows
# only and then applied unchanged to the test rows.
# NOTE(review): tree splits are threshold-based, so scaling probably does not
# change the fitted trees - kept because the original pipeline does it.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Train the Base Model

In [5]:
# Baseline regressor: only the seed is set, every other hyper-parameter is left
# at its default.
model = DecisionTreeRegressor(random_state=23).fit(X_train, y_train)

# Error on the rows the tree was trained on ...
train_preds = model.predict(X_train)
train_rmse = root_mean_squared_error(y_train, train_preds)

# ... and on the held-out rows. Both values are RMSE, so lower is better.
test_preds = model.predict(X_test)
test_rmse = root_mean_squared_error(y_test, test_preds)

print(f'Train score: {train_rmse}')
print(f'Test score: {test_rmse}')

Train score: 413.77689999135333
Test score: 6078.2133795882


## A Slightly Optimized Model (limited tree depth)

In [8]:
# Same seed as the baseline, but the tree is limited to a depth of 4.
model = DecisionTreeRegressor(random_state=23, max_depth=4).fit(X_train, y_train)

# Error of the depth-limited tree on the training rows ...
train_preds = model.predict(X_train)
train_rmse = root_mean_squared_error(y_train, train_preds)

# ... and on the held-out rows. Both values are RMSE, so lower is better.
test_preds = model.predict(X_test)
test_rmse = root_mean_squared_error(y_test, test_preds)

print(f'Train score: {train_rmse}')
print(f'Test score: {test_rmse}')

Train score: 4387.8194415568905
Test score: 4519.112521297888


### Classification

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [11]:
# Directory that holds the CSV files. It defaults to the location the notebook
# was originally written against; set the ML_NOTES_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "ML_NOTES_DATA_DIR",
    r'C:\Users\hp\Desktop\New folder\machine-learning-notes\data',
))

# Load the heart-disease data set, using the first CSV column as the row
# index, and preview its first rows.
data = pd.read_csv(DATA_DIR / 'heart_disease.csv', index_col=0)
data.head()

Unnamed: 0,age,sex,chest pain type,resting bps,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289.0,0,0,172,0,0.0,1,0
1,49,0,3,160,180.0,0,0,156,0,1.0,2,1
2,37,1,2,130,283.0,0,1,98,0,0.0,1,0
3,48,0,4,138,214.0,0,0,108,1,1.5,2,1
4,54,1,3,150,195.0,0,0,122,0,0.0,1,0


In [13]:
# Split the frame into the label column and the predictors.
# `data` itself is left unmodified: drop() returns a new frame.
y = data['target']
X = data.drop(columns='target')


In [15]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
    stratify=y,
)

# Standardise the features: the statistics are learned from the training rows
# only and then applied unchanged to the test rows.
# NOTE(review): tree splits are threshold-based, so scaling probably does not
# change the fitted trees - kept because the original pipeline does it.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Train the Base Model

In [18]:
# Baseline classifier: only the seed is set, every other hyper-parameter is
# left at its default.
model = DecisionTreeClassifier(random_state=23).fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Per-class precision / recall / F1 on the training rows, then on the
# held-out rows, separated by a row of asterisks.
train_report = classification_report(y_train, train_preds)
print(f'train: {train_report}')
print("*" * 60)
test_report = classification_report(y_test, test_preds)
print(f'test: {test_report}')

train:               precision    recall  f1-score   support

           0       1.00      1.00      1.00       422
           1       1.00      1.00      1.00       416

    accuracy                           1.00       838
   macro avg       1.00      1.00      1.00       838
weighted avg       1.00      1.00      1.00       838

************************************************************
test:               precision    recall  f1-score   support

           0       0.72      0.74      0.73       106
           1       0.72      0.70      0.71       104

    accuracy                           0.72       210
   macro avg       0.72      0.72      0.72       210
weighted avg       0.72      0.72      0.72       210



### Apply Early Stopping (limit the tree depth with `max_depth`)

In [24]:
# Same seed as the baseline, but the tree is limited to a depth of 3.
model = DecisionTreeClassifier(random_state=23, max_depth=3).fit(X_train, y_train)

train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# Per-class precision / recall / F1 of the depth-limited tree on the training
# rows, then on the held-out rows, separated by a row of asterisks.
train_report = classification_report(y_train, train_preds)
print(f'train: {train_report}')
print("*" * 60)
test_report = classification_report(y_test, test_preds)
print(f'test: {test_report}')

train:               precision    recall  f1-score   support

           0       0.85      0.73      0.79       422
           1       0.76      0.87      0.81       416

    accuracy                           0.80       838
   macro avg       0.81      0.80      0.80       838
weighted avg       0.81      0.80      0.80       838

************************************************************
test:               precision    recall  f1-score   support

           0       0.81      0.73      0.77       106
           1       0.75      0.83      0.79       104

    accuracy                           0.78       210
   macro avg       0.78      0.78      0.78       210
weighted avg       0.78      0.78      0.78       210

