In [1]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
auto = pd.read_csv('auto.csv')
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 7 columns):
mpg       392 non-null float64
displ     392 non-null float64
hp        392 non-null int64
weight    392 non-null int64
accel     392 non-null float64
origin    392 non-null object
size      392 non-null float64
dtypes: float64(4), int64(2), object(1)
memory usage: 21.5+ KB


In [3]:
X = pd.get_dummies(auto.drop(['mpg'], axis=1))
X.info()
y = auto['mpg']
SEED = 1
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=SEED)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 8 columns):
displ            392 non-null float64
hp               392 non-null int64
weight           392 non-null int64
accel            392 non-null float64
size             392 non-null float64
origin_Asia      392 non-null uint8
origin_Europe    392 non-null uint8
origin_US        392 non-null uint8
dtypes: float64(3), int64(2), uint8(3)
memory usage: 16.5 KB


In [4]:
# Instantiate a DecisionTreeRegressor dt
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.26, random_state=SEED)

In [5]:
from sklearn.model_selection import cross_val_score

# Compute the array containing the 10-folds CV MSEs
MSE_CV_scores = - cross_val_score(dt, X_train, y_train, cv= 10, 
                       scoring='neg_mean_squared_error',
                       n_jobs=-1)

# Compute the 10-folds CV RMSE
RMSE_CV = (MSE_CV_scores.mean())**(1/2)

# Print RMSE_CV
print('CV RMSE: {:.2f}'.format(RMSE_CV))

CV RMSE: 5.14


In [6]:
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE

# Fit dt to the training set
dt.fit(X_train, y_train)

# Predict the labels of the training set
y_pred_train = dt.predict(X_train)

# Evaluate the training set RMSE of dt
RMSE_train = MSE(y_train, y_pred_train)**(1/2)

# Print RMSE_train
print('Train RMSE: {:.2f}'.format(RMSE_train))

Train RMSE: 5.15


dt suffers from high bias because RMSE_CV ≈ RMSE_train and both scores are greater than baseline_RMSE (5.1 from lr model built in Chapter 1.

dt is underfitting the training set as the model is too constrained to capture the nonlinear dependencies between features and labels.

In [7]:
ilp = pd.read_csv("indian_liver_patient_preprocessed.csv")
ilp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 579 entries, 0 to 578
Data columns (total 12 columns):
Unnamed: 0                        579 non-null int64
Age_std                           579 non-null float64
Total_Bilirubin_std               579 non-null float64
Direct_Bilirubin_std              579 non-null float64
Alkaline_Phosphotase_std          579 non-null float64
Alamine_Aminotransferase_std      579 non-null float64
Aspartate_Aminotransferase_std    579 non-null float64
Total_Protiens_std                579 non-null float64
Albumin_std                       579 non-null float64
Albumin_and_Globulin_Ratio_std    579 non-null float64
Is_male_std                       579 non-null int64
Liver_disease                     579 non-null int64
dtypes: float64(9), int64(3)
memory usage: 54.4 KB


In [8]:
X = ilp.drop(['Liver_disease'], axis=1)
X.info()
y = ilp['Liver_disease']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 579 entries, 0 to 578
Data columns (total 11 columns):
Unnamed: 0                        579 non-null int64
Age_std                           579 non-null float64
Total_Bilirubin_std               579 non-null float64
Direct_Bilirubin_std              579 non-null float64
Alkaline_Phosphotase_std          579 non-null float64
Alamine_Aminotransferase_std      579 non-null float64
Aspartate_Aminotransferase_std    579 non-null float64
Total_Protiens_std                579 non-null float64
Albumin_std                       579 non-null float64
Albumin_and_Globulin_Ratio_std    579 non-null float64
Is_male_std                       579 non-null int64
dtypes: float64(9), int64(2)
memory usage: 49.8 KB


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [10]:
# Set seed for reproducibility
SEED=1

# Instantiate lr
lr = LogisticRegression(random_state=SEED)

# Instantiate knn
knn = KNN(n_neighbors=27)

# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)

# Define the list classifiers
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]

In [11]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=SEED)

In [12]:
# Iterate over the pre-defined list of classifiers
for clf_name, clf in classifiers:    
 
    # Fit clf to the training set
    clf.fit(X_train, y_train)    
   
    # Predict y_pred
    y_pred = clf.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred) 
   
    # Evaluate clf's accuracy on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

Logistic Regression : 0.747
K Nearest Neighbours : 0.724
Classification Tree : 0.730




In [13]:
# Import VotingCLassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators=classifiers)     

# Fit vc to the training set
vc.fit(X_train, y_train)   

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))

Voting Classifier: 0.753


