In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
df = pd.read_csv('bike.csv')
df.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,679,2018-05-01 00:00:00,2018-05-01 00:11:19,31302,Wisconsin Ave & Newark St NW,31307,3000 Connecticut Ave NW / National Zoo,W22771,Member
1,578,2018-05-01 00:00:20,2018-05-01 00:09:59,31232,7th & F St NW / National Portrait Gallery,31609,Maine Ave & 7th St SW,W21320,Casual
2,580,2018-05-01 00:00:28,2018-05-01 00:10:09,31232,7th & F St NW / National Portrait Gallery,31609,Maine Ave & 7th St SW,W20863,Casual
3,606,2018-05-01 00:01:22,2018-05-01 00:11:29,31104,Adams Mill & Columbia Rd NW,31509,New Jersey Ave & R St NW,W00822,Member
4,582,2018-05-01 00:04:52,2018-05-01 00:14:34,31129,15th St & Pennsylvania Ave NW/Pershing Park,31118,3rd & Elm St NW,W21846,Member


In [3]:
df.shape

(374115, 9)

In [4]:
X = df.iloc[:,[0,3,5]]
Y = df.iloc[:,-1]

In [5]:
X.head()

Unnamed: 0,Duration,Start station number,End station number
0,679,31302,31307
1,578,31232,31609
2,580,31232,31609
3,606,31104,31509
4,582,31129,31118


In [6]:
Y.value_counts()

Member    271812
Casual    102303
Name: Member type, dtype: int64

In [7]:
Y = Y.map({'Member' : 0, 'Casual' : 1})
Y.value_counts()

0    271812
1    102303
Name: Member type, dtype: int64

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

In [9]:
classifier = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=4,random_state=42)
classifier.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [10]:
Y_pred = classifier.predict(X_test)
print(f"ACCURACY : {accuracy_score(Y_pred, Y_test) * 100}")
print(f"CONFUSION MATRIX : \n{confusion_matrix(Y_test, Y_pred)}")
print(f"CLASSIFICATION REPORT : \n{classification_report(Y_test, Y_pred)}")

ACCURACY : 80.52203199550941
CONFUSION MATRIX : 
[[48527  5852]
 [ 8722 11722]]
CLASSIFICATION REPORT : 
              precision    recall  f1-score   support

           0       0.85      0.89      0.87     54379
           1       0.67      0.57      0.62     20444

    accuracy                           0.81     74823
   macro avg       0.76      0.73      0.74     74823
weighted avg       0.80      0.81      0.80     74823



In [11]:
param_grid = {'criterion' : ['entropy', 'gini'], 'min_samples_leaf' : [1,2,3,4]}
model = DecisionTreeClassifier()
grid = GridSearchCV(model, param_grid=param_grid, cv=10, verbose=2, n_jobs=-1, scoring='accuracy')
grid.fit(X_train, Y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   35.7s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['entropy', 'gini'],
                         'min_samples_leaf': [1, 2, 3, 4]},
             pre_d

In [12]:
grid.best_params_

{'criterion': 'gini', 'min_samples_leaf': 4}

In [13]:
classifier = DecisionTreeClassifier(criterion='gini', min_samples_leaf=4, random_state=42)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
print(f"ACCURACY : {accuracy_score(Y_pred, Y_test) * 100}")

ACCURACY : 80.69042941341567
