In [31]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Generating a synthetic classification dataset
X, y = make_classification(
    n_samples=1000,      # Number of samples
    n_features=5,        # Number of features
    n_informative=5,     # All features are informative
    n_redundant=0,       # No redundant features
    n_clusters_per_class=1,  # Single cluster per class
    random_state=42       # Random seed for reproducibility
)

# Creating a DataFrame for better visualization (optional)
columns = [f"feature_{i+1}" for i in range(X.shape[1])]
df = pd.DataFrame(data=X, columns=columns)
df['target'] = y

# Displaying the dataset
print("Synthetic Classification Dataset:")
print(df.head())


Synthetic Classification Dataset:
   feature_1  feature_2  feature_3  feature_4  feature_5  target
0   0.978226   3.730759   0.592500  -1.996652  -2.622632       1
1   1.259748  -1.418007   0.681518   0.913182   0.869064       0
2   3.017070   2.163255  -0.214399  -1.624660  -0.747437       1
3   0.863346   2.257953   0.579205  -1.406210   0.654031       1
4   1.109924  -0.181526   0.976281  -1.814203  -0.766370       1


In [19]:
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
0,0.978226,3.730759,0.592500,-1.996652,-2.622632,1
1,1.259748,-1.418007,0.681518,0.913182,0.869064,0
2,3.017070,2.163255,-0.214399,-1.624660,-0.747437,1
3,0.863346,2.257953,0.579205,-1.406210,0.654031,1
4,1.109924,-0.181526,0.976281,-1.814203,-0.766370,1
...,...,...,...,...,...,...
995,1.279796,1.081623,-1.091852,-2.100012,-0.652004,1
996,1.152140,1.782861,1.661671,-1.418951,-0.608912,1
997,1.912682,1.736514,1.297099,-0.939928,1.156737,1
998,-1.963771,0.510342,-0.368345,0.622187,2.694235,0


In [2]:
x = list(df.columns.difference(['target']))
y = 'target'

In [3]:
#train-test split
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(df[x], df[y], test_size=0.3, random_state=1)

##### Decision trees

In [34]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, _tree
from sklearn.model_selection import GridSearchCV

In [9]:
param_grid_dt = {'max_depth':[3,4,5], 'max_features':[3,4,5], 'max_leaf_nodes':[5, 10, 20, 30], 'min_samples_split':[7]}

In [10]:
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid=param_grid_dt, 
                       cv=5, n_jobs=-1, verbose=True, scoring='roc_auc')


dt_grid

In [12]:
dt_grid.fit(train_x, train_y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [15]:
print('best_params : ', dt_grid.best_params_)
print('best_score : ', dt_grid.best_score_)

best_params :  {'max_depth': 5, 'max_features': 4, 'max_leaf_nodes': 20, 'min_samples_split': 7}
best_score :  0.9709200957616415


In [14]:
dt_train_score = pd.DataFrame({'y_actual':train_y,'y_pred':dt_grid.predict(train_x)})

dt_test_score = pd.DataFrame({'y_actual':test_y,'y_pred':dt_grid.predict(test_x)})

In [16]:
from sklearn.metrics import classification_report, roc_auc_score

In [17]:
print('For Train data')
print('roc_auc_score', roc_auc_score(dt_train_score.y_actual, dt_train_score.y_pred))
print('_________________________________________________________')
print('For Test data')
print('roc_auc_score', roc_auc_score(dt_test_score.y_actual, dt_test_score.y_pred))

For Train data
roc_auc_score 0.9818477798501358
_________________________________________________________
For Test data
roc_auc_score 0.9626262626262627


In [18]:
print('For Train data')
print(classification_report(dt_train_score.y_actual, dt_train_score.y_pred))
print('_________________________________________________________')
print('For Test data')
print(classification_report(dt_test_score.y_actual, dt_test_score.y_pred))

For Train data
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       366
           1       0.97      0.99      0.98       334

    accuracy                           0.98       700
   macro avg       0.98      0.98      0.98       700
weighted avg       0.98      0.98      0.98       700

_________________________________________________________
For Test data
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       135
           1       0.96      0.97      0.97       165

    accuracy                           0.96       300
   macro avg       0.96      0.96      0.96       300
weighted avg       0.96      0.96      0.96       300



In [51]:
from sklearn.tree import plot_tree

plt.figure(figsize=(15, 10))
plot_tree(dt_grid.best_estimator_, filled=True, feature_names=x, class_names=['0', '1'], rounded=True)
plt.show()

<IPython.core.display.Javascript object>

In [35]:
# Extracting rules from the decision tree
tree_rules = []

def extract_rules(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    def recurse(node):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            rule = f"{name} <= {threshold:.2f}"
            tree_rules.append(rule)
            recurse(tree_.children_left[node])
            
            rule = f"{name} > {threshold:.2f}"
            tree_rules.append(rule)
            recurse(tree_.children_right[node])
        else:
            rule = f"Class {np.argmax(tree_.value[node])}"
            tree_rules.append(rule)

    recurse(0)

extract_rules(dt_grid.best_estimator_, x)

# Displaying extracted rules
print("Extracted Rules:")
for rule in tree_rules:
    print(rule)

Extracted Rules:
feature_2 <= 0.65
feature_5 <= 1.99
feature_2 <= -0.28
feature_5 <= 1.61
feature_4 <= -0.27
Class 1
feature_4 > -0.27
Class 0
feature_5 > 1.61
feature_4 <= 0.41
Class 1
feature_4 > 0.41
Class 0
feature_2 > -0.28
feature_4 <= 0.10
Class 1
feature_4 > 0.10
Class 0
feature_5 > 1.99
feature_4 <= 0.62
feature_3 <= -0.31
Class 0
feature_3 > -0.31
Class 1
feature_4 > 0.62
feature_3 <= 1.63
Class 0
feature_3 > 1.63
Class 1
feature_2 > 0.65
feature_4 <= 0.43
Class 1
feature_4 > 0.43
Class 0


In [38]:
dt_grid.best_params_

{'max_depth': 5,
 'max_features': 4,
 'max_leaf_nodes': 20,
 'min_samples_split': 7}

In [46]:
x_dt = DecisionTreeClassifier(max_depth= 5,max_features= 4,max_leaf_nodes= 20,min_samples_split= 7).fit(train_x, train_y)

In [53]:
plot_tree(x_dt,filled=True,rounded=True)

[Text(0.46296296296296297, 0.9166666666666666, 'x[3] <= -0.008\ngini = 0.499\nsamples = 700\nvalue = [366, 334]'),
 Text(0.24074074074074073, 0.75, 'x[1] <= -1.145\ngini = 0.096\nsamples = 317\nvalue = [16, 301]'),
 Text(0.14814814814814814, 0.5833333333333334, 'x[4] <= 1.376\ngini = 0.499\nsamples = 31\nvalue = [15, 16]'),
 Text(0.07407407407407407, 0.4166666666666667, 'x[0] <= 1.125\ngini = 0.291\nsamples = 17\nvalue = [14, 3]'),
 Text(0.037037037037037035, 0.25, 'gini = 0.0\nsamples = 11\nvalue = [11, 0]'),
 Text(0.1111111111111111, 0.25, 'gini = 0.5\nsamples = 6\nvalue = [3, 3]'),
 Text(0.2222222222222222, 0.4166666666666667, 'x[0] <= -1.632\ngini = 0.133\nsamples = 14\nvalue = [1, 13]'),
 Text(0.18518518518518517, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]'),
 Text(0.25925925925925924, 0.25, 'gini = 0.0\nsamples = 13\nvalue = [0, 13]'),
 Text(0.3333333333333333, 0.5833333333333334, 'x[0] <= -2.337\ngini = 0.007\nsamples = 286\nvalue = [1, 285]'),
 Text(0.2962962962962963, 0.41

In [54]:
x_dt.feature_importances_

array([0.02410442, 0.06683059, 0.00206849, 0.8143248 , 0.0926717 ])

In [79]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
#import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC

In [109]:
svc_mod = SVC(kernel='rbf').fit(train_x, train_y)

In [110]:
svc_train_score = pd.DataFrame({'y_actual':train_y,'y_pred':svc_mod.predict(train_x)})

svc_test_score = pd.DataFrame({'y_actual':test_y,'y_pred':svc_mod.predict(test_x)})

In [111]:
print('For Train data')
print('roc_auc_score', roc_auc_score(svc_train_score.y_actual, svc_train_score.y_pred))
print('_________________________________________________________')
print('For Test data')
print('roc_auc_score', roc_auc_score(svc_train_score.y_actual, svc_train_score.y_pred))

For Train data
roc_auc_score 0.9912797356107458
_________________________________________________________
For Test data
roc_auc_score 0.9912797356107458


In [112]:
print('For Train data')
print(classification_report(svc_test_score.y_actual, svc_test_score.y_pred))
print('_________________________________________________________')
print('For Test data')
print(classification_report(svc_test_score.y_actual, svc_test_score.y_pred))

For Train data
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       135
           1       0.99      0.97      0.98       165

    accuracy                           0.98       300
   macro avg       0.98      0.98      0.98       300
weighted avg       0.98      0.98      0.98       300

_________________________________________________________
For Test data
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       135
           1       0.99      0.97      0.98       165

    accuracy                           0.98       300
   macro avg       0.98      0.98      0.98       300
weighted avg       0.98      0.98      0.98       300

