**Decision tree model**


In [2]:
# Importing the required packages 
import numpy as np 
import pandas as pd 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [3]:
# Load data
y_test = pd.read_csv("data/y_test.csv")
y_train = pd.read_csv("data/y_train.csv")
X_test_scaled = pd.read_csv("data/X_test_scaled.csv")
X_train_scaled = pd.read_csv("data/X_train_scaled.csv")

In [4]:
# Function to perform training with giniIndex. 
def train_using_gini(X_train, y_train): 
        """_summary_
        - trains decision tree classifier with training data, gini criterion, max questions: 3, min samples per leaf: 5
        Args:
            X_train (_dataframe_): _trainingdata_
            y_train (_series_): _target-training-data_

        Returns:
            _classifier object_: _DecisionTreeClassifier_
        """
    
    # Creating the classifier object
        clf_gini = DecisionTreeClassifier(
            criterion = "gini", 
            max_depth = 2, min_samples_leaf = 9) 
    # Performing training 
        clf_gini.fit(X_train, y_train) 
    

        return clf_gini

In [5]:
# Function to perform training with entropy. 
def train_using_entropy(X_train, y_train): 
    """_summary_
        - trains decision tree classifier with training data, criterion: entropy (Shannon entropy), max questions: 3, min samples per leaf: 5
        https://scikit-learn.org/stable/modules/tree.html#tree-mathematical-formulation
        Args:
            X_train (_dataframe_): _trainingdata_
            y_train (_series_): _target-training-data_

        Returns:
            _classifier object_: _DecisionTreeClassifier_
        """
  
    # Decision tree with entropy 
    clf_entropy = DecisionTreeClassifier( 
            criterion = "entropy",
            max_depth = 2, min_samples_leaf = 9)
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

    

# Randomized search / Parameter tuning

In [6]:


# param_grid = {"C" : [0, 0.1, 0.25, 0.5, 1, 2],
#               "fit_intercept" : [True, False],
#               "penalty" : ["l2", "l1", None,'elasticnet'],
#               "class_weight" : ['balanced', None],
#               "solver" : ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
#               "l1_ratio" : [0.1,0.25,0.5,0.75,0.9]
#              }

param_grid = {"max_depth": [2, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "min_samples_split": randint(1,100),
              "criterion": ["gini", "entropy"]}

rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), param_grid, scoring='accuracy',
                  cv=5, verbose=0, n_jobs=-1, n_iter=15, random_state=42)

#rs.fit(X_train_scaled, y_train)

rs.fit(X_test_scaled, y_test)

In [7]:
y_pred_rs = rs.predict(X_test_scaled)
print(accuracy_score(y_test, y_pred_rs))

0.8852040816326531


In [8]:
print('Best score:', round(rs.best_score_, 3))

y_pred_rs = rs.predict(X_test_scaled.to_numpy())

print(confusion_matrix(y_test, y_pred_rs))
print(classification_report(y_test, y_pred_rs))
print(accuracy_score(y_test, y_pred_rs))

Best score: 0.878
[[4901  151]
 [ 524  304]]
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.94      5052
         1.0       0.67      0.37      0.47       828

    accuracy                           0.89      5880
   macro avg       0.79      0.67      0.70      5880
weighted avg       0.87      0.89      0.87      5880

0.8852040816326531




In [9]:
print("Tuned Decision Tree Parameters: {}".format(rs.best_params_))
print("Best score is {}".format(rs.best_score_))

Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 6, 'min_samples_leaf': 2, 'min_samples_split': 64}
Best score is 0.8778911564625851


# Predicting

In [10]:
# Function to make predictions 
def prediction(X_test, clf_object): 
    """_summary_

    Args:
        X_test (_dataframe_): _features_
        clf_object (_trained classification object_): __

    Returns:
        _series_: _predicted target values_
    """
  
    # Prediction on test 
    y_pred = clf_object.predict(X_test.to_numpy()) # transform it into a numpy array
    print("Predicted values:\n") 
    print(y_pred) 
    return y_pred 

# Measuring

In [11]:
# Function to calculate accuracy 
def cal_accuracy(y_test, y_pred): 
    """_summary_
    prints: convfusion matrix, accuracy score, classification report
    Args:
        y_test (_series_): _true target values_
        y_pred (_series_): _target values_
    """
    
    print("-----"*15)
    print("Confusion Matrix: \n", 
        confusion_matrix(y_test, y_pred)) 
    
    print("-----"*15)
    print ("Accuracy : \n", 
    accuracy_score(y_test, y_pred) * 100) 
    
    print("-----"*15)
    print("Report : \n", 
    classification_report(y_test, y_pred)) 

In [12]:
# Driver code 
def main(): 
    """
    runs following functions in order:
    importdata()
    splitdataset()
    trainusinggini
    trainusingentropy
    predictionfunction using gini model
    predictionfunction using entropy model
    cal_accuracy(): - gini
                    - entropy
    """
      
    # Building Phase 
    #data = importdata() 
    #X, Y, X_train, X_test, y_train, y_test = splitdataset(data) 
    clf_gini = train_using_gini(X_train_scaled, y_train) 
    clf_entropy = train_using_entropy(X_train_scaled, y_train) 
      
    # Operational Phase 
    print("-----"*15)
    print("Results Using Gini Index:\n") 
      
    # Prediction using gini 
    y_pred_gini = prediction(X_test_scaled, clf_gini) 
    cal_accuracy(y_test, y_pred_gini) 
    
    print("-----"*15)
    print("Results Using Entropy:\n") 
    # Prediction using entropy 
    y_pred_entropy = prediction(X_test_scaled, clf_entropy) 
    cal_accuracy(y_test, y_pred_entropy)

In [13]:
# Calling main function 
if __name__ == "__main__": 
    main() 

---------------------------------------------------------------------------
Results Using Gini Index:

Predicted values:

[0. 0. 0. ... 0. 0. 0.]
---------------------------------------------------------------------------
Confusion Matrix: 
 [[4961   91]
 [ 625  203]]
---------------------------------------------------------------------------
Accuracy : 
 87.82312925170068
---------------------------------------------------------------------------
Report : 
               precision    recall  f1-score   support

         0.0       0.89      0.98      0.93      5052
         1.0       0.69      0.25      0.36       828

    accuracy                           0.88      5880
   macro avg       0.79      0.61      0.65      5880
weighted avg       0.86      0.88      0.85      5880

---------------------------------------------------------------------------
Results Using Entropy:

Predicted values:

[0. 0. 0. ... 0. 0. 0.]
-------------------------------------------------------------------

