In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [11]:
orig_data = pd.read_csv(r"datasets/Insurance/final_data.csv")  # Loading data

listOfX = [col for col in orig_data.columns if col not in ["CarInsurance", "Id"]]  # List of parameters

orig_data = orig_data.reset_index(drop=True)
orig_data = orig_data.drop(["Id"], axis=1)

print(orig_data.head())
inputData = orig_data[listOfX]  # Getting parameters
outputData = orig_data["CarInsurance"]  # Getting answers

X_train, X_test, y_train, y_test = train_test_split(inputData, outputData, random_state=228, test_size=0.2)  # Spliting data
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

   CallTime  LastContactMonth  PrevAttempts  DaysPassed  HHInsurance  \
0        70                 0             0          -1            1   
1       185                 1             0          -1            1   
2       340                 2             1         119            1   
3       819                 1             0          -1            1   
4       192                 2             0          -1            0   

   NoOfContacts  CarInsurance  
0             2             0  
1             5             0  
2             1             1  
3             2             1  
4             1             0  


In [12]:
X_train

Unnamed: 0,CallTime,LastContactMonth,PrevAttempts,DaysPassed,HHInsurance,NoOfContacts
0,139,5,0,-1,0,2
1,1230,1,0,-1,1,3
2,72,1,0,-1,0,3
3,296,1,0,-1,1,1
4,1080,5,0,-1,0,3
...,...,...,...,...,...,...
3051,96,1,0,-1,1,3
3052,197,5,2,118,0,2
3053,19,10,0,-1,0,1
3054,47,6,12,779,1,1


In [48]:
y_train

0       0
1       1
2       0
3       0
4       1
       ..
3051    0
3052    0
3053    0
3054    0
3055    1
Name: CarInsurance, Length: 3056, dtype: int64

In [96]:
class Node:
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None


class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.feature_prun = 0.1

    def fit(self, X, y, random_feature = False):
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y, random_feature)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]
    
    def _best_split(self, X, y,random_feature):
        m = y.size
        if m <= 1:
            return None, None
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None
        for idx in range(self.n_features_):
            if(np.random.randint(0, 11) <= self.feature_prun*10):
                continue
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr

    def _grow_tree(self, X, y,random_feature, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(predicted_class=predicted_class)
        if depth < self.max_depth:
            idx, thr = self._best_split(X, y,random_feature)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left,random_feature, depth + 1)
                node.right = self._grow_tree(X_right, y_right,random_feature, depth + 1)
        return node

    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

In [136]:
dt = DecisionTree(max_depth=10)
dt.fit(X_train.to_numpy(), y_train.to_numpy())

predicted_classes = dt.predict(X_train.to_numpy())
train_accuracy = accuracy_score(predicted_classes, y_train)

predicted_classes = dt.predict(X_test.to_numpy())
test_accuracy = accuracy_score(predicted_classes, y_test)


print(train_accuracy)
print(test_accuracy)


0.8821989528795812
0.7814136125654451


In [134]:
from sklearn import tree

ds_sklearn = tree.DecisionTreeClassifier(max_depth=10)
ds_sklearn = ds_sklearn.fit(X_train, y_train)

In [137]:
predicted_classes = ds_sklearn.predict(X_train.to_numpy())
train_accuracy = accuracy_score(predicted_classes, y_train)

predicted_classes = ds_sklearn.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test)

print("Scikit-learn:")
print(train_accuracy)
print(test_accuracy)


Scikit-learn:
0.8910340314136126
0.7853403141361257
