In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
# Load the drug-prescription dataset from the project-relative datasets folder.
# A forward slash is used as the separator so the same relative path resolves
# on Windows, Linux and macOS (a backslash is only a separator on Windows).
df = pd.read_csv("datasets/drug.csv")
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [4]:
# Integer-encode the categorical feature columns in place.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# code -> category mapping of every column stays available afterwards
# (e.g. label_encoders['BP'].inverse_transform(...)). Refitting one shared
# encoder would only remember the classes of the last column.
label_encoders = {}
for column in ('Sex', 'BP', 'Cholesterol'):
    label = LabelEncoder()
    df[column] = label.fit_transform(df[column])
    label_encoders[column] = label
# After the loop `label` is the encoder fitted on 'Cholesterol'.
df



Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,drugY
1,47,1,1,0,13.093,drugC
2,47,1,1,0,10.114,drugC
3,28,0,2,0,7.798,drugX
4,61,0,1,0,18.043,drugY
...,...,...,...,...,...,...
195,56,0,1,0,11.567,drugC
196,16,1,1,0,12.006,drugC
197,52,1,2,0,9.894,drugX
198,23,1,2,1,14.020,drugX


In [5]:
# Feature matrix: every column except the target
x = df.drop(columns='Drug')
# Target vector: the 'Drug' column
y = df.loc[:, 'Drug']

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)



In [7]:
# Unrestricted tree: no limit on the depth (height) of the tree.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so an unseeded tree may differ from run to run.
dtc = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtc.fit(x_train, y_train)
y_pred = dtc.predict(x_test)
# Tabulate the feature importances, one row per feature name
df1 = pd.DataFrame(data=dtc.feature_importances_, index=x.columns)
df1


Unnamed: 0,0
Age,0.10734
Sex,0.0
BP,0.317269
Cholesterol,0.080934
Na_to_K,0.494457


In [8]:
# Limit the tree to a maximum depth of 2.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so an unseeded tree may differ from run to run.
dtc1 = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=0)
dtc1.fit(x_train, y_train)
y_pred_train = dtc1.predict(x_train)
y_pred_test = dtc1.predict(x_test)

# Compare accuracy on the training split and on the held-out split
print("train Accuracy :", accuracy_score(y_train, y_pred_train))
print("test Accuracy :", accuracy_score(y_test, y_pred_test))
# Tabulate the feature importances, one row per feature name
df2 = pd.DataFrame(data=dtc1.feature_importances_, index=x.columns)
df2


train Accuracy : 0.825
test Accuracy : 0.9


Unnamed: 0,0
Age,0.0
Sex,0.0
BP,0.349451
Cholesterol,0.0
Na_to_K,0.650549


In [9]:
# Pre-pruning with min_impurity_decrease: a node is split only if the split
# lowers the weighted impurity (entropy here) by at least 0.2.
# NOTE: this is not the same rule as "stop once a node is 80% pure"; it
# thresholds the impurity *decrease* of a split, not the class purity of a node.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so an unseeded tree may differ from run to run.
dtc2 = DecisionTreeClassifier(criterion='entropy', min_impurity_decrease=0.2, random_state=0)
dtc2.fit(x_train, y_train)
y_pred_test = dtc2.predict(x_test)
print(accuracy_score(y_test, y_pred_test))




0.925


In [10]:
# 10-fold cross-validation on the training split
# Depth 2
from sklearn.model_selection import cross_val_score
# random_state is pinned so the fitted trees (and the scores) are repeatable.
dtc3 = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=0)
scores = cross_val_score(dtc3, x_train, y_train, cv=10, scoring='accuracy')
# Per-fold accuracies as a plain list (same contents the manual loop built)
accuracy = list(scores)
# Mean accuracy over the 10 folds
print(scores.mean())


0.81875


In [11]:
# Depth 3: 10-fold cross-validation on the training split
# random_state is pinned so the fitted trees (and the scores) are repeatable.
dtc3 = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
scores = cross_val_score(dtc3, x_train, y_train, cv=10, scoring='accuracy')
# Per-fold accuracies as a plain list (same contents the manual loop built)
accuracy = list(scores)
# Mean accuracy over the 10 folds
print(scores.mean())

0.875


In [12]:
# Depth 4: 10-fold cross-validation on the training split
# random_state is pinned so the fitted trees (and the scores) are repeatable.
dtc3 = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=0)
scores = cross_val_score(dtc3, x_train, y_train, cv=10, scoring='accuracy')
# Per-fold accuracies as a plain list (same contents the manual loop built)
accuracy = list(scores)
# Mean accuracy over the 10 folds
print(scores.mean())

0.9875


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text, _tree
from sklearn.metrics import accuracy_score

# Load the data from the project-relative datasets folder (the same location
# the first cell of this notebook reads from) rather than a machine-specific
# absolute path, so the cell runs on any checkout of the project.
data = pd.read_csv('datasets/drug.csv')

# Prepare features and target
X = data[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']]
y = data['Drug']

# Perform one-hot encoding for categorical variables; drop_first removes the
# first level of each variable, leaving one indicator column per remaining level
X = pd.get_dummies(X, columns=['Sex', 'BP', 'Cholesterol'], drop_first=True)

# Split the data (80% train / 20% test, fixed seed for a repeatable split).
# NOTE: this rebinds y, y_train and y_test used by the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

class CustomDecisionTreeClassifier(DecisionTreeClassifier):
    """Decision tree that is post-pruned wherever a node is already pure enough.

    The tree is first grown by ``DecisionTreeClassifier.fit``. Afterwards every
    internal node whose majority-class fraction is at least ``min_purity`` is
    collapsed into a leaf by overwriting the fitted ``tree_`` arrays in place.

    Parameters
    ----------
    criterion : str
        Split-quality criterion forwarded to ``DecisionTreeClassifier``.
    min_purity : float
        Majority-class fraction at or above which an internal node is turned
        into a leaf.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``DecisionTreeClassifier``; pass an int for repeatable
        fits.
    """

    def __init__(self, criterion, min_purity, random_state=None):
        super().__init__(criterion=criterion, random_state=random_state)
        self.min_purity = min_purity

    def fit(self, X, y, sample_weight=None, check_input=True):
        """Grow the tree with the parent ``fit`` and then prune it.

        Parameters
        ----------
        X, y
            Training data, passed unchanged to ``DecisionTreeClassifier.fit``.
        sample_weight, check_input
            Optional arguments passed unchanged to the parent ``fit``.

        Returns
        -------
        self
            The fitted, pruned estimator, so calls can be chained
            (``clf.fit(X, y).predict(X)``) as with other scikit-learn
            estimators.
        """
        super().fit(X, y, sample_weight=sample_weight, check_input=check_input)
        self._prune_tree()
        return self

    def _prune_tree(self):
        """Collapse every sufficiently pure internal node into a leaf.

        Only ``feature``, ``threshold``, ``children_left`` and
        ``children_right`` of the pruned nodes are rewritten. The descendants
        of a pruned node stay in the arrays (merely unreachable), and no other
        attribute of ``tree_`` is updated here.
        """
        if not hasattr(self, 'tree_'):
            # Nothing to prune before the estimator has been fitted.
            return

        tree = self.tree_

        def prune_node(node):
            # A node without a split feature is already a leaf.
            if tree.feature[node] == _tree.TREE_UNDEFINED:
                return

            # Visit both subtrees first (post-order), then decide on this node.
            prune_node(tree.children_left[node])
            prune_node(tree.children_right[node])

            if self._should_prune(node):
                # Mark the node as a leaf: no split feature, no threshold
                # (same -2 sentinel the original code wrote), no children.
                tree.feature[node] = _tree.TREE_UNDEFINED
                tree.threshold[node] = _tree.TREE_UNDEFINED
                tree.children_left[node] = _tree.TREE_LEAF
                tree.children_right[node] = _tree.TREE_LEAF

        prune_node(0)  # Start pruning from the root node

    def _should_prune(self, node):
        """Return True when ``node``'s purity is at least ``self.min_purity``."""
        return self._calculate_purity(node) >= self.min_purity

    def _calculate_purity(self, node):
        """Return the largest entry of ``tree_.value[node]`` divided by its sum.

        Returns 0 when the sum is not positive, which avoids dividing by zero.
        """
        node_value = self.tree_.value[node]
        total_samples = np.sum(node_value)
        if total_samples > 0:
            return np.max(node_value) / total_samples
        return 0

# Build the purity-pruned tree and fit it on the training split
clf = CustomDecisionTreeClassifier(min_purity=0.8, criterion='entropy')
clf.fit(X_train, y_train)

# Score the fitted tree on the held-out split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
print(f"Accuracy: {accuracy:.4f}")

# Show the learned rules as indented text, labelled with the feature names
print("\nDecision Tree:")
feature_names = list(X.columns)
tree_rules = export_text(clf, feature_names=feature_names)
print(tree_rules)

Accuracy: 1.0000

Decision Tree:
|--- Na_to_K <= 14.83
|   |--- BP_NORMAL <= 0.50
|   |   |--- BP_LOW <= 0.50
|   |   |   |--- Age <= 50.50
|   |   |   |   |--- class: drugA
|   |   |   |--- Age >  50.50
|   |   |   |   |--- class: drugB
|   |   |--- BP_LOW >  0.50
|   |   |   |--- Cholesterol_NORMAL <= 0.50
|   |   |   |   |--- class: drugC
|   |   |   |--- Cholesterol_NORMAL >  0.50
|   |   |   |   |--- class: drugX
|   |--- BP_NORMAL >  0.50
|   |   |--- class: drugX
|--- Na_to_K >  14.83
|   |--- class: drugY

