In [11]:
import pandas as pd

class MyDecisionTree:
    """Binary classification tree grown greedily with the Gini impurity criterion.

    The fitted tree is a nested dict. An internal node looks like
    ``{'feature': name, 'value': threshold, 'left': subtree, 'right': subtree}``:
    rows with ``row[feature] < threshold`` go left and all remaining rows go
    right. A leaf looks like ``{'prediction': label}``.
    """

    def __init__(self, max_depth=5):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of splits on any root-to-leaf path
                (default 5). ``None`` disables the depth limit.
        """
        self.max_depth = max_depth
        # Filled in by fit(); predict() refuses to run while this is None.
        self.tree = None

    def cost_function(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``.

        Args:
            y: pandas Series of target values.

        Returns:
            The impurity as a float; ``0.0`` for an empty Series.
        """
        total_samples = len(y)
        if total_samples == 0:
            # An empty node carries no impurity (and no weight in a split).
            return 0.0
        # One counting pass instead of one boolean mask per class.
        proportions = y.value_counts() / total_samples
        return 1 - (proportions ** 2).sum()

    def make_split(self, X, y, feature):
        """Find the threshold on ``feature`` with the lowest weighted Gini impurity.

        Only thresholds that leave at least one sample on each side are
        considered, so a returned threshold always yields two non-empty children.

        Args:
            X: DataFrame of features, index-aligned with ``y``.
            y: Series of target values.
            feature: Column of ``X`` to evaluate.

        Returns:
            ``(best_value, min_gini)``; ``(None, 1)`` when no valid threshold exists.
        """
        column = X[feature]
        total_samples = len(y)
        min_gini = 1
        best_value = None
        # A missing value is never a usable threshold: every comparison with it is False.
        for value in column.dropna().unique():
            left_mask = column < value
            left = y[left_mask]
            # Use the complement so the score matches how fit() and
            # predict_row() route rows that are not strictly below the threshold.
            right = y[~left_mask]
            if len(left) == 0 or len(right) == 0:
                # Degenerate split: it would create an empty child.
                continue
            gini = (len(left) * self.cost_function(left)
                    + len(right) * self.cost_function(right)) / total_samples
            if gini < min_gini:
                min_gini = gini
                best_value = value
        return best_value, min_gini

    def find_best_split(self, X, y):
        """Pick the feature/threshold pair with the lowest weighted Gini impurity.

        Args:
            X: DataFrame of features.
            y: Series of target values.

        Returns:
            ``(best_feature, best_value)``; ``(None, None)`` when no column
            offers a split with two non-empty sides.
        """
        best_feature = None
        best_value = None
        min_gini = 1
        for feature in X.columns:
            value, gini = self.make_split(X, y, feature)
            # Strict '<' keeps the first column on ties.
            if value is not None and gini < min_gini:
                min_gini = gini
                best_feature = feature
                best_value = value
        return best_feature, best_value

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (smallest one on ties)."""
        modes = y.mode()
        if modes.empty:
            # Every label is missing; fall back to counting missing values too.
            modes = y.mode(dropna=False)
        return modes.iloc[0]

    def _grow(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``."""
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(y.unique()) <= 1:
            return {'prediction': self._majority_label(y)}

        feature, value = self.find_best_split(X, y)
        if feature is None:
            # No threshold separates the samples (e.g. all features constant).
            return {'prediction': self._majority_label(y)}

        left_cover = X[feature] < value
        left_tree = self._grow(X[left_cover], y[left_cover], depth + 1)
        right_tree = self._grow(X[~left_cover], y[~left_cover], depth + 1)
        return {'feature': feature, 'value': value,
                'left': left_tree, 'right': right_tree}

    def fit(self, X, y, depth=0):
        """Build the tree, store it on ``self.tree`` and return it.

        Growth stops at ``max_depth``, when a node is pure, or when no split
        with two non-empty sides exists.

        Args:
            X: DataFrame of features, index-aligned with ``y``.
            y: Series of target values.
            depth: Depth assigned to the root node (default 0).

        Returns:
            The fitted tree as a nested dict (also stored as ``self.tree``).

        Raises:
            ValueError: If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty target.")
        self.tree = self._grow(X, y, depth)
        return self.tree

    def predict_row(self, row, tree):
        """Predict the label of one sample by walking ``tree`` down to a leaf.

        Args:
            row: Mapping/Series from feature name to value.
            tree: Nested-dict tree as returned by ``fit``.

        Returns:
            The ``'prediction'`` stored in the leaf that is reached.
        """
        node = tree
        while 'prediction' not in node:
            if row[node['feature']] < node['value']:
                node = node['left']
            else:
                node = node['right']
        return node['prediction']

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: DataFrame of features.

        Returns:
            A Series of predictions in row order with a fresh 0..n-1 index.

        Raises:
            AttributeError: If no tree has been fitted or assigned yet.
        """
        tree = getattr(self, 'tree', None)
        if tree is None:
            raise AttributeError(
                "MyDecisionTree has no fitted tree; call fit() before predict().")
        predictions = [self.predict_row(X.iloc[i], tree) for i in range(len(X))]
        return pd.Series(predictions)

    def score(self, X, y):
        """Return the accuracy of the predictions for ``X`` against ``y``.

        Labels are matched to predictions by position, so ``y`` may carry any
        index (for example a slice of the original frame).

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        predictions = self.predict(X)
        labels = pd.Series(y).reset_index(drop=True)
        if len(labels) != len(predictions):
            raise ValueError(
                "X and y must have the same number of samples "
                "(got %d and %d)." % (len(predictions), len(labels)))
        return (predictions == labels).mean()


In [12]:
import pandas as pd

def read_csv_to_dataframe(file_path):
    """Read a CSV file into a DataFrame, reporting read failures instead of raising.

    Args:
        file_path: Path (or any other input accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The loaded DataFrame, or ``None`` when the file cannot be opened,
        decoded or parsed (the error is printed).
    """
    try:
        return pd.read_csv(file_path)
    # Only swallow failures of the read itself; anything else (e.g. a wrong
    # argument type) is a programming error and should surface to the caller.
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print("An error occurred:", e)
        return None

In [13]:
# Location of the thyroid dataset on the author's machine.
THYROID_CSV_PATH = '/Users/nalinarora/Desktop/ML Assignments/2021478_HW2/Thyroid data - Sheet1.csv'
df = read_csv_to_dataframe(THYROID_CSV_PATH)

In [14]:
# Display the column names of the loaded frame; 'label' (the last column) is the target.
df.columns


Index(['age', 'sex', 'on thyroxine', 'query on thyroxine',
       'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery',
       'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH',
       'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
       'FTI measured', 'FTI', 'TBG measured', 'TBG', 'referral source',
       'label'],
      dtype='object')

In [21]:
from sklearn.model_selection import train_test_split
# Features are every column except the last; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Positional 80/20 split: the first 80% of rows train, the remainder tests.
# NOTE(review): rows are not shuffled — confirm the CSV is not ordered by label.
num_train_samples = int(0.8 * len(df))

X_train = X.iloc[:num_train_samples]
y_train = y.iloc[:num_train_samples]
X_test = X.iloc[num_train_samples:]
y_test = y.iloc[num_train_samples:]

model = MyDecisionTree(max_depth=5)

# fit() returns the nested-dict tree; predict() reads it from model.tree.
model.tree = model.fit(X_train, y_train)

predictions = model.predict(X_test)
predictions_train = model.predict(X_train)

# predict() returns a Series with a fresh 0..n-1 index, so the targets are
# re-indexed the same way before the element-wise comparison.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Accuracy is taken from the predictions computed above rather than through
# model.score(), which would repeat the row-by-row prediction pass.
accuracy_train = (predictions_train == y_train).mean()
accuracy = (predictions == y_test).mean()

print('Test Accuracy:' +str(accuracy*100))
print('Train Accuracy:'+ str(accuracy_train*100))









Test Accuracy:98.21428571428571
Train Accuracy:99.41964285714285
