<a href="https://colab.research.google.com/github/Mercykiminza/AI/blob/main/Decision_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Datasets/train.csv')

# Fill missing values in 'Age' with the median, and drop rows with missing 'Embarked'
data['Age'].fillna(data['Age'].median(), inplace=True)
data.dropna(subset=['Embarked'], inplace=True)

# Fill missing 'Fare' (if any) with the median value
data['Fare'].fillna(data['Fare'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Fare'].fillna(data['Fare'].median(), inplace=True)


**Convert categorical variables to numerical**

In [3]:
data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)


**Select relevant features**

In [4]:
X = data[['Pclass', 'Sex_male', 'Age', 'SibSp', 'Parch', 'Fare']]
y = data['Survived']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Implementing the Decision Tree Algorithm**

In [5]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def predict(self, X):
        return [self._predict_single(sample, self.tree) for _, sample in X.iterrows()]

    def _calculate_entropy(self, y):
        proportions = np.bincount(y) / len(y)
        return -np.sum([p * np.log2(p) for p in proportions if p > 0])

    def _split_data(self, X, y, feature, threshold):
        left_mask = X[feature] <= threshold
        right_mask = X[feature] > threshold
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _calculate_information_gain(self, parent, left_child, right_child):
        p_left = len(left_child) / len(parent)
        p_right = len(right_child) / len(parent)
        gain = self._calculate_entropy(parent) - (p_left * self._calculate_entropy(left_child) + p_right * self._calculate_entropy(right_child))
        return gain

    def _find_best_split(self, X, y):
        best_gain = -1
        best_split = None
        n_features = X.shape[1]
        for feature in X.columns:
            thresholds = X[feature].unique()
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split_data(X, y, feature, threshold)
                if len(y_left) > 0 and len(y_right) > 0:
                    gain = self._calculate_information_gain(y, y_left, y_right)
                    if gain > best_gain:
                        best_gain = gain
                        best_split = {'feature': feature, 'threshold': threshold, 'left': (X_left, y_left), 'right': (X_right, y_right)}
        return best_split

    def _build_tree(self, X, y, depth):
        if len(set(y)) == 1:
            return y.iloc[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return y.mode()[0]
        split = self._find_best_split(X, y)
        if split is None:
            return y.mode()[0]
        left_subtree = self._build_tree(split['left'][0], split['left'][1], depth + 1)
        right_subtree = self._build_tree(split['right'][0], split['right'][1], depth + 1)
        return {'feature': split['feature'], 'threshold': split['threshold'], 'left': left_subtree, 'right': right_subtree}

    def _predict_single(self, sample, tree):
        if not isinstance(tree, dict):
            return tree
        if sample[tree['feature']] <= tree['threshold']:
            return self._predict_single(sample, tree['left'])
        else:
            return self._predict_single(sample, tree['right'])


**Training and Testing**

In [7]:
# Initialize the DecisionTreeClassifier and train
clf = DecisionTreeClassifier(max_depth=3)  # You can experiment with max_depth
clf.fit(X_train, y_train)

# Predict on test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 82.02%



**Visualization (Bonus)**

In [9]:
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print(f"{'  '*depth}Feature: {node['feature']} <= {node['threshold']}")
        print(f"{'  '*depth}Left:")
        print_tree(node['left'], depth + 1)
        print(f"{'  '*depth}Right:")
        print_tree(node['right'], depth + 1)
    else:
        print(f"{'  '*depth}Leaf: {node}")

# Print the trained decision tree
print_tree(clf.tree)


Feature: Sex_male <= False
Left:
  Feature: Pclass <= 2
  Left:
    Feature: Fare <= 28.7125
    Left:
      Leaf: 1
    Right:
      Leaf: 1
  Right:
    Feature: Fare <= 23.25
    Left:
      Leaf: 1
    Right:
      Leaf: 0
Right:
  Feature: Fare <= 26.25
  Left:
    Feature: Age <= 12.0
    Left:
      Leaf: 1
    Right:
      Leaf: 0
  Right:
    Feature: SibSp <= 2
    Left:
      Leaf: 0
    Right:
      Leaf: 0
