# imports

In [83]:
import os
import pandas as pd
from sklearn.impute import KNNImputer
import math
from graphviz import Digraph

## Project_path

In [2]:
project_path = '/content/drive/My Drive/Colab Notebooks/CarClassification'
print("Project directory:", project_path)

Project directory: /content/drive/My Drive/Colab Notebooks/CarClassification


# Load Datasets

In [3]:
dataset = pd.read_csv(f'{project_path}/datasets/original.train_ds.csv')
test = pd.read_csv(f'{project_path}/datasets/original.test_ds.csv')

# Pre Process

## Drop Nulls

In [4]:
def drop_null(dataset) -> pd.DataFrame:
  cleaned_dataset = dataset.dropna()
  return cleaned_dataset

## Fill Numeric Values

In [5]:
def fill_numeric(dataset, method='mean', custom_val=0) -> pd.DataFrame:
  # Impute missing values in numerical columns with the mean
  numeric_dataset = dataset.select_dtypes(include=['number'])
  if(method == 'mean'):
    numeric_dataset.fillna(numeric_dataset.mean(), inplace=True)
  elif(method == 'median'):
    numeric_dataset.fillna(numeric_dataset.median(), inplace=True)
  elif(method == 'mode'):
     numeric_dataset.fillna(numeric_dataset.mode()[0], inplace=True)
  elif(method == 'custom_val'):
    numeric_dataset.fillna(value=custom_val, inplace=True)
  elif(method == 'interpolate_linear'):
    numeric_dataset.interpolate(method='linear', inplace=True)
  elif(method == 'interpolate_time'):
    numeric_dataset.interpolate(method='time', inplace=True)
  elif(method == 'KNN'):
    imputer = KNNImputer(n_neighbors=custom_val)
    data_imputed = imputer.fit_transform(numeric_dataset)
    nemeric_dataset = pd.DataFrame(data_imputed, columns=numeric_dataset.columns)
  else:
    raise Exception('wrong method')
  return numeric_dataset

## Fill Categorical

In [6]:
def fill_categorical(dataset) -> pd.DataFrame:
  categorical_dataset = dataset.select_dtypes(include=['object'])
  for col in categorical_dataset.columns:
      mode_value = categorical_dataset[col].mode()[0]
      categorical_dataset[col].fillna(mode_value, inplace=True)
  return categorical_dataset

## FFILL & BFILL

In [7]:
def forward_fulfill(dataset) -> pd.DataFrame:
  return dataset.fillna(method='ffill')

def backward_fulfill(dataset) -> pd.DataFrame:
  return dataset.fillna(method='bfill')

In [8]:
cleaned_dataset = drop_null(dataset)

# Decision Tree

In [107]:
class Node:
  def __init__(self, data:pd.DataFrame, features: pd.DataFrame.columns, label, value = None) -> None:
     self.data = data
     self.features = features
     self.label = label
     self.value = value

In [157]:
class Tree:
  def __init__(self, root:Node, depth=0, split_method='entropy') -> None:
     self.root = root
     self.depth = depth
     self.split_method = split_method
     self.branches = {}


  def build(self) -> 'Tree':
    features_entropy = {key: 0 for key in self.root.features}
    if len(self.root.features) == 1 or len(self.root.data) == 1:
      print(self.branches)
      self.root.value = self.root.data['model'].value_counts().idxmax()

      # self.root.value = self.root.data
    else:
      for feature in self.root.features:
        if(feature == 'model'):
          feature_entropy = 10
        else:
          feature_entropy = self.entropy(self.root.data[feature].to_list())
        features_entropy[feature] = feature_entropy
      selected_feature = min(features_entropy, key=features_entropy.get)
      if (len(self.root.features)) > 1:
        self.branches = self.split(self.root.data, selected_feature)
      else:
        self.branches = {}
        self.root.features = None
      for branch in self.branches:
        self.branches[branch].build()

  def split(self, data, feature) -> tuple:
    branches = {}
    if(data[feature].dtype == 'object'):
      branches = self.split_catgorical(data, feature)
    elif(self.is_discrete(data[feature])):
      branches = self.split_catgorical(data, feature)
    else:
      branches = self.split_numeric(data, feature)
    return branches


  def split_numeric(self, data, feature):
    threshold = self.calculate_threshold(data[feature])
    branches = {}
    left_data = data[data[feature] <= threshold].drop(feature, axis=1)
    left_node = Node(left_data, left_data.columns, label=f'{feature}.{threshold}.left')
    left_tree = Tree(left_node, depth=self.depth+1, split_method=self.split_method)
    branches[f'{feature}.{threshold}.left'] = left_tree

    right_data = data[data[feature] > threshold].drop(feature, axis=1)
    right_node = Node(right_data, right_data.columns, label=f'{feature}.{threshold}.right')
    right_tree = Tree(right_node, depth=self.depth+1, split_method=self.split_method)
    branches[f'{feature}.{threshold}.right'] = right_tree
    return branches

  def split_catgorical(self, data, feature):
    branches = {}
    for data_point in set(data[feature]):
      new_data = data[data[feature] == data_point].drop(feature, axis=1)
      new_root = Node(new_data, new_data.columns, label=data_point)
      branches[data_point] = Tree(new_root, depth=self.depth+1, split_method=self.split_method)
    return branches

  def calculate_threshold(self, labels:pd.DataFrame.columns):
    threshold = labels.mean()
    return threshold

  def is_discrete(self, labels:pd.DataFrame.columns):
    differences = labels.diff()
    if(labels.dtype == 'int64'):
      largest_divisor = math.gcd(*differences)
      num_unique = labels.nunique()
      if(largest_divisor != None and num_unique < 10):
        return True
    return False

  def entropy(self, labels) -> float:
    labels_count = {key: 0 for key in labels}
    for feature in labels:
      labels_count[feature] = labels_count[feature] + 1
    entropy_value = 0
    for feature in labels_count:
      probability = labels_count[feature] / len(labels)
      log2 = math.log2(probability)
      entropy_value -= probability * math.log2(probability)
    try:
      entropy_value /= math.log2(len(labels_count))

    except Exception as e:
      if(str(e) == 'float division by zero'):
        return 5

    return entropy_value


  def information_gain(self) -> float:
    pass

  def visualize(self):
    dot = Digraph()
    print(self.root.features)
    self._visualize_helper(dot, self.root.label)
    dot.render(f'{project_path}/results/custom_tree', format='png', cleanup=True)

  def _visualize_helper(self, dot, parent_label):
    # if parent_label is None:
    #     parent_label = str(self.data)
    # possible_label = self.root.value if not self.root.value.empty else self.root.features
    if self.root.value is not None and not self.root.value.empty:
      possible_label = self.root.value
    else:
      possible_label = self.root.features
    dot.node(parent_label, label=str(self.root.label))
    for branch in self.branches:
      # child_label = branch
      self.branches[branch]._visualize_helper(dot, branch)
      dot.edge(parent_label, branch)
    # for i, child in enumerate(self.children):
    #     child_label = f"{parent_label}_{i}"
    #     child._visualize_helper(dot, parent_label=child_label)
    #     dot.edge(parent_label, child_label)





In [158]:
node = Node(cleaned_dataset ,cleaned_dataset.columns, 'root')
tree = Tree(node)
tree.build()
tree.visualize()

{}
 3 Series
**
{}
 Yaris
**
{}
 Yaris
**
{}
 C Class
**
{}
 C Class
**
{}
 3 Series
**
{}
 3 Series
**
{}
 Yaris
**
{}
 Yaris
**
{}
 Q5
**
{}
 Golf
**
{}
 Golf
**
{}
 E Class
**
{}
 Polo
**
{}
 Up
**
{}
 Polo
**
{}
 Polo
**
{}
 Polo
**
{}
 Polo
**
{}
 Polo
**
{}
 Polo
**
{}
 Up
**
{}
 Up
**
{}
 Up
**
{}
 Up
**
{}
 Up
**
{}
 Up
**
{}
 Fabia
**
{}
 Fabia
**
{}
 Aygo
**
{}
 Aygo
**
{}
 Aygo
**
{}


ValueError: attempt to get argmax of an empty sequence

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
import math

class Node:
    def __init__(self, feature=None, threshold=None, value=None, true_branch=None, false_branch=None):
        self.feature = feature  # Feature to split on
        self.threshold = threshold  # Threshold for continuous features
        self.value = value  # Value if the node is a leaf node
        self.true_branch = true_branch  # True branch (left)
        self.false_branch = false_branch  # False branch (right)

def entropy(labels):
    label_counts = Counter(labels)
    num_instances = len(labels)
    entropy_value = 0
    for count in label_counts.values():
        probability = count / num_instances
        entropy_value -= probability * math.log2(probability)
    return entropy_value

def gain_information(data, feature, threshold):
    true_labels = data[data[feature] <= threshold]['label']
    false_labels = data[data[feature] > threshold]['label']
    true_entropy = entropy(true_labels)
    false_entropy = entropy(false_labels)
    true_weight = len(true_labels) / len(data)
    false_weight = len(false_labels) / len(data)
    gain = entropy(data['label']) - (true_weight * true_entropy + false_weight * false_entropy)
    return gain

def find_best_split(data, features):
    best_gain = 0
    best_feature = None
    best_threshold = None
    for feature in features:
        if data[feature].dtype == 'O':  # Categorical feature
            categories = data[feature].unique()
            for category in categories:
                gain = gain_information(data, feature, category)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = category
        else:  # Continuous feature
            thresholds = sorted(data[feature].unique())
            for i in range(len(thresholds) - 1):
                threshold = (thresholds[i] + thresholds[i+1]) / 2
                gain = gain_information(data, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
    return best_feature, best_threshold

def build_tree(data, features):
    # If all labels are the same, return a leaf node
    if len(set(data['label'])) == 1:
        return Node(value=data['label'].iloc[0])

    # If there are no features left to split, return the majority class
    if len(features) == 0:
        majority_class = Counter(data['label']).most_common(1)[0][0]
        return Node(value=majority_class)

    # Find the best feature and threshold to split on
    best_feature, best_threshold = find_best_split(data, features)

    # Split the data based on the best feature and threshold
    if data[best_feature].dtype == 'O':  # Categorical feature
        true_data = data[data[best_feature] == best_threshold]
        false_data = data[data[best_feature] != best_threshold]
    else:  # Continuous feature
        true_data = data[data[best_feature] <= best_threshold]
        false_data = data[data[best_feature] > best_threshold]

    # Recursively build the true and false branches
    true_branch = build_tree(true_data, features)
    false_branch = build_tree(false_data, features)

    # Return a node with the best feature, threshold, and branches
    return Node(feature=best_feature, threshold=best_threshold, true_branch=true_branch, false_branch=false_branch)

# Example usage:
# Assuming you have a DataFrame called 'df' with features and labels
# features = df.columns[:-1]
# root = build_tree(df, features)



import pandas as pd
import numpy as np

# Generate synthetic data
np.random.seed(0)
data = pd.DataFrame({
    'feature1': np.random.randn(100),
    'feature2': np.random.randn(100),
    'label': np.random.choice([0, 1], size=100)
})

# # Display the first few rows of the dataset
# print("Sample Data:")
# print(data.head())

# Define the features
features = ['feature1', 'feature2']

# Build the decision tree
# print(type(data), data)
# print(type(features), features)
# print(type(cleaned_dataset), cleaned_dataset)
# print(type(cleaned_dataset.columns.to_list()), cleaned_dataset.columns.to_list())
# sfsf
print(type(data), type(features))
root = build_tree(data, features)

# Function to print the decision tree
def print_tree(node, depth=0):
    if node.value is not None:
        print(f"{'  ' * depth}Leaf Node: Predicted Value = {node.value}")
    else:
        print(f"{'  ' * depth}Split on {node.feature} <= {node.threshold}")
        print_tree(node.true_branch, depth + 1)
        print_tree(node.false_branch, depth + 1)

# Print the decision tree
print("\nDecision Tree:")
print_tree(root)


<class 'pandas.core.frame.DataFrame'> <class 'list'>

Decision Tree:
Split on feature2 <= 0.653663096952271
  Split on feature1 <= 1.190579709972031
    Split on feature2 <= -1.304382160117485
      Split on feature1 <= 0.43146973194649874
        Leaf Node: Predicted Value = 1
        Split on feature1 <= 0.6818043712399806
          Leaf Node: Predicted Value = 0
          Leaf Node: Predicted Value = 1
      Split on feature2 <= -1.1259259959197765
        Leaf Node: Predicted Value = 0
        Split on feature2 <= -1.09623114965735
          Leaf Node: Predicted Value = 1
          Split on feature2 <= -0.9565187864168616
            Leaf Node: Predicted Value = 0
            Split on feature1 <= 0.04885195654879251
              Split on feature1 <= -0.1925415497890759
                Split on feature2 <= -0.8149241014164277
                  Leaf Node: Predicted Value = 1
                  Split on feature1 <= -2.266893142029003
                    Leaf Node: Predicted Value = 1
