# imports

In [178]:
import os
import pandas as pd
from sklearn.impute import KNNImputer
import math
import numpy as np
from graphviz import Digraph
import random

## Project_path

In [3]:
project_path = '/content/drive/My Drive/Colab Notebooks/CarClassification'
print("Project directory:", project_path)

Project directory: /content/drive/My Drive/Colab Notebooks/CarClassification


# Load Datasets

In [38]:
dataset = pd.read_csv(f'{project_path}/datasets/original.train_ds.csv')
test = pd.read_csv(f'{project_path}/datasets/original.test_ds.csv')

# for col in dataset.select_dtypes(include='float'):
#     if (dataset[col] % 1 == 0).all():  # Check if all values are integers
#         print('here')
#         dataset[col] = dataset[col].astype(int)

# print(dataset)
# for col in dataset.select_dtypes(include='float'):
#     non_zero_decimal = dataset[col] % 1 != 0
#     if non_zero_decimal.any():
#         print(f"Column '{col}' has non-integer values:")
#         print(dataset.loc[non_zero_decimal, col])

# print(dataset.dtypes)

# Pre Process

## Drop Nulls

In [5]:
def drop_null(dataset) -> pd.DataFrame:
  cleaned_dataset = dataset.dropna()
  return cleaned_dataset

## Fill Numeric Values

In [6]:
def fill_numeric(dataset, method='mean', custom_val=0) -> pd.DataFrame:
  # Impute missing values in numerical columns with the mean
  numeric_dataset = dataset.select_dtypes(include=['number'])
  if(method == 'mean'):
    numeric_dataset.fillna(numeric_dataset.mean(), inplace=True)
  elif(method == 'median'):
    numeric_dataset.fillna(numeric_dataset.median(), inplace=True)
  elif(method == 'mode'):
     numeric_dataset.fillna(numeric_dataset.mode()[0], inplace=True)
  elif(method == 'custom_val'):
    numeric_dataset.fillna(value=custom_val, inplace=True)
  elif(method == 'interpolate_linear'):
    numeric_dataset.interpolate(method='linear', inplace=True)
  elif(method == 'interpolate_time'):
    numeric_dataset.interpolate(method='time', inplace=True)
  elif(method == 'KNN'):
    imputer = KNNImputer(n_neighbors=custom_val)
    data_imputed = imputer.fit_transform(numeric_dataset)
    nemeric_dataset = pd.DataFrame(data_imputed, columns=numeric_dataset.columns)
  else:
    raise Exception('wrong method')
  return numeric_dataset

## Fill Categorical

In [7]:
def fill_categorical(dataset) -> pd.DataFrame:
  categorical_dataset = dataset.select_dtypes(include=['object'])
  for col in categorical_dataset.columns:
      mode_value = categorical_dataset[col].mode()[0]
      categorical_dataset[col].fillna(mode_value, inplace=True)
  return categorical_dataset

## FFILL & BFILL

In [8]:
def forward_fulfill(dataset) -> pd.DataFrame:
  return dataset.fillna(method='ffill')

def backward_fulfill(dataset) -> pd.DataFrame:
  return dataset.fillna(method='bfill')

In [40]:
# cleaned_dataset =  pd.concat([fill_categorical(dataset), fill_numeric(dataset, method='interpolate_linear')], axis=1)
cleaned_dataset = drop_null(dataset)
for col in cleaned_dataset.select_dtypes(include='float'):
    non_zero_decimal = cleaned_dataset[col] % 1 != 0
    if non_zero_decimal.any():
        print(f"Column '{col}' has non-integer values:")
        print(cleaned_dataset.loc[non_zero_decimal, col])

for col in cleaned_dataset.select_dtypes(include='float'):
    if (cleaned_dataset[col] % 1 == 0).all():  # Check if all values are integers
        print('here')
        cleaned_dataset[col] = cleaned_dataset[col].astype(int)
print(cleaned_dataset.dtypes)
# cleaned_test = pd.concat([fill_categorical(test), fill_numeric(test)], axis=1)
# cleaned_test = test


cleaned_test = drop_null(test)

Column 'mpg' has non-integer values:
1       64.2
7       58.9
11      42.2
12      51.4
13      47.1
        ... 
2379    48.7
2383    51.4
2386    44.8
2393    64.2
2396    64.2
Name: mpg, Length: 937, dtype: float64
Column 'engineSize' has non-integer values:
7       1.5
11      1.6
13      1.4
16      1.7
17      1.4
       ... 
2366    1.3
2372    1.5
2379    1.2
2383    1.4
2386    1.5
Name: engineSize, Length: 434, dtype: float64
here
here
here
here
model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
Manufacturer     object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dataset[col] = cleaned_dataset[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dataset[col] = cleaned_dataset[col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dataset[col] = cleaned_dataset[col].astype(int)
A value is trying to be set on a copy

# Decision Tree

In [81]:
class Node:
  def __init__(self, data:pd.DataFrame, features: pd.DataFrame.columns, label, discerete_features={}, value = None) -> None:
     self.data = data
     self.features = features
     self.label = label
     self.value = value
     self.discerete_features = discerete_features

In [188]:
def is_discrete(labels:pd.DataFrame.columns):
  if(labels.dtype == 'int64'):
    num_unique = labels.nunique()
    if(num_unique < 20):
      return True
  return False

def random_outcome(labels):
  model_counts = labels.value_counts()
  total_count = model_counts.sum()
  probabilities = model_counts / total_count
  outcomes = list(zip(probabilities.index, probabilities))
  random_number = random.random()

  cumulative_probability = 0
  for outcome, probability in outcomes:
    cumulative_probability += probability
    if random_number < cumulative_probability:
        return outcome
        break

In [201]:
discrete_features = {feature: False for feature in cleaned_dataset.columns}
discrete_features['year'] = True
# discrete_features = {feature: is_discrete(cleaned_dataset[feature]) for feature in cleaned_dataset.columns}
print(discrete_features)

{'model': False, 'year': True, 'price': False, 'transmission': False, 'mileage': False, 'fuelType': False, 'tax': False, 'mpg': False, 'engineSize': False, 'Manufacturer': False}


In [202]:
class Tree:
  def __init__(self, root:Node, depth=0, split_method='entropy') -> None:
     self.root = root
     self.depth = depth
     self.split_method = split_method
     self.branches = {}
     self.features_ig = {key: 0 for key in self.root.features}

  def build(self) -> 'Tree':
    features_entropy = {key: 0 for key in self.root.features}
    features_ig = {key: 0 for key in self.root.features}

    if len(self.root.features) == 1 and len(self.root.data) > 1:
      self.root.value = random_outcome(self.root.data['model'])
      # self.root.value = self.root.data['model'].value_counts().idxmax()
    elif len(self.root.data) == 1:
      self.root.value = self.root.data['model'].value_counts().idxmax()
    else:
      for feature in self.root.features:
        if(feature == 'model'):
          feature_entropy = 10
        else:
          feature_entropy = self.entropy(self.root.data[feature].to_list())
        features_entropy[feature] = feature_entropy

      selected_feature = min(features_entropy, key=features_entropy.get)
      self.branches = self.split(self.root.data, selected_feature)
      for branch in self.branches:
        self.branches[branch].build()

  def split(self, data, feature) -> tuple:
    branches = {}
    if(data[feature].dtype == 'object'):
      branches = self.split_categorical(data, feature)
    elif(self.root.discerete_features[feature]):
      branches = self.split_categorical(data, feature)
    else:
      branches = self.split_numeric(data, feature)
    return branches


  def split_numeric(self, data, feature):
    threshold = data[feature].mean()
    branches = {}
    left_data = data[data[feature] <= threshold].drop(feature, axis=1)
    left_node = Node(left_data, left_data.columns, label=f'{feature}.{threshold}.left', discerete_features=self.root.discerete_features)
    left_tree = Tree(left_node, depth=self.depth+1, split_method=self.split_method)
    branches[f'{feature}*{threshold}*left'] = left_tree

    right_data = data[data[feature] > threshold].drop(feature, axis=1)
    right_node = Node(right_data, right_data.columns, label=f'{feature}.{threshold}.right', discerete_features=self.root.discerete_features)
    right_tree = Tree(right_node, depth=self.depth+1, split_method=self.split_method)
    branches[f'{feature}*{threshold}*right'] = right_tree
    return branches

  def split_categorical(self, data, feature):
    branches = {}
    for data_point in set(data[feature]):
      new_data = data[data[feature] == data_point].drop(feature, axis=1)
      new_root = Node(new_data, new_data.columns, label=data_point, discerete_features=self.root.discerete_features)
      branches[f'{feature}*{data_point}'] = Tree(new_root, depth=self.depth+1, split_method=self.split_method)
    return branches

  def entropy(self, labels) -> float:
    labels_count = {key: 0 for key in labels}
    for feature in labels:
      labels_count[feature] = labels_count[feature] + 1
    entropy_value = 0
    for feature in labels_count:
      probability = labels_count[feature] / len(labels)
      log2 = math.log2(probability)
      entropy_value -= probability * math.log2(probability)
    try:
      entropy_value /= math.log2(len(labels_count))

    except Exception as e:
      if(str(e) == 'float division by zero'):
        return 5

    return entropy_value


  def visualize(self):
    dot = Digraph()
    # print(self.root.features)
    self._visualize_helper(dot, self.root.label)
    dot.render(f'{project_path}/results/custom_tree', format='png', cleanup=True)

  def _visualize_helper(self, dot, parent_label):
    print(self.root.value)
    if self.root.value is not None :
      possible_label = self.root.value
    else:
      possible_label = self.root.features
    dot.node(parent_label, label=str(self.root.label))
    for branch in self.branches:
      self.branches[branch]._visualize_helper(dot, branch)
      dot.edge(parent_label, branch)


  def predict(self, data_point: pd.DataFrame):
    if self.root.value is not None:
      return self.root.value
    else:
      for branch in self.branches:
        if branch.endswith('left'):
          branch_parts = branch.split('*')
          feature = branch_parts[0]
          threshold = float(branch_parts[1])
          if(data_point.iloc[0][feature] <= threshold):
            # print(data_point)
            # print(branch)
            new_data_point = data_point.drop(feature, axis=1)
            return self.branches[branch].predict(new_data_point)

        elif branch.endswith('right'):
          branch_parts = branch.split('*')
          feature = branch_parts[0]
          threshold = float(branch_parts[1])
          if(data_point.iloc[0][feature] > threshold):
            # print(data_point)
            # print(branch)
            new_data_point = data_point.drop(feature, axis=1)
            return self.branches[branch].predict(new_data_point)
        else:
          branch_parts = branch.split('*')
          feature = branch_parts[0]
          feature_value = branch_parts[1]
          if str(data_point.iloc[0][feature]) == str(feature_value):
            # print(data_point)
            # print(branch)
            new_data_point = data_point.drop(feature, axis=1)
            return self.branches[branch].predict(new_data_point)


In [203]:

node = Node(cleaned_dataset ,cleaned_dataset.columns, 'root', discrete_features)
tree = Tree(node)
# tree.calculate_ig()
tree.build()



In [204]:
test_pd = cleaned_test.iloc[[1]].copy()
# test_pd = cleaned_dataset.iloc[[2]].copy()
print(test_pd)
result = tree.predict(test_pd)
print(result, test_pd.iloc[0]['model'])

       model  year  price transmission  mileage fuelType  tax   mpg  \
1   2 Series  2019  24590    Semi-Auto     3300   Diesel  145  48.7   

   engineSize Manufacturer  
1         2.0          BMW  
 4 Series  2 Series


In [205]:
count = 0
for i in range(len(cleaned_dataset)):
  test_pd = cleaned_dataset.iloc[[i]].copy()
  # test_pd = cleaned_dataset.iloc[[2]].copy()
  # print(test_pd)
  result = tree.predict(test_pd)
  # print(result, test_pd.iloc[0]['model'])
  if result == test_pd.iloc[0]['model']:
    count+=1
print(count/ len(cleaned_dataset))
# print(count)

0.9676409185803758


In [206]:
count = 0
for i in range(len(cleaned_test)):
  test_pd = cleaned_test.iloc[[i]].copy()
  # test_pd = cleaned_dataset.iloc[[2]].copy()
  # print(test_pd)
  result = tree.predict(test_pd)
  # print(result, test_pd.iloc[0]['model'])
  if result == test_pd.iloc[0]['model']:
    count+=1
print(count/ len(cleaned_test))

0.4716666666666667


In [None]:
tree.visualize()

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
import math

class Node:
    def __init__(self, feature=None, threshold=None, value=None, true_branch=None, false_branch=None):
        self.feature = feature  # Feature to split on
        self.threshold = threshold  # Threshold for continuous features
        self.value = value  # Value if the node is a leaf node
        self.true_branch = true_branch  # True branch (left)
        self.false_branch = false_branch  # False branch (right)

def entropy(labels):
    label_counts = Counter(labels)
    num_instances = len(labels)
    entropy_value = 0
    for count in label_counts.values():
        probability = count / num_instances
        entropy_value -= probability * math.log2(probability)
    return entropy_value

def gain_information(data, feature, threshold):
    true_labels = data[data[feature] <= threshold]['label']
    false_labels = data[data[feature] > threshold]['label']
    true_entropy = entropy(true_labels)
    false_entropy = entropy(false_labels)
    true_weight = len(true_labels) / len(data)
    false_weight = len(false_labels) / len(data)
    gain = entropy(data['label']) - (true_weight * true_entropy + false_weight * false_entropy)
    return gain

def find_best_split(data, features):
    best_gain = 0
    best_feature = None
    best_threshold = None
    for feature in features:
        if data[feature].dtype == 'O':  # Categorical feature
            categories = data[feature].unique()
            for category in categories:
                gain = gain_information(data, feature, category)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = category
        else:  # Continuous feature
            thresholds = sorted(data[feature].unique())
            for i in range(len(thresholds) - 1):
                threshold = (thresholds[i] + thresholds[i+1]) / 2
                gain = gain_information(data, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
    return best_feature, best_threshold

def build_tree(data, features):
    # If all labels are the same, return a leaf node
    if len(set(data['label'])) == 1:
        return Node(value=data['label'].iloc[0])

    # If there are no features left to split, return the majority class
    if len(features) == 0:
        majority_class = Counter(data['label']).most_common(1)[0][0]
        return Node(value=majority_class)

    # Find the best feature and threshold to split on
    best_feature, best_threshold = find_best_split(data, features)

    # Split the data based on the best feature and threshold
    if data[best_feature].dtype == 'O':  # Categorical feature
        true_data = data[data[best_feature] == best_threshold]
        false_data = data[data[best_feature] != best_threshold]
    else:  # Continuous feature
        true_data = data[data[best_feature] <= best_threshold]
        false_data = data[data[best_feature] > best_threshold]

    # Recursively build the true and false branches
    true_branch = build_tree(true_data, features)
    false_branch = build_tree(false_data, features)

    # Return a node with the best feature, threshold, and branches
    return Node(feature=best_feature, threshold=best_threshold, true_branch=true_branch, false_branch=false_branch)

# Example usage:
# Assuming you have a DataFrame called 'df' with features and labels
# features = df.columns[:-1]
# root = build_tree(df, features)



import pandas as pd
import numpy as np

# Generate synthetic data
np.random.seed(0)
data = pd.DataFrame({
    'feature1': np.random.randn(100),
    'feature2': np.random.randn(100),
    'label': np.random.choice([0, 1], size=100)
})

# # Display the first few rows of the dataset
# print("Sample Data:")
# print(data.head())

# Define the features
features = ['feature1', 'feature2']

# Build the decision tree
# print(type(data), data)
# print(type(features), features)
# print(type(cleaned_dataset), cleaned_dataset)
# print(type(cleaned_dataset.columns.to_list()), cleaned_dataset.columns.to_list())
# sfsf
print(type(data), type(features))
root = build_tree(data, features)

# Function to print the decision tree
def print_tree(node, depth=0):
    if node.value is not None:
        print(f"{'  ' * depth}Leaf Node: Predicted Value = {node.value}")
    else:
        print(f"{'  ' * depth}Split on {node.feature} <= {node.threshold}")
        print_tree(node.true_branch, depth + 1)
        print_tree(node.false_branch, depth + 1)

# Print the decision tree
print("\nDecision Tree:")
print_tree(root)
