## Assignment 3 (5th August, 2023)

### Task: 
#### Take any dataset (numerical or categorical). Apply basic generic decision tree algorithm, apply any two spliting criteria and show the difference in evaluation metrics.

In [16]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
data = pd.read_csv("Customers.csv")
data.describe()

Unnamed: 0,CustomerID,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,48.96,110731.8215,50.9625,4.1025,3.7685
std,577.494589,28.429747,45739.536688,27.934661,3.922204,1.970749
min,1.0,0.0,0.0,0.0,0.0,1.0
25%,500.75,25.0,74572.0,28.0,1.0,2.0
50%,1000.5,48.0,110045.0,50.0,3.0,4.0
75%,1500.25,73.0,149092.75,75.0,7.0,5.0
max,2000.0,99.0,189974.0,100.0,17.0,9.0


In [18]:
data.isnull().sum()

CustomerID                 0
Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                35
Work Experience            0
Family Size                0
dtype: int64

In [19]:
def gini_impurity(labels):
    total_samples = len(labels)
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    probabilities = label_counts / total_samples
    gini = 1 - sum(probabilities ** 2)
    return gini

In [20]:
def entropy(labels):
    total_samples = len(labels)
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    probabilities = label_counts / total_samples
    entropy = -sum(probabilities * np.log2(probabilities))
    return entropy

In [21]:
def split_data(data, feature, threshold):
    left_data = data[data[feature] <= threshold]
    right_data = data[data[feature] > threshold]
    return left_data, right_data

In [22]:
def calculate_impurity(data, criterion):
    labels = data['Spending Score (1-100)']  # Replace 'Target' with your actual target column name
    if criterion == 'gini':
        impurity = gini_impurity(labels)
    elif criterion == 'entropy':
        impurity = entropy(labels)
    return impurity

In [23]:
def find_best_split(data, criterion):
    best_split = {'feature': None, 'threshold': None, 'impurity': float('inf')}
    
    for feature in data.columns:
        if feature == 'Spending Score (1-100)':  # Skip the target column
            continue
        
        for threshold in data[feature].unique():
            left_data, right_data = split_data(data, feature, threshold)
            total_impurity = (len(left_data) / len(data)) * calculate_impurity(left_data, criterion) + \
                             (len(right_data) / len(data)) * calculate_impurity(right_data, criterion)
            
            if total_impurity < best_split['impurity']:
                best_split['feature'] = feature
                best_split['threshold'] = threshold
                best_split['impurity'] = total_impurity
    
    return best_split

In [24]:
def build_decision_tree(data, criterion, max_depth):
    if max_depth == 0 or len(data['Spending Score (1-100)'].unique()) == 1:
        return data['Spending Score (1-100)'].mode().iloc[0]
    
    best_split = find_best_split(data, criterion)
    if best_split['impurity'] == 0:
        return data['Spending Score (1-100)'].mode().iloc[0]
    
    left_data, right_data = split_data(data, best_split['feature'], best_split['threshold'])
    
    left_subtree = build_decision_tree(left_data, criterion, max_depth - 1)
    right_subtree = build_decision_tree(right_data, criterion, max_depth - 1)
    
    return {'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree}

In [25]:
def evaluate_tree_accuracy(tree, data):
    correct = 0
    total = len(data)
    
    for index, row in data.iterrows():
        prediction = predict(tree, row)
        if prediction == row['Spending Score (1-100)']:
            correct += 1
    
    accuracy = correct / total
    return accuracy

In [26]:
def predict(tree, data):
    if isinstance(tree, dict):
        feature = tree['feature']
        threshold = tree['threshold']
        if data[feature] <= threshold:
            return predict(tree['left'], data)
        else:
            return predict(tree['right'], data)
    else:
        return tree

In [27]:
from sklearn.model_selection import train_test_split

# Assuming 'Target' is the name of the target variable
X = data.drop(columns=['Spending Score (1-100)'])
y = data['Spending Score (1-100)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize lists to store accuracy values for Gini and Entropy criteria
accuracy_gini = []
accuracy_entropy = []

# Evaluate decision trees for different depths and record accuracy
max_depth_range = range(1, 10)  # Adjust the range as needed

In [28]:
for max_depth in max_depth_range:
    # Build decision trees for both criteria
    decision_tree_gini = build_decision_tree(X_train, 'gini', max_depth)
    decision_tree_entropy = build_decision_tree(X_train, 'entropy', max_depth)
    
    # Evaluate and record accuracy for Gini and Entropy criteria
    accuracy_gini.append(evaluate_tree_accuracy(decision_tree_gini, X_test))
    accuracy_entropy.append(evaluate_tree_accuracy(decision_tree_entropy, X_test))

# Plot the accuracy comparison
plt.figure(figsize=(10, 6))
plt.plot(max_depth_range, accuracy_gini, marker='o', label='Gini Impurity')
plt.plot(max_depth_range, accuracy_entropy, marker='o', label='Entropy')
plt.xlabel('Max Depth of Decision Tree')
plt.ylabel('Accuracy')
plt.title('Accuracy Comparison: Gini vs. Entropy')
plt.legend()
plt.grid(True)
plt.show()


KeyError: 'Spending Score (1-100)'