In [None]:
import pandas as pd
import numpy as np
import math
import graphviz

In [None]:
# Mount Google Drive at /content/drive so the dataset path used below is readable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the airfoil self-noise CSV on the mounted Drive; passed to pd.read_csv below.
path="/content/drive/MyDrive/MachineLearning/13-03-2023/airfoil_self_noise.csv"

In [None]:
# The first CSV column has no header and loads as "Unnamed: 0"; it looks like a saved
# row index. Use it as the DataFrame index so it is not picked up as a predictive
# feature by `data.iloc[:, :-1]` or by the tree's split search.
data=pd.read_csv(path, index_col=0)
data

Unnamed: 0,x0,x1,x2,x3,x4,y
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
...,...,...,...,...,...,...
1498,2500,15.6,0.1016,39.6,0.052849,110.264
1499,3150,15.6,0.1016,39.6,0.052849,109.254
1500,4000,15.6,0.1016,39.6,0.052849,106.604
1501,5000,15.6,0.1016,39.6,0.052849,106.224


In [None]:
class Node():
    """Tree node holding a name, a value, child nodes and one label per child edge."""

    def __init__(self, name, value):
        self.name = name
        self.value = value
        # Children and labels are filled in by add_child / add_label.
        self.children = []
        self.label = []

    def add_child(self, node):
        """Append `node` to this node's children."""
        self.children.append(node)

    def add_label(self, label):
        """Append `label` to this node's list of labels."""
        self.label.append(label)

    def print_node(self):
        """Print this node (its name if truthy, otherwise its value), then every descendant depth-first."""
        shown = self.name if self.name else self.value
        print(shown)
        for child in self.children:
            child.print_node()

In [None]:

class RegressionTree:
    """Binary regression tree built from a pandas DataFrame.

    Each internal node stores a feature name (``Node.name``) and a threshold
    (``Node.value``); rows with ``feature <= threshold`` go to ``children[0]``,
    the rest to ``children[1]``. Leaves have no children and store the predicted
    value in ``Node.value``. A feature is dropped from the data once it has been
    used on a path, so each feature is split on at most once per branch.
    """

    def __init__(self, dataset, target_feature):
        """Build the tree from `dataset`, predicting the column named `target_feature`."""
        # get_best_split() reads the target from the LAST column, so make sure the
        # named target really is last before any split search runs.
        columns = list(dataset.columns)
        if target_feature in columns and columns[-1] != target_feature:
            columns.remove(target_feature)
            dataset = dataset[columns + [target_feature]]
        self.root = self.construct(dataset, target_feature)

    def construct(self, dataset, target_feature):
        """Recursively build and return the (sub)tree for `dataset`.

        Returns a leaf when the data is empty (value ``None``), when the target is
        constant, when no feature columns remain, or when no threshold separates
        the rows into two non-empty groups (value = target mean).
        """
        # Empty data: leaf with no value.
        if len(dataset) == 0:
            return Node(None, None)
        target = dataset[target_feature]
        # Constant target: nothing left to explain.
        if len(target.unique()) == 1:
            return Node(None, target.iloc[0])
        # Only the target column remains: predict its mean.
        if len(dataset.columns) == 1:
            return Node(None, target.mean())

        num_features = len(dataset.columns) - 1
        best = self.get_best_split(dataset, num_features)
        # No threshold yields two non-empty children (e.g. all remaining features
        # are constant): fall back to a mean leaf instead of failing on an empty dict.
        if not best:
            return Node(None, target.mean())

        best_feature = dataset.columns[best["feature_index"]]
        threshold = best["threshold"]
        root_node = Node(best_feature, threshold)

        # Always split once a valid split exists. The stored score is the children's
        # summed squared error, which is exactly 0 for a perfect split, so it must
        # not be used as a "> 0" gate (that left internal nodes without children).
        for i, subset in enumerate(self.split(dataset, best["feature_index"], threshold)):
            # Each feature is used at most once along a path.
            child = self.construct(subset.drop(best_feature, axis=1), target_feature)
            root_node.add_label(str(i))
            root_node.add_child(child)
        return root_node

    def get_best_split(self, dataset, num_features):
        """Find the (feature, threshold) pair with the lowest children SSE.

        The first `num_features` columns are candidate features and the last column
        is the target. Returns a dict with keys ``feature_index``, ``threshold`` and
        ``var_red`` (the children's summed squared error, lower is better), or an
        empty dict when no threshold produces two non-empty children.
        """
        best_split = {}
        lowest_sse = float("inf")
        target = dataset.iloc[:, -1].to_numpy(dtype=float)
        for feature_index in range(num_features):
            feature_values = dataset.iloc[:, feature_index].to_numpy()
            # Every distinct observed value is a candidate threshold.
            for threshold in np.unique(feature_values):
                left_mask = feature_values <= threshold
                right_mask = feature_values > threshold
                if not (left_mask.any() and right_mask.any()):
                    continue
                curr_sse = self.SSE(target, target[left_mask], target[right_mask])
                # Strict "<" keeps the first best candidate on ties.
                if curr_sse < lowest_sse:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = threshold
                    best_split["var_red"] = curr_sse
                    lowest_sse = curr_sse
        return best_split

    def split(self, dataset, feature_index, threshold):
        """Return ``(left, right)``: rows with feature ``<= threshold`` and ``> threshold``."""
        column = dataset[dataset.columns[feature_index]]
        dataset_left = dataset.loc[column <= threshold]
        dataset_right = dataset.loc[column > threshold]
        return dataset_left, dataset_right

    def SSE(self, parent, l_child, r_child):
        """Return the summed squared error of the two children around their own means.

        Computed as ``n * population variance`` per child. `parent` is accepted for
        interface compatibility but not used. Inputs may be arrays, Series or
        single-column DataFrames.
        """
        total = 0.0
        for child in (l_child, r_child):
            values = np.asarray(child, dtype=float).ravel()
            if values.size:
                total += values.size * float(np.var(values))
        return total

    def predict(self, test):
        """Return a list with one prediction per row of the DataFrame `test`."""
        predictions = []
        for _, row in test.iterrows():
            node = self.root
            # Internal nodes are exactly the nodes that have children.
            while node.children:
                if row[node.name] <= node.value:
                    node = node.children[0]
                else:
                    node = node.children[1]
            predictions.append(node.value)
        return predictions

    # def accuracy(self,actual,predicted):
    #     actual=list(actual)
    #     predicted=list(predicted)
    #     correct=0
    #     for i in range(len(actual)):
    #       if actual[i]==predicted[i]:
    #         correct+=1
        
    #     return (correct/len(actual))*100




In [None]:
# Features are every column except the last; the target is the last column.
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state=41 keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

In [None]:
# Fit on the training rows only. Building the tree from the full `data` frame would
# let it see the held-out rows that it is scored on afterwards (train/test leakage).
# The target Series keeps its column name "y", so it lands as the last column.
reg=RegressionTree(pd.concat([X_train, Y_train], axis=1),"y")


In [None]:
# Predict the held-out rows, then report the root-mean-squared error against Y_test
# (square root of sklearn's mean_squared_error).
Y_pred = reg.predict(X_test) 
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Y_test, Y_pred))

In [None]:
import numpy as np

class RegressionTree:
    """Regression tree over NumPy arrays, stored as nested dicts.

    An internal node is ``{'feature', 'threshold', 'left', 'right'}``; samples with
    ``x[feature] < threshold`` go left, all others go right. A leaf is
    ``{'leaf_value': mean of the targets that reached it}``.

    Parameters
    ----------
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    max_depth : int
        Maximum number of split levels; the root is at depth 0 and a node at
        depth ``max_depth`` is always a leaf.
    max_features : int or None
        Number of features examined per node. ``None`` (default) examines all of
        them; a smaller number draws that many at random without replacement.
    random_state : int or None
        Seed for the feature sub-sampling generator (only used when
        `max_features` is smaller than the number of features).
    """

    def __init__(self, min_samples_split=2, max_depth=2, max_features=None, random_state=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.max_features = max_features
        self._rng = np.random.default_rng(random_state)
        self.tree = {}

    def fit(self, X, y):
        """Grow the tree from a 2-D feature array-like `X` and 1-D targets `y`."""
        self.tree = self._grow_tree(np.asarray(X), np.asarray(y))

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the node dict for the samples (X, y)."""
        n_samples, n_features = X.shape

        can_split = (n_samples >= self.min_samples_split
                     and depth < self.max_depth  # "<": max_depth counts split levels
                     and len(np.unique(y)) > 1)
        if can_split:
            feature_idxs = self._candidate_features(n_features)
            best_feature, best_threshold = self._best_split(X, y, feature_idxs)

            # _best_split returns (None, None) when no threshold separates the
            # samples (e.g. every candidate feature is constant): make a leaf.
            if best_feature is not None:
                left_idxs = X[:, best_feature] < best_threshold
                right_idxs = X[:, best_feature] >= best_threshold
                return {'feature': best_feature,
                        'threshold': best_threshold,
                        'left': self._grow_tree(X[left_idxs], y[left_idxs], depth + 1),
                        'right': self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)}

        return {'leaf_value': self._leaf_value(y)}

    def _candidate_features(self, n_features):
        """Return the feature indices to examine at one node."""
        if self.max_features is None or self.max_features >= n_features:
            return np.arange(n_features)
        return self._rng.choice(n_features, self.max_features, replace=False)

    def _best_split(self, X, y, feature_idxs):
        """Return ``(feature_index, threshold)`` with the highest gain, or ``(None, None)``."""
        best_gain = -np.inf
        split_idx, split_threshold = None, None

        for feature_idx in feature_idxs:
            column = X[:, feature_idx]
            # Every distinct observed value is a candidate threshold.
            for threshold in np.unique(column):
                left_idxs = column < threshold
                right_idxs = column >= threshold
                # Skip thresholds that leave one side empty.
                if not (left_idxs.any() and right_idxs.any()):
                    continue
                gain = self._split_gain(y, y[left_idxs], y[right_idxs])
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature_idx
                    split_threshold = threshold

        return split_idx, split_threshold

    def _split_gain(self, parent, left, right):
        """Return the variance reduction of a split.

        Parent variance minus the children's variances, each weighted by the
        child's share of the samples, so a tiny pure child cannot dominate.
        """
        n_total = len(left) + len(right)
        if n_total == 0:
            return 0.0
        weighted_children = 0.0
        for child in (left, right):
            if len(child) > 0:
                weighted_children += len(child) / n_total * np.var(child)
        return float(np.var(parent) - weighted_children)

    def _leaf_value(self, y):
        """Return the prediction for a leaf: the mean of its targets."""
        return np.mean(y)

    def predict(self, X):
        """Return an array with one prediction per row of the 2-D array-like `X`."""
        return np.array([self._traverse_tree(x, self.tree) for x in np.asarray(X)])

    def _traverse_tree(self, x, node):
        """Walk from `node` to a leaf for the single sample `x` and return its value."""
        if 'leaf_value' in node:
            return node['leaf_value']

        branch = 'left' if x[node['feature']] < node['threshold'] else 'right'
        return self._traverse_tree(x, node[branch])


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [None]:

# Seed NumPy's global generator so any random draws made while fitting are repeatable
np.random.seed(42)

# Generate synthetic data
X, y = make_regression(n_samples=100, n_features=5, noise=5, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a regression tree with max depth of 3
tree = RegressionTree(max_depth=3)

# Fit the regression tree to the training data
tree.fit(X_train, y_train)

# Make predictions on the test data
y_pred = tree.predict(X_test)

# Calculate the mean squared error
mse = np.mean((y_test - y_pred)**2)
print(f"Mean Squared Error: {mse:.2f}")

# Plot the true vs predicted values; the dashed identity line spans the observed
# value range instead of a fixed +/-200 window that may not match the data
lims = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
plt.scatter(y_test, y_pred)
plt.plot(lims, lims, 'r--')
plt.xlabel("True Values")
plt.ylabel("Predictions")
plt.title("Regression tree: true vs predicted")
plt.show()

In [None]:
def construct(self, dataset, target_feature):
    """Recursively build a tree that branches on every distinct value of the chosen feature.

    At each level the feature with the lowest ``self.mean_squared_error`` score is
    chosen, the data is partitioned with ``self.splitbycolumn`` and the chosen
    feature is dropped before recursing. Returns the root ``Node`` of the subtree.
    """
    # Guard clauses: empty data -> valueless leaf; only the target left -> mean leaf.
    if len(dataset) == 0:
        return Node(None, None)
    if len(dataset.columns) == 1:
        return Node(None, dataset[target_feature].mean())

    # Score every non-target column and keep the first one with the strictly lowest score.
    best_feature, min_mse = None, float('inf')
    candidates = [column for column in dataset.columns if column != target_feature]
    for column in candidates:
        score = self.mean_squared_error(dataset, column, target_feature)
        if score < min_mse:
            best_feature, min_mse = column, score

    # The node records the chosen feature together with its score.
    root_node = Node(best_feature, min_mse)

    # One child per distinct value of the chosen feature; the edge label is that value.
    partitions = self.splitbycolumn(dataset, best_feature)
    for key, subset in partitions.items():
        reduced = subset.drop(best_feature, axis=1)
        subtree = self.construct(reduced, target_feature)
        root_node.add_label(str(key))
        root_node.add_child(subtree)

    return root_node

def splitbycolumn(self, dataset, feature):
    """Partition `dataset` by the distinct values of column `feature`.

    Returns a dict mapping each distinct value (in order of first appearance) to
    the rows of `dataset` whose `feature` equals that value.
    """
    column = dataset[feature]
    return {value: dataset.loc[dataset[feature] == value] for value in column.unique()}