In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans

In [2]:
class Node:
    """One node of a regression tree.

    An internal node carries the split rule (``feature_index``, ``threshold``),
    its two children (``left``, ``right``) and the variance reduction achieved
    by the split (``var_red``). A leaf node carries only the prediction
    (``value``). Every node records the target ``variance`` it was built with.
    All fields default to ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None, var_red=None, variance=None):
        # Split rule and children (populated for internal nodes).
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        # Prediction (populated for leaves).
        self.value = value
        # Split quality and node impurity bookkeeping.
        self.var_red, self.variance = var_red, variance

class DecisionTreeRegressor:
    """Regression tree that picks splits by maximum variance reduction.

    Parameters
    ----------
    min_samples_split : int, default=2
        Minimum number of samples a node must hold to be considered for
        splitting.
    max_depth : int, default=2
        Maximum number of split levels below the root. ``max_depth=0`` yields
        a single leaf that predicts the mean target.

    Attributes
    ----------
    root : Node or None
        Root of the fitted tree; ``None`` until :meth:`fit` is called.
    """

    def __init__(self, min_samples_split=2, max_depth=2):
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, curr_depth=0, parent_variance=None):
        """Recursively grow the subtree for ``dataset`` and return its root node.

        Parameters
        ----------
        dataset : 2-D array
            Feature columns followed by the target in the last column.
        curr_depth : int
            Depth of the node being built (the root is depth 0).
        parent_variance : float or None
            Target variance of ``dataset``; computed here when omitted.
        """
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        if parent_variance is None:
            parent_variance = np.var(Y)

        # Strict `<`: a node sitting at depth == max_depth must become a leaf,
        # otherwise the tree ends up one level deeper than requested.
        if num_samples >= self.min_samples_split and curr_depth < self.max_depth:
            feature_indices = np.arange(num_features)
            best_split = self.get_best_split(dataset, num_samples, feature_indices, parent_variance)
            # Only split when it strictly reduces the variance.
            if best_split["var_red"] > 0:
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1, best_split["left_variance"])
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1, best_split["right_variance"])
                return Node(
                    feature_index=best_split["feature_index"],
                    threshold=best_split["threshold"],
                    left=left_subtree,
                    right=right_subtree,
                    var_red=best_split["var_red"],
                    variance=parent_variance,
                )

        # No admissible split: emit a leaf predicting the mean target.
        return Node(value=self.calculate_leaf_value(Y), variance=parent_variance)

    def get_best_split(self, dataset, num_samples, feature_indices, parent_variance):
        """Find the (feature, threshold) pair with the largest variance reduction.

        For each feature the rows are sorted by that feature and every cut
        between two *distinct* consecutive values is scored using prefix sums
        of the target and squared target.

        Returns
        -------
        dict
            Keys ``feature_index``, ``threshold``, ``dataset_left``,
            ``dataset_right``, ``var_red``, ``left_variance`` and
            ``right_variance``. When no cut exists, ``var_red`` is ``-inf``
            and the other entries are ``None``.
        """
        best_split = {
            "feature_index": None,
            "threshold": None,
            "dataset_left": None,
            "dataset_right": None,
            "var_red": -float("inf"),
            "left_variance": None,
            "right_variance": None,
        }

        y = dataset[:, -1]
        # Candidate cut k puts the first k + 1 sorted rows on the left.
        left_counts = np.arange(1, num_samples)
        right_counts = num_samples - left_counts

        for feature_index in feature_indices:
            feature_values = dataset[:, feature_index]
            order = np.argsort(feature_values)
            sorted_values = feature_values[order]

            # A cut is only meaningful between two different feature values.
            is_boundary = sorted_values[1:] != sorted_values[:-1]
            if not is_boundary.any():
                continue

            sorted_y = y[order]
            prefix_sum = np.cumsum(sorted_y)
            prefix_sq_sum = np.cumsum(sorted_y ** 2)

            left_sum = prefix_sum[:-1]
            left_sq_sum = prefix_sq_sum[:-1]
            right_sum = prefix_sum[-1] - left_sum
            right_sq_sum = prefix_sq_sum[-1] - left_sq_sum

            # Var = E[y^2] - E[y]^2 on each side of every candidate cut.
            left_var = left_sq_sum / left_counts - (left_sum / left_counts) ** 2
            right_var = right_sq_sum / right_counts - (right_sum / right_counts) ** 2

            var_red = parent_variance - (
                (left_counts / num_samples) * left_var
                + (right_counts / num_samples) * right_var
            )
            var_red = np.where(is_boundary, var_red, -np.inf)

            # argmax returns the first maximum, matching a left-to-right scan
            # that only accepts strictly better candidates.
            cut = int(np.argmax(var_red))
            if var_red[cut] > best_split["var_red"]:
                sorted_dataset = dataset[order]
                best_split = {
                    "feature_index": feature_index,
                    # Largest value on the left side: prediction sends
                    # `value <= threshold` left, so training and prediction
                    # route every sample identically.
                    "threshold": sorted_values[cut],
                    "dataset_left": sorted_dataset[:cut + 1],
                    "dataset_right": sorted_dataset[cut + 1:],
                    "var_red": var_red[cut],
                    "left_variance": left_var[cut],
                    "right_variance": right_var[cut],
                }

        return best_split

    def calculate_leaf_value(self, Y):
        """Return the leaf prediction: the mean of the targets in the node."""
        return np.mean(Y)

    def fit(self, X, Y):
        """Build the tree from features ``X`` (2-D) and targets ``Y`` (1-D).

        Raises
        ------
        ValueError
            If ``X`` or ``Y`` is empty.
        """
        # Accept any array-like (lists, pandas objects) and work in float64.
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(Y, dtype=np.float64)
        if len(X) == 0 or len(Y) == 0:
            raise ValueError("Input data cannot be empty.")
        dataset = np.concatenate((X, Y.reshape(-1, 1)), axis=1)
        self.root = self.build_tree(dataset)

    def make_prediction(self, x, tree):
        """Route the single sample ``x`` down ``tree`` and return the leaf value."""
        node = tree
        # Leaves are the nodes that carry a value.
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X):
        """Return a list with one prediction per row of ``X``."""
        # np.asarray guarantees row-wise iteration even for DataFrame input.
        return [self.make_prediction(x, self.root) for x in np.asarray(X)]


In [3]:
class GradientBoostingRegressor:
    """Gradient boosting for squared-error regression built on regression trees.

    The ensemble starts from the mean of the training targets and repeatedly
    fits a ``DecisionTreeRegressor`` to the current residuals, adding its
    predictions scaled by ``learning_rate``.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to fit.
    learning_rate : float, default=0.05
        Factor applied to each tree's contribution.
    max_depth : int, default=3
        ``max_depth`` forwarded to every tree.
    min_samples_split : int, default=2
        ``min_samples_split`` forwarded to every tree.
    verbose : bool, default=True
        Print the training (and validation) MSE after every tree.

    Attributes
    ----------
    trees : list
        Fitted trees, in boosting order.
    initial_prediction : float or None
        Mean of the training targets; ``None`` until :meth:`fit` is called.
    training_mse_history, validation_mse_history : list of float
        MSE after each boosting round. The validation history stays empty
        when no validation data is supplied.
    """

    def __init__(self, n_estimators=100, learning_rate=0.05, max_depth=3, min_samples_split=2, verbose=True):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.verbose = verbose
        self.trees = []
        self.initial_prediction = None
        self.training_mse_history = []
        self.validation_mse_history = []

    def fit(self, X_train, y_train, X_val=None, y_val=None):
        """Fit the ensemble, optionally tracking MSE on a validation set.

        Validation metrics are recorded only when both ``X_val`` and ``y_val``
        are given. Returns ``self``.

        Raises
        ------
        ValueError
            If the training targets are empty.
        """
        X_train = np.asarray(X_train)
        # Flatten so residual arithmetic stays 1-D for Series / column vectors.
        y_train = np.asarray(y_train, dtype=np.float64).ravel()
        if y_train.size == 0:
            raise ValueError("Input data cannot be empty.")

        has_validation = X_val is not None and y_val is not None
        if has_validation:
            X_val = np.asarray(X_val)
            y_val = np.asarray(y_val, dtype=np.float64).ravel()

        # Start from a clean slate so a second fit() does not stack a new
        # ensemble on top of the previous one.
        self.trees = []
        self.training_mse_history = []
        self.validation_mse_history = []

        self.initial_prediction = np.mean(y_train)
        y_pred_train = np.full(y_train.shape, self.initial_prediction, dtype=np.float64)
        y_pred_val = None
        if has_validation:
            y_pred_val = np.full(y_val.shape, self.initial_prediction, dtype=np.float64)

        for i in range(self.n_estimators):
            # For squared error the negative gradient is the plain residual.
            residuals = y_train - y_pred_train
            tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X_train, residuals)
            self.trees.append(tree)
            y_pred_train += self.learning_rate * np.array(tree.predict(X_train))

            train_mse = np.mean((y_train - y_pred_train) ** 2)
            self.training_mse_history.append(train_mse)

            val_mse = None
            if has_validation:
                y_pred_val += self.learning_rate * np.array(tree.predict(X_val))
                val_mse = np.mean((y_val - y_pred_val) ** 2)
                self.validation_mse_history.append(val_mse)

            if self.verbose:
                if val_mse is not None:
                    print(f"Tree {i+1}/{self.n_estimators} - Training MSE: {train_mse:.4f}, Validation MSE: {val_mse:.4f}")
                else:
                    print(f"Tree {i+1}/{self.n_estimators} - Training MSE: {train_mse:.4f}")

        return self

    def predict(self, X):
        """Return the ensemble prediction for every row of ``X`` as a 1-D array.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if self.initial_prediction is None:
            raise ValueError("This GradientBoostingRegressor is not fitted yet; call fit() first.")
        X = np.asarray(X)
        y_pred = np.full((X.shape[0],), self.initial_prediction, dtype=np.float64)
        for tree in self.trees:
            y_pred += self.learning_rate * np.array(tree.predict(X))
        return y_pred


In [None]:
# Load the raw training table.
train_path = 'SP_Train.xlsx'
train_data = pd.read_excel(train_path)

# Impute missing item weights with the column mean. The result is assigned back
# to the frame: calling fillna(inplace=True) on a column selection is chained
# assignment, which is deprecated and leaves train_data untouched once pandas
# Copy-on-Write is active.
train_data['Item_Weight'] = train_data['Item_Weight'].fillna(train_data['Item_Weight'].mean())

# Impute missing outlet sizes from the outlet type. Outlet types that are not
# listed here keep their missing value.
outlet_size_by_type = {
    'Grocery Store': 'Small',
    'Supermarket Type1': 'Small',
    'Supermarket Type2': 'Medium',
    'Supermarket Type3': 'Medium',
}
train_data['Outlet_Size'] = train_data['Outlet_Size'].fillna(
    train_data['Outlet_Type'].map(outlet_size_by_type)
)

# Collapse the alternative spellings of the fat-content labels.
train_data['Item_Fat_Content'] = train_data['Item_Fat_Content'].replace({'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'})

# One-hot encode the categorical columns, dropping the first level of each.
train_data_encoded = pd.get_dummies(train_data, drop_first=True)

X = train_data_encoded.drop(columns=['Item_Outlet_Sales'])
y = np.array(train_data_encoded['Item_Outlet_Sales'])

# NOTE(review): the scaler and PCA below are fitted on every row before the
# train/validation split, so validation rows influence the transform. Consider
# fitting both on the training split only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA cannot keep more components than min(n_samples, n_features); clamp the
# requested 1326 so the cell still runs on a smaller or differently encoded table.
n_components = min(1326, *X_scaled.shape)
pca = PCA(n_components=n_components, svd_solver='randomized', random_state=42)
X_pca = pca.fit_transform(X_scaled)
print(X_pca.shape)

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

def evaluate_model(n_trees, max_depth):
    """Fit a boosted ensemble on the training split and return its validation MSE.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Parameters
    ----------
    n_trees : int
        Number of boosting rounds (``n_estimators``).
    max_depth : int
        Depth limit passed to every tree.
    """
    booster = GradientBoostingRegressor(n_estimators=n_trees, max_depth=max_depth)
    # Passing the validation split makes fit() track validation MSE per round.
    booster.fit(X_train, y_train, X_val, y_val)
    return mean_squared_error(y_val, booster.predict(X_val))

# Hyperparameter grid: every (n_trees, max_depth) combination is evaluated.
n_trees_list = [100]
max_depth_list = [3,7,10]

best_mse, best_params = float('inf'), {}

# Flatten the grid, keeping n_trees as the outer (slowest-varying) dimension.
param_grid = [(trees, depth) for trees in n_trees_list for depth in max_depth_list]

for n_trees, max_depth in param_grid:
    print(f"Evaluating: n_trees={n_trees}, max_depth={max_depth}")
    mse = evaluate_model(n_trees, max_depth)
    print(f"Mean Squared Error: {mse}")
    # Keep the incumbent unless this candidate is strictly better.
    if not mse < best_mse:
        continue
    best_mse = mse
    best_params = {'n_trees': n_trees, 'max_depth': max_depth}

# Report the winning configuration.
print("\nBest Hyperparameters:")
print(f"n_trees: {best_params['n_trees']}")
print(f"max_depth: {best_params['max_depth']}")
print(f"Best Mean Squared Error: {best_mse}")
