In [19]:
from sklearn.svm import SVR
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics.pairwise import linear_kernel, rbf_kernel, polynomial_kernel
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
data = pd.read_csv('Dataset-1.csv')
X = data.drop(columns=['BWEIGHT'])  # Features
y = data['BWEIGHT']  # Target variable

In [21]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [22]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [23]:
X_processed = preprocessor.fit_transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

In [25]:
n_samples = 1000
X_train_s = X_train[:n_samples]
X_test_s = X_test[:n_samples]
y_train_s = y_train[:n_samples]
y_test_s = y_test[:n_samples]

In [26]:
w_lin, w_rbf, w_poly = 0.33, 0.33, 0.34  # Example weights, adjust as needed
gamma = "scale"
degree = 2
coef0 = 1.0

In [28]:
K_lin_tr = linear_kernel(X_train_s, X_train_s)
K_rbf_tr = rbf_kernel(X_train_s, X_train_s)  # Remove gamma=gamma here
K_pol_tr = polynomial_kernel(X_train_s, X_train_s, degree=degree, coef0=coef0)

K_train = w_lin * K_lin_tr + w_rbf * K_rbf_tr + w_poly * K_pol_tr

In [30]:
K_lin_te = linear_kernel(X_test_s, X_train_s)
K_rbf_te = rbf_kernel(X_test_s, X_train_s)
K_pol_te = polynomial_kernel(X_test_s, X_train_s, degree=degree, coef0=coef0)

K_test = w_lin * K_lin_te + w_rbf * K_rbf_te + w_poly * K_pol_te

In [31]:
svr = SVR(kernel="precomputed", C=10.0, epsilon=0.1)
svr.fit(K_train, y_train_s)

0,1,2
,kernel,'precomputed'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,10.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [34]:
y_pred = svr.predict(K_test)
mae = mean_absolute_error(y_test_s, y_pred)
rmse = mean_squared_error(y_test_s, y_pred)  # Corrected syntax
r2 = r2_score(y_test_s, y_pred)

In [35]:
# Print results
print("========= Results (Multi-Kernel SVR) =========")
print(f"MAE : {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R^2 : {r2:.4f}")

MAE : 0.9159
RMSE: 1.4046
R^2 : 0.2146


In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics.pairwise import linear_kernel, rbf_kernel, polynomial_kernel

# Load the dataset
data = pd.read_csv('Dataset-1.csv')
X = data.drop(columns=['BWEIGHT'])  # Features
y = data['BWEIGHT']  # Target variable

# Preprocessing for numerical and categorical data
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit and transform the data
X_processed = preprocessor.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Use a subset of the data to avoid MemoryError (e.g., 1000 samples)
n_samples = 1000  # Total samples for training + testing
n_test_samples = n_samples // 5  # Approximately 200 samples for testing (20% of 1000)
X_train_s = X_train[:n_samples - n_test_samples]  # 800 training samples
X_test_s = X_test[:n_test_samples]  # 200 test samples
y_train_s = y_train[:n_samples - n_test_samples].values  # 800 training labels
y_test_s = y_test[:n_test_samples].values  # 200 test labels

# Define weights and kernel parameters
w_lin, w_rbf, w_poly = 0.33, 0.33, 0.34  # Example weights, adjust as needed
gamma = "scale"  # Ensure gamma is defined
degree = 2
coef0 = 1.0

# Compute kernel matrices for training data
K_lin_tr = linear_kernel(X_train_s, X_train_s)
K_rbf_tr = rbf_kernel(X_train_s, X_train_s)  # No gamma for identical inputs
K_pol_tr = polynomial_kernel(X_train_s, X_train_s, degree=degree, coef0=coef0)
K_train = w_lin * K_lin_tr + w_rbf * K_rbf_tr + w_poly * K_pol_tr

# Compute kernel matrices for test data
K_lin_te = linear_kernel(X_test_s, X_train_s)
K_rbf_te = rbf_kernel(X_test_s, X_train_s)  # Keep gamma for different inputs
K_pol_te = polynomial_kernel(X_test_s, X_train_s, degree=degree, coef0=coef0)
K_test = w_lin * K_lin_te + w_rbf * K_rbf_te + w_poly * K_pol_te

# Convert y_train_s to a column vector
y_train_s = y_train_s.reshape(-1, 1)

# SVR Scratch Implementation
C = 10.0  # Regularization parameter
epsilon = 0.1  # Epsilon in epsilon-SVR

def compute_alpha(K, y, C, epsilon):
    n_samples = K.shape[0]
    # Initialize Lagrange multipliers
    alpha = np.zeros((n_samples, 1))
    alpha_star = np.zeros((n_samples, 1))
    
    # Objective function and constraints (simplified gradient descent)
    learning_rate = 0.01
    max_iter = 1000
    tol = 1e-4
    
    for _ in range(max_iter):
        # Compute gradients
        grad_alpha = K @ (alpha - alpha_star) - y + epsilon * np.ones((n_samples, 1))
        grad_alpha_star = -grad_alpha
        
        # Update rules with constraints (0 <= alpha, alpha_star <= C)
        alpha_new = np.maximum(0, np.minimum(C, alpha - learning_rate * grad_alpha))
        alpha_star_new = np.maximum(0, np.minimum(C, alpha_star - learning_rate * grad_alpha_star))
        
        # Check convergence
        if np.max(np.abs(alpha_new - alpha)) < tol and np.max(np.abs(alpha_star_new - alpha_star)) < tol:
            break
        
        alpha = alpha_new
        alpha_star = alpha_star_new
    
    return alpha, alpha_star

# Compute alpha and alpha_star
alpha, alpha_star = compute_alpha(K_train, y_train_s, C, epsilon)

# Compute bias (b) - average over support vectors
support_mask = (alpha > 0) & (alpha < C)  # Approximate support vectors
if np.sum(support_mask) > 0:
    b = np.mean(y_train_s[support_mask] - (K_train[support_mask] @ (alpha - alpha_star))[support_mask] + epsilon)
else:
    b = 0.0  # Default if no support vectors

# Prediction function
def predict(K_test, alpha, alpha_star, b):
    return (K_test @ (alpha - alpha_star)).flatten() + b

# Make predictions
y_pred = predict(K_test, alpha, alpha_star, b)

# Evaluate the model
mae = np.mean(np.abs(y_test_s - y_pred))
rmse = np.sqrt(np.mean((y_test_s - y_pred) ** 2))
r2 = 1 - np.sum((y_test_s - y_pred) ** 2) / np.sum((y_test_s - np.mean(y_test_s)) ** 2)

# Print results
print("========= Results (Scratch Multi-Kernel SVR) =========")
print(f"MAE : {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R^2 : {r2:.4f}")

MAE : 26520.8482
RMSE: 27916.6265
R^2 : -504988490.7566


In [42]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics.pairwise import linear_kernel, rbf_kernel, polynomial_kernel
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ============================================
# 1️⃣ Load and preprocess data
# ============================================

# Load dataset
data = pd.read_csv('Dataset-1.csv')

# Split features and target
X = data.drop(columns=['BWEIGHT'])
y = data['BWEIGHT']

# Identify column types
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Pipelines for numerical and categorical features
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

# Combine transformers
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Preprocess data
X_processed = preprocessor.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Use a manageable subset (avoid memory issues)
n_samples = min(2000, len(X_train) + len(X_test))
n_test_samples = n_samples // 5
X_train_s = X_train[:n_samples - n_test_samples]
X_test_s = X_test[:n_test_samples]
y_train_s = y_train[:n_samples - n_test_samples].values.reshape(-1, 1)
y_test_s = y_test[:n_test_samples].values.reshape(-1, 1)

# ============================================
# 2️⃣ Multi-Kernel Construction
# ============================================

w_lin, w_rbf, w_poly = 0.3, 0.4, 0.3   # Tunable kernel weights
gamma = 'scale'
degree = 3
coef0 = 1.0

# Kernel matrices (training and test)
K_lin_tr = linear_kernel(X_train_s, X_train_s)
K_rbf_tr = rbf_kernel(X_train_s, X_train_s)
K_pol_tr = polynomial_kernel(X_train_s, X_train_s, degree=degree, coef0=coef0)

K_train = w_lin*K_lin_tr + w_rbf*K_rbf_tr + w_poly*K_pol_tr

K_lin_te = linear_kernel(X_test_s, X_train_s)
K_rbf_te = rbf_kernel(X_test_s, X_train_s)
K_pol_te = polynomial_kernel(X_test_s, X_train_s, degree=degree, coef0=coef0)

K_test = w_lin*K_lin_te + w_rbf*K_rbf_te + w_poly*K_pol_te

# ============================================
# 3️⃣ Simplified SVR Optimization (Scratch)
# ============================================

def compute_alpha(K, y, C=10.0, epsilon=0.1, lr=0.001, max_iter=2000, tol=1e-5):
    n_samples = K.shape[0]
    alpha = np.zeros((n_samples, 1))
    alpha_star = np.zeros((n_samples, 1))

    for _ in range(max_iter):
        grad = K @ (alpha - alpha_star) - y + epsilon
        alpha_new = np.clip(alpha - lr * grad, 0, C)
        alpha_star_new = np.clip(alpha_star + lr * grad, 0, C)

        # Convergence check
        if np.max(np.abs(alpha_new - alpha)) < tol:
            break

        alpha, alpha_star = alpha_new, alpha_star_new

    return alpha, alpha_star

# Train
alpha, alpha_star = compute_alpha(K_train, y_train_s)

# Compute bias (b)
diff = alpha - alpha_star
support_mask = (diff.flatten() != 0)
if np.sum(support_mask) > 0:
    b = np.mean(y_train_s[support_mask] - (K_train @ diff)[support_mask])
else:
    b = 0.0

# ============================================
# 4️⃣ Prediction and Evaluation
# ============================================

def predict(K_test, alpha, alpha_star, b):
    return (K_test @ (alpha - alpha_star)).flatten() + b

y_pred = predict(K_test, alpha, alpha_star, b)

# Metrics
mae = mean_absolute_error(y_test_s, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_s, y_pred))
r2 = r2_score(y_test_s, y_pred)

# ============================================
# 5️⃣ Print Results
# ============================================

print("========= Results (Enhanced Multi-Kernel SVR) =========")
print(f"MAE  : {mae:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"R²   : {r2:.4f}")
print(f"Bias : {b:.4f}")
print("=======================================================")


MAE  : 7817.0529
RMSE : 16710.4887
R²   : -159015448.5840
Bias : 50659.2318


In [45]:
# ===============================================================
# Multi-Kernel SVR (Memory-Efficient Version)
# ===============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics.pairwise import linear_kernel, rbf_kernel, polynomial_kernel
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ===============================================================
# 1️⃣ Load and preprocess dataset
# ===============================================================
data = pd.read_csv('Dataset-1.csv')

X = data.drop(columns=['BWEIGHT'])
y = data['BWEIGHT'].values

# Separate numerical and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Pipelines for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Combine transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

print(f"Dataset shape: {X_processed.shape}")
print(f"Memory estimate for kernel matrix: {(X_processed.shape[0]**2 * 8) / (1024**3):.2f} GB")

# ===============================================================
# 2️⃣ Split data and scale target
# ===============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# SOLUTION 1: Subsample if dataset is too large
MAX_TRAIN_SIZE = 5000  # Adjust based on your RAM
if X_train.shape[0] > MAX_TRAIN_SIZE:
    print(f"\n⚠️ Training set too large ({X_train.shape[0]} samples)")
    print(f"   Subsampling to {MAX_TRAIN_SIZE} samples for memory efficiency")
    indices = np.random.choice(X_train.shape[0], MAX_TRAIN_SIZE, replace=False)
    X_train = X_train[indices]
    y_train = y_train[indices]

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Scale target
y_scaler = StandardScaler()
y_train_s = y_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test_s = y_scaler.transform(y_test.reshape(-1, 1)).flatten()

# ===============================================================
# 3️⃣ Define Multi-Kernel setup (MEMORY EFFICIENT)
# ===============================================================
w_lin, w_rbf, w_poly = 0.2, 0.6, 0.2
degree = 3
coef0 = 1.0

n_features = X_train.shape[1]
gamma = 1.0 / (n_features * np.var(X_train))

print(f"\nKernel parameters:")
print(f"  Weights: lin={w_lin}, rbf={w_rbf}, poly={w_poly}")
print(f"  Gamma: {gamma:.6f}")
print(f"  Polynomial degree: {degree}")

# Compute combined kernels in float32 to save memory
print("\nComputing kernel matrices...")

K_lin_tr = linear_kernel(X_train, X_train).astype(np.float32)
K_rbf_tr = rbf_kernel(X_train, X_train, gamma=gamma).astype(np.float32)
K_pol_tr = polynomial_kernel(X_train, X_train, degree=degree, coef0=coef0).astype(np.float32)
K_train = w_lin * K_lin_tr + w_rbf * K_rbf_tr + w_poly * K_pol_tr
del K_lin_tr, K_rbf_tr, K_pol_tr  # Free memory

K_lin_te = linear_kernel(X_test, X_train).astype(np.float32)
K_rbf_te = rbf_kernel(X_test, X_train, gamma=gamma).astype(np.float32)
K_pol_te = polynomial_kernel(X_test, X_train, degree=degree, coef0=coef0).astype(np.float32)
K_test = w_lin * K_lin_te + w_rbf * K_rbf_te + w_poly * K_pol_te
del K_lin_te, K_rbf_te, K_pol_te  # Free memory

print("✓ Kernel matrices computed successfully")

# ===============================================================
# 4️⃣ Scratch SVR Optimizer
# ===============================================================
C = 10.0
epsilon = 0.1
learning_rate = 0.005
max_iter = 5000
tol = 1e-4

def compute_alpha(K, y, C, epsilon, lr, max_iter, tol):
    n_samples = K.shape[0]
    alpha = np.zeros((n_samples, 1), dtype=np.float32)
    alpha_star = np.zeros((n_samples, 1), dtype=np.float32)
    y = y.reshape(-1, 1).astype(np.float32)
    
    print("\nTraining SVR...")
    for iteration in range(max_iter):
        grad = K @ (alpha - alpha_star) - y
        grad_alpha = grad + epsilon
        grad_alpha_star = -grad + epsilon
        
        alpha_new = np.clip(alpha - lr * grad_alpha, 0, C)
        alpha_star_new = np.clip(alpha_star - lr * grad_alpha_star, 0, C)
        
        diff = max(np.max(np.abs(alpha_new - alpha)), 
                   np.max(np.abs(alpha_star_new - alpha_star)))
        
        if diff < tol:
            print(f"✓ Converged at iteration {iteration + 1}")
            break
        
        alpha, alpha_star = alpha_new, alpha_star_new
        
        if (iteration + 1) % 500 == 0:
            print(f"  Iteration {iteration + 1}/{max_iter}, diff: {diff:.6f}")
    
    return alpha, alpha_star

alpha, alpha_star = compute_alpha(K_train, y_train_s, C, epsilon, learning_rate, max_iter, tol)

# ===============================================================
# 5️⃣ Compute bias
# ===============================================================
support_mask = (alpha.flatten() > 1e-5) & (alpha.flatten() < C - 1e-5)
n_support = np.sum(support_mask)

if n_support > 0:
    b = np.mean(y_train_s[support_mask] - 
                (K_train[support_mask] @ (alpha - alpha_star)).flatten()[support_mask])
    print(f"\nSupport vectors: {n_support}/{len(y_train_s)}")
else:
    b = 0.0
    print("\n⚠️ No support vectors found, using b=0")

# ===============================================================
# 6️⃣ Predict and evaluate
# ===============================================================
def predict(K_test, alpha, alpha_star, b):
    return (K_test @ (alpha - alpha_star)).flatten() + b

y_pred_s = predict(K_test, alpha, alpha_star, b)
y_pred = y_scaler.inverse_transform(y_pred_s.reshape(-1, 1)).flatten()

# ===============================================================
# 7️⃣ Performance evaluation
# ===============================================================
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n" + "="*60)
print("   RESULTS: Multi-Kernel SVR (Scratch Implementation)")
print("="*60)
print(f"MAE  : {mae:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"R²   : {r2:.4f}")
print(f"Bias : {b:.4f}")
print("="*60)

# ===============================================================
# 8️⃣ Compare with sklearn's SVR
# ===============================================================
from sklearn.svm import SVR

print("\nTraining sklearn SVR for comparison...")
svr = SVR(kernel='rbf', C=10, epsilon=0.1, gamma=gamma)
svr.fit(X_train, y_train_s)
y_pred_sklearn = svr.predict(X_test)
y_pred_sklearn_inv = y_scaler.inverse_transform(y_pred_sklearn.reshape(-1, 1)).flatten()

print("\n" + "="*60)
print("   RESULTS: Sklearn SVR (Baseline)")
print("="*60)
print(f"MAE  : {mean_absolute_error(y_test, y_pred_sklearn_inv):.4f}")
print(f"RMSE : {np.sqrt(mean_squared_error(y_test, y_pred_sklearn_inv)):.4f}")
print(f"R²   : {r2_score(y_test, y_pred_sklearn_inv):.4f}")
print(f"Support vectors: {len(svr.support_)}/{len(y_train_s)}")
print("="*60)

# ===============================================================
# 9️⃣ Sample predictions
# ===============================================================
print("\n" + "="*60)
print("   Sample Predictions (First 10)")
print("="*60)
print(f"{'Actual':<12} {'Predicted':<12} {'Error':<12}")
print("-" * 60)
for i in range(min(10, len(y_test))):
    error = abs(y_test[i] - y_pred[i])
    print(f"{y_test[i]:<12.2f} {y_pred[i]:<12.2f} {error:<12.2f}")
print("="*60)

Dataset shape: (101400, 36)
Memory estimate for kernel matrix: 76.61 GB

⚠️ Training set too large (81120 samples)
   Subsampling to 5000 samples for memory efficiency
Training set size: 5000 samples
Test set size: 20280 samples

Kernel parameters:
  Weights: lin=0.2, rbf=0.6, poly=0.2
  Gamma: 0.024322
  Polynomial degree: 3

Computing kernel matrices...
✓ Kernel matrices computed successfully

Training SVR...
  Iteration 500/5000, diff: 10.000000
  Iteration 1000/5000, diff: 10.000000
  Iteration 1500/5000, diff: 10.000000
  Iteration 2000/5000, diff: 10.000000
  Iteration 2500/5000, diff: 10.000000
  Iteration 3000/5000, diff: 10.000000
  Iteration 3500/5000, diff: 10.000000
  Iteration 4000/5000, diff: 10.000000
  Iteration 4500/5000, diff: 10.000000
  Iteration 5000/5000, diff: 10.000000

⚠️ No support vectors found, using b=0

   RESULTS: Multi-Kernel SVR (Scratch Implementation)
MAE  : 149984.5236
RMSE : 156959.0869
R²   : -13851782939.2534
Bias : 0.0000

Training sklearn SVR fo

In [None]:
# ===============================================================
# Decision Tree Regressor (From Scratch)
# ===============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ===============================================================
# 1️⃣ Decision Tree Node Class
# ===============================================================
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature      # Feature index for splitting
        self.threshold = threshold  # Threshold value for splitting
        self.left = left           # Left child node
        self.right = right         # Right child node
        self.value = value         # Prediction value (for leaf nodes)

# ===============================================================
# 2️⃣ Decision Tree Regressor Class
# ===============================================================
class DecisionTreeRegressor:
    def __init__(self, max_depth=10, min_samples_split=2, min_samples_leaf=1, criterion='mse'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.criterion = criterion
        self.root = None
        self.n_features = None
        
    def fit(self, X, y):
        """Train the decision tree"""
        self.n_features = X.shape[1]
        self.root = self._grow_tree(X, y, depth=0)
        return self
    
    def _calculate_mse(self, y):
        """Calculate mean squared error"""
        if len(y) == 0:
            return 0
        mean = np.mean(y)
        return np.mean((y - mean) ** 2)
    
    def _calculate_mae(self, y):
        """Calculate mean absolute error"""
        if len(y) == 0:
            return 0
        median = np.median(y)
        return np.mean(np.abs(y - median))
    
    def _calculate_criterion(self, y):
        """Calculate splitting criterion"""
        if self.criterion == 'mse':
            return self._calculate_mse(y)
        elif self.criterion == 'mae':
            return self._calculate_mae(y)
        else:
            return self._calculate_mse(y)
    
    def _best_split(self, X, y):
        """Find the best split for a node"""
        best_gain = -float('inf')
        best_feature = None
        best_threshold = None
        
        parent_criterion = self._calculate_criterion(y)
        n_samples = len(y)
        
        # Try each feature
        for feature in range(self.n_features):
            thresholds = np.unique(X[:, feature])
            
            # Try each unique value as threshold
            for threshold in thresholds:
                # Split data
                left_mask = X[:, feature] <= threshold
                right_mask = ~left_mask
                
                # Check minimum samples constraint
                if np.sum(left_mask) < self.min_samples_leaf or np.sum(right_mask) < self.min_samples_leaf:
                    continue
                
                # Calculate weighted criterion for children
                y_left, y_right = y[left_mask], y[right_mask]
                n_left, n_right = len(y_left), len(y_right)
                
                left_criterion = self._calculate_criterion(y_left)
                right_criterion = self._calculate_criterion(y_right)
                
                weighted_criterion = (n_left / n_samples) * left_criterion + (n_right / n_samples) * right_criterion
                
                # Calculate information gain
                gain = parent_criterion - weighted_criterion
                
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        
        return best_feature, best_threshold, best_gain
    
    def _grow_tree(self, X, y, depth):
        """Recursively grow the decision tree"""
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))
        
        # Stopping criteria
        if (depth >= self.max_depth or 
            n_samples < self.min_samples_split or 
            n_labels == 1):
            leaf_value = np.mean(y)
            return Node(value=leaf_value)
        
        # Find best split
        best_feature, best_threshold, best_gain = self._best_split(X, y)
        
        # If no valid split found, create leaf
        if best_feature is None:
            leaf_value = np.mean(y)
            return Node(value=leaf_value)
        
        # Split data
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask
        
        # Recursively build left and right subtrees
        left_child = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_child = self._grow_tree(X[right_mask], y[right_mask], depth + 1)
        
        return Node(best_feature, best_threshold, left_child, right_child)
    
    def _predict_sample(self, x, node):
        """Predict single sample"""
        if node.value is not None:
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self._predict_sample(x, node.left)
        else:
            return self._predict_sample(x, node.right)
    
    def predict(self, X):
        """Predict for multiple samples"""
        return np.array([self._predict_sample(x, self.root) for x in X])
    
    def get_depth(self, node=None):
        """Get the depth of the tree"""
        if node is None:
            node = self.root
        
        if node.value is not None:
            return 0
        
        left_depth = self.get_depth(node.left) if node.left else 0
        right_depth = self.get_depth(node.right) if node.right else 0
        
        return 1 + max(left_depth, right_depth)
    
    def count_leaves(self, node=None):
        """Count the number of leaf nodes"""
        if node is None:
            node = self.root
        
        if node.value is not None:
            return 1
        
        left_leaves = self.count_leaves(node.left) if node.left else 0
        right_leaves = self.count_leaves(node.right) if node.right else 0
        
        return left_leaves + right_leaves

# ===============================================================
# 3️⃣ Load and preprocess dataset
# ===============================================================
data = pd.read_csv('Dataset-1.csv')

X = data.drop(columns=['BWEIGHT'])
y = data['BWEIGHT'].values

# Separate numerical and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Pipelines for preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Combine transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

print(f"Dataset shape: {X_processed.shape}")

# ===============================================================
# 4️⃣ Split data
# ===============================================================
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# ===============================================================
# 5️⃣ Train Decision Tree from Scratch
# ===============================================================
print("\n" + "="*60)
print("   Training Decision Tree Regressor (From Scratch)")
print("="*60)

# Initialize and train the tree
dt = DecisionTreeRegressor(
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    criterion='mse'
)

print("\nTraining...")
dt.fit(X_train, y_train)

print(f"✓ Training complete!")
print(f"  Tree depth: {dt.get_depth()}")
print(f"  Number of leaves: {dt.count_leaves()}")

# ===============================================================
# 6️⃣ Make predictions
# ===============================================================
print("\nMaking predictions...")
y_pred = dt.predict(X_test)

# ===============================================================
# 7️⃣ Performance evaluation
# ===============================================================
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n" + "="*60)
print("   RESULTS: Decision Tree Regressor (Scratch)")
print("="*60)
print(f"MAE  : {mae:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"R²   : {r2:.4f}")
print("="*60)

# ===============================================================
# 8️⃣ Compare with sklearn's Decision Tree
# ===============================================================
from sklearn.tree import DecisionTreeRegressor as SklearnDT

print("\n" + "="*60)
print("   Training sklearn Decision Tree for comparison")
print("="*60)

sklearn_dt = SklearnDT(
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)

sklearn_dt.fit(X_train, y_train)
y_pred_sklearn = sklearn_dt.predict(X_test)

print(f"✓ Training complete!")
print(f"  Tree depth: {sklearn_dt.get_depth()}")
print(f"  Number of leaves: {sklearn_dt.get_n_leaves()}")

mae_sklearn = mean_absolute_error(y_test, y_pred_sklearn)
rmse_sklearn = np.sqrt(mean_squared_error(y_test, y_pred_sklearn))
r2_sklearn = r2_score(y_test, y_pred_sklearn)

print("\n" + "="*60)
print("   RESULTS: sklearn Decision Tree (Baseline)")
print("="*60)
print(f"MAE  : {mae_sklearn:.4f}")
print(f"RMSE : {rmse_sklearn:.4f}")
print(f"R²   : {r2_sklearn:.4f}")
print("="*60)

# ===============================================================
# 9️⃣ Sample predictions
# ===============================================================
print("\n" + "="*60)
print("   Sample Predictions (First 10)")
print("="*60)
print(f"{'Actual':<12} {'Predicted':<12} {'Error':<12}")
print("-" * 60)
for i in range(min(10, len(y_test))):
    error = abs(y_test[i] - y_pred[i])
    print(f"{y_test[i]:<12.2f} {y_pred[i]:<12.2f} {error:<12.2f}")
print("="*60)

# ===============================================================
# 🔟 Feature Importance (using sklearn model)
# ===============================================================
print("\n" + "="*60)
print("   Top 10 Most Important Features")
print("="*60)

feature_importance = sklearn_dt.feature_importances_
feature_names = list(numeric_features) + list(categorical_features)

# Get top 10 features
top_indices = np.argsort(feature_importance)[-10:][::-1]
print(f"{'Rank':<6} {'Feature':<30} {'Importance':<12}")
print("-" * 60)
for rank, idx in enumerate(top_indices, 1):
    print(f"{rank:<6} {feature_names[idx]:<30} {feature_importance[idx]:.6f}")
print("="*60)

Dataset shape: (101400, 36)
Training set size: 81120 samples
Test set size: 20280 samples

   Training Decision Tree Regressor (From Scratch)

Training...
