In [13]:
import pandas as pd
import numpy as np
from math import log

# Load dataset
wine_data = pd.read_csv('winedata.csv', delimiter=';')
X = wine_data.drop(columns='quality').values
y = wine_data['quality'].values

In [14]:
# Define Linear Regression model
class CustomLinearRegression:
    def __init__(self):
        self.coef_ = None
        self.intercept_ = 0

    def fit(self, X, y):
        # Adding bias term for intercept
        X_bias = np.c_[np.ones(X.shape[0]), X]
        # Normal equation: (X.T * X)^(-1) * X.T * y
        XtX_inv = np.linalg.inv(X_bias.T.dot(X_bias))
        self.coef_ = XtX_inv.dot(X_bias.T).dot(y)
        self.intercept_ = self.coef_[0]
        self.coef_ = self.coef_[1:]

    def predict(self, X):
        return X.dot(self.coef_) + self.intercept_

# Define k-Fold Cross-Validation function
def custom_k_fold_cross_validation(X, y, model, k=5):
    np.random.seed(42)
    indices = np.random.permutation(len(X))
    fold_size = len(X) // k
    mse_scores = []

    for i in range(k):
        val_indices = indices[i * fold_size : (i + 1) * fold_size]
        train_indices = np.setdiff1d(indices, val_indices)

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mse_scores.append(np.mean((y_val - y_pred) ** 2))

    return np.mean(mse_scores), np.std(mse_scores)

# Define Bootstrapping function
def custom_bootstrap_model_selection(X, y, model, n_iterations=100):
    mse_scores = []
    np.random.seed(42)

    for _ in range(n_iterations):
        # Bootstrap resample
        resample_indices = np.random.choice(range(len(X)), size=len(X), replace=True)
        X_resample, y_resample = X[resample_indices], y[resample_indices]

        model.fit(X_resample, y_resample)
        y_pred = model.predict(X)
        mse_scores.append(np.mean((y - y_pred) ** 2))

    return np.mean(mse_scores), np.std(mse_scores)

# Define AIC calculation function
def calculate_aic(X, y, model):
    model.fit(X, y)
    y_pred = model.predict(X)
    mse = np.mean((y - y_pred) ** 2)
    n = len(y)
    k = X.shape[1]
    aic = n * log(mse) + 2 * k
    return aic



In [15]:
import numpy as np

# Custom Evaluation metrics for classification
def calculate_accuracy(y_true, y_pred):
    correct = np.sum(y_true == y_pred)
    return correct / len(y_true)

def calculate_precision(y_true, y_pred):
    true_positive = np.sum((y_pred == 1) & (y_true == 1))
    predicted_positive = np.sum(y_pred == 1)
    return true_positive / predicted_positive if predicted_positive != 0 else 0

def calculate_recall(y_true, y_pred):
    true_positive = np.sum((y_pred == 1) & (y_true == 1))
    actual_positive = np.sum(y_true == 1)
    return true_positive / actual_positive if actual_positive != 0 else 0

def calculate_f1_score(precision, recall):
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

In [16]:
# Evaluation metrics for regression
def evaluate_metrics(y_true, y_pred, task_type='regression'):
    if task_type == 'regression':
        # Regression metrics
        mse = np.mean((y_true - y_pred) ** 2)
        rmse = np.sqrt(mse)
        mae = np.mean(np.abs(y_true - y_pred))
        r2 = 1 - (np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))
        
        print("Regression Metrics:")
        print("Mean Squared Error (MSE):", mse)
        print("Root Mean Squared Error (RMSE):", rmse)
        print("Mean Absolute Error (MAE):", mae)
        print("R^2 Score:", r2)
        return {'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2}
    

    elif task_type == 'classification':
        # Classification metrics
        accuracy = calculate_accuracy(y_true, y_pred)
        precision = calculate_precision(y_true, y_pred)
        recall = calculate_recall(y_true, y_pred)
        f1 = calculate_f1_score(precision, recall)

        print("\nClassification Metrics:")
        print("Accuracy:", accuracy)
        print("Precision (Weighted):", precision)
        print("Recall (Weighted):", recall)
        print("F1 Score (Weighted):", f1)
        
        metrics = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}
        
        return metrics

In [17]:
# Initialize and evaluate a regression model
linear_model = CustomLinearRegression()

# k-Fold Cross-Validation for Regression
k_fold_mean_mse, k_fold_std_mse = custom_k_fold_cross_validation(X, y, linear_model, k=5)
print("k-Fold Cross-Validation Mean MSE:", k_fold_mean_mse)
print("k-Fold Cross-Validation Std MSE:", k_fold_std_mse)

# Bootstrapping for Regression
bootstrap_mean_mse, bootstrap_std_mse = custom_bootstrap_model_selection(X, y, linear_model, n_iterations=100)
print("Bootstrapping Mean MSE:", bootstrap_mean_mse)
print("Bootstrapping Std MSE:", bootstrap_std_mse)

# Regression metrics on entire dataset
linear_model.fit(X, y)
y_pred = linear_model.predict(X)
evaluate_metrics(y, y_pred, task_type='regression')


k-Fold Cross-Validation Mean MSE: 0.5675070238740973
k-Fold Cross-Validation Std MSE: 0.024211861662026154
Bootstrapping Mean MSE: 0.5652007623900872
Bootstrapping Std MSE: 0.0009890211204771238
Regression Metrics:
Mean Squared Error (MSE): 0.5631540629886554
Root Mean Squared Error (RMSE): 0.7504359153109981
Mean Absolute Error (MAE): 0.5836349500197301
R^2 Score: 0.28187036413328725


{'MSE': 0.5631540629886554,
 'RMSE': 0.7504359153109981,
 'MAE': 0.5836349500197301,
 'R2': 0.28187036413328725}

In [18]:
import numpy as np
from collections import Counter

# Custom K-Fold Cross-Validation Implementation
def custom_k_fold_split(X, y, k=5, random_state=42):
    np.random.seed(random_state)
    indices = np.random.permutation(len(X))
    fold_size = len(X) // k
    folds = []

    for i in range(k):
        val_indices = indices[i * fold_size: (i + 1) * fold_size]
        train_indices = np.setdiff1d(indices, val_indices)
        folds.append((train_indices, val_indices))
    
    return folds

# Custom Decision Tree (simple version, as Random Forests require a set of trees)
class CustomDecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def predict(self, X):
        return np.array([self._predict_row(row, self.tree) for row in X])

    def _build_tree(self, X, y, depth):
        # Base cases for stopping
        if len(set(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return Counter(y).most_common(1)[0][0]

        # Random feature selection for split (simplified for demonstration)
        feature_idx = np.random.choice(X.shape[1])
        median = np.median(X[:, feature_idx])

        left_idx = X[:, feature_idx] <= median
        right_idx = X[:, feature_idx] > median

        if sum(left_idx) == 0 or sum(right_idx) == 0:  # Prevent split if one side is empty
            return Counter(y).most_common(1)[0][0]

        return {
            "feature": feature_idx,
            "threshold": median,
            "left": self._build_tree(X[left_idx], y[left_idx], depth + 1),
            "right": self._build_tree(X[right_idx], y[right_idx], depth + 1)
        }

    def _predict_row(self, row, tree):
        if not isinstance(tree, dict):  # If the tree is a leaf node
            return tree
        feature, threshold = tree["feature"], tree["threshold"]
        if row[feature] <= threshold:
            return self._predict_row(row, tree["left"])
        else:
            return self._predict_row(row, tree["right"])


In [19]:
# Custom Random Forest Classifier
class CustomRandomForest:
    def __init__(self, n_estimators=10, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_estimators):
            # Bootstrap sampling
            indices = np.random.choice(range(len(X)), size=len(X), replace=True)
            X_sample, y_sample = X[indices], y[indices]
            tree = CustomDecisionTree(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        # Collect predictions from each tree
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Majority voting
        majority_votes = [Counter(tree_preds[:, i]).most_common(1)[0][0] for i in range(X.shape[0])]
        return np.array(majority_votes)

# Testing with custom K-Fold and Random Forest on provided data
def custom_random_forest_k_fold(X, y, n_splits=5, n_estimators=10, max_depth=None):
    folds = custom_k_fold_split(X, y, k=n_splits)
    y_true_all, y_pred_all = [], []

    for train_idx, val_idx in folds:
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Initialize and train Custom Random Forest
        clf = CustomRandomForest(n_estimators=n_estimators, max_depth=max_depth)
        clf.fit(X_train, y_train)
        
        # Make predictions and accumulate results
        y_pred = clf.predict(X_val)
        y_true_all.extend(y_val)
        y_pred_all.extend(y_pred)

    return np.array(y_true_all), np.array(y_pred_all)


In [21]:
y_class = (y >= 6).astype(int)
# Convert data to numpy arrays if necessary
X = np.array(X)
y_class = np.array(y_class)

# Run custom K-Fold with Custom Random Forest
y_true_all, y_pred_all = custom_random_forest_k_fold(X, y_class, n_splits=10, n_estimators=10, max_depth=5)

evaluate_metrics(y_true_all, y_pred_all, task_type='classification')


Classification Metrics:
Accuracy: 0.7120654396728017
Precision (Weighted): 0.715219421101774
Recall (Weighted): 0.942189421894219
F1 Score (Weighted): 0.8131634819532909


{'Accuracy': 0.7120654396728017,
 'Precision': 0.715219421101774,
 'Recall': 0.942189421894219,
 'F1 Score': 0.8131634819532909}

#### Testing our custom random forest k fold with other dataset


In [22]:
file_path = "iris.csv"
iris_data = pd.read_csv(file_path)

iris_data.head

<bound method NDFrame.head of      sepal_length  sepal_width  petal_length  petal_width    species
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica

[150 rows x 5 columns]>

In [23]:
X_newdataset = iris_data.drop(columns='species').values
y_newdataset = iris_data['species'].values


y_true_all, y_pred_all = custom_random_forest_k_fold(X_newdataset, y_newdataset, n_splits=3, n_estimators=10, max_depth=5)

print("Accuracy of new model using our random forest k fold\n")
calculate_accuracy(y_true_all, y_pred_all)

Accuracy of new model using our random forest k fold



0.9333333333333333

#### Testing our custom linear regression k fold with other dataset


In [24]:
file_path = "housing.csv"
housing_data = pd.read_csv(file_path)

housing_data.head

<bound method NDFrame.head of         RM  LSTAT  PTRATIO      MEDV
0    6.575   4.98     15.3  504000.0
1    6.421   9.14     17.8  453600.0
2    7.185   4.03     17.8  728700.0
3    6.998   2.94     18.7  701400.0
4    7.147   5.33     18.7  760200.0
..     ...    ...      ...       ...
484  6.593   9.67     21.0  470400.0
485  6.120   9.08     21.0  432600.0
486  6.976   5.64     21.0  501900.0
487  6.794   6.48     21.0  462000.0
488  6.030   7.88     21.0  249900.0

[489 rows x 4 columns]>

In [25]:
X_housing = housing_data.drop(columns='MEDV').values
y_housing = housing_data['MEDV'].values

linear_model.fit(X_housing, y_housing)
y_pred_housing = linear_model.predict(X_housing)
evaluate_metrics(y_housing, y_pred_housing, task_type='regression')

Regression Metrics:
Mean Squared Error (MSE): 7703545538.8708105
Root Mean Squared Error (RMSE): 87769.84413151712
Mean Absolute Error (MAE): 65458.43964023349
R^2 Score: 0.7176275212982739


{'MSE': 7703545538.8708105,
 'RMSE': 87769.84413151712,
 'MAE': 65458.43964023349,
 'R2': 0.7176275212982739}