In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [6]:
# Load the dataset (feature columns plus the `smoking` target column) and
# preview the first rows.
DATA_PATH = '/content/final_data_with_target.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,smoking,fasting blood sugar,systolic,ALT,waist(cm),LDL,Cholesterol,height(cm),AST,age,LDL_Cholesterol_interaction
0,1,0.030986,1.043324,-0.319201,-0.187355,-1.05631,-1.143588,-0.350112,-0.158976,0.47835,1.152845
1,0,2.190491,1.587198,-0.145223,1.701348,0.076314,0.055946,-0.464412,-0.213069,2.424556,0.002107
2,1,-1.273776,-0.336593,0.284668,-0.498541,-0.656059,-0.686137,0.917431,0.189659,-1.784261,0.483355
3,0,0.442923,0.905007,0.193462,1.550728,-0.385095,-0.471029,1.924878,-0.434846,-1.094118,0.229674
4,1,-0.5483,-0.116913,-0.632779,-0.566083,-1.050942,-1.141277,0.421247,-0.735424,-0.845604,1.122349


In [7]:
# Separate the target column from the predictors, then preview the predictors.
y = df['smoking']
X = df.drop(columns="smoking")
X.head()

Unnamed: 0,fasting blood sugar,systolic,ALT,waist(cm),LDL,Cholesterol,height(cm),AST,age,LDL_Cholesterol_interaction
0,0.030986,1.043324,-0.319201,-0.187355,-1.05631,-1.143588,-0.350112,-0.158976,0.47835,1.152845
1,2.190491,1.587198,-0.145223,1.701348,0.076314,0.055946,-0.464412,-0.213069,2.424556,0.002107
2,-1.273776,-0.336593,0.284668,-0.498541,-0.656059,-0.686137,0.917431,0.189659,-1.784261,0.483355
3,0.442923,0.905007,0.193462,1.550728,-0.385095,-0.471029,1.924878,-0.434846,-1.094118,0.229674
4,-0.5483,-0.116913,-0.632779,-0.566083,-1.050942,-1.141277,0.421247,-0.735424,-0.845604,1.122349


In [8]:
# Step 1: keep 70% of the rows for training and hold out the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Step 2: cut the held-out 30% in half -> validation and test sets of equal size.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


In [9]:
# Preview the first training rows; the index shows the original (shuffled) row labels.
X_train.head()

Unnamed: 0,fasting blood sugar,systolic,ALT,waist(cm),LDL,Cholesterol,height(cm),AST,age,LDL_Cholesterol_interaction
73792,-0.481194,-0.836532,-1.372954,-1.443513,0.795673,0.954629,-1.159852,-1.245986,0.406783,0.672331
5657,-0.498343,-1.380191,-0.43539,-0.478783,1.042256,1.131609,0.272222,-0.661766,-0.861685,1.277479
154300,-0.212797,-0.291774,1.457645,0.758284,-0.287905,-0.320558,0.910114,1.43359,-0.809643,-0.271931
78825,-1.13925,-0.646015,-0.710224,-0.831676,1.034067,1.159918,0.385057,-0.966703,-1.375983,1.290551
51389,0.113724,-0.466697,-1.001626,-0.011323,-0.873875,-0.964815,0.881697,-1.394758,-0.781417,0.576664


In [10]:
# Preview the training labels; their index should line up with X_train.head().
y_train.head()

73792     0
5657      1
154300    1
78825     0
51389     0
Name: smoking, dtype: int64

In [11]:
# Print the shape of every split, one per line, in the order
# train -> validation -> test (features first, then labels).
print(
    *(part.shape for part in (X_train, y_train, X_val, y_val, X_test, y_test)),
    sep="\n",
)

(110497, 10)
(110497,)
(23678, 10)
(23678,)
(23678, 10)
(23678,)


<h1>Bagging</h1>

This section defines helper functions that create a bootstrap sample (a data sample of the same size drawn with repetition), train a decision tree on a bootstrap sample, and predict with a trained tree. The bagging function first uses grid search to find the best max depth, then trains multiple decision trees with that best max depth, each on its own bootstrap sample, and lets them vote. Lastly, it evaluates the performance of the bagged ensemble on the validation set.


In [None]:

def create_bootstrap_sample(X, y):
    """Draw a bootstrap replicate of ``(X, y)``.

    ``len(X)`` row positions are sampled with replacement from NumPy's global
    random generator, and the same positions are taken from both ``X`` and
    ``y`` so features and labels stay aligned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows (indexed positionally via ``.iloc``).
    y : pandas.Series
        Labels, positionally aligned with ``X``.

    Returns
    -------
    tuple
        ``(X_sampled, y_sampled)``, both re-indexed from 0.
    """
    n_rows = len(X)
    picked = np.random.choice(n_rows, n_rows, replace=True)
    X_boot = X.iloc[picked].reset_index(drop=True)
    y_boot = y.iloc[picked].reset_index(drop=True)
    return X_boot, y_boot


def train_decision_tree(X, y, max_depth=None):
    """Fit and return a ``DecisionTreeClassifier`` on ``(X, y)``.

    Parameters
    ----------
    X, y
        Training features and labels, passed straight to ``fit``.
    max_depth : int or None, optional
        Forwarded to ``DecisionTreeClassifier(max_depth=...)``.

    Returns
    -------
    DecisionTreeClassifier
        The fitted estimator (``fit`` returns the estimator itself).
    """
    return DecisionTreeClassifier(max_depth=max_depth).fit(X, y)


def predict_decision_tree(tree, X):
    """Return whatever ``tree.predict`` yields for the rows of ``X``."""
    predicted_labels = tree.predict(X)
    return predicted_labels


def bagging_with_decision_trees(X_train, y_train, X_val, y_val, num_trees):
    """Tune the tree depth by cross-validation, then bag ``num_trees`` trees.

    Steps:
      1. 5-fold ``GridSearchCV`` over ``max_depth`` on the full training set.
      2. Train ``num_trees`` decision trees with the selected depth, each on
         its own bootstrap sample of the training set.
      3. Majority-vote the trees on the validation set and print the accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features / labels (pandas objects; bootstrap sampling uses
        ``.iloc``).
    X_val, y_val
        Validation features / labels used to score the ensemble.
    num_trees : int
        Number of bootstrap trees to train; must be at least 1.

    Returns
    -------
    tuple
        ``(trees, best_max_depth, accuracy)``: the fitted trees, the depth
        chosen by the grid search, and the validation accuracy of the vote.
        (The original version only printed these, so later cells had to
        retrain and hard-code the depth.)

    Raises
    ------
    ValueError
        If ``num_trees`` is smaller than 1.
    """
    if num_trees < 1:
        raise ValueError("num_trees must be at least 1")

    param_grid = {'max_depth': [None, 5, 10, 15, 20]}

    grid_search = GridSearchCV(estimator=DecisionTreeClassifier(),
                               param_grid=param_grid,
                               scoring='accuracy',
                               cv=5)
    grid_search.fit(X_train, y_train)
    best_max_depth = grid_search.best_params_['max_depth']

    trees = []
    for _ in range(num_trees):
        X_bootstrap, y_bootstrap = create_bootstrap_sample(X_train, y_train)
        trees.append(train_decision_tree(X_bootstrap, y_bootstrap, max_depth=best_max_depth))

    # One row of predictions per tree.
    predictions = np.array([predict_decision_tree(tree, X_val) for tree in trees])

    # Majority vote: mean of the per-tree votes, rounded to the nearest label.
    # Assumes labels are encoded as 0/1. NumPy rounds exact halves to the
    # nearest even value, so a perfect tie resolves to class 0.
    final_prediction = np.mean(predictions, axis=0).round()

    accuracy = accuracy_score(y_val, final_prediction)
    print(f"Bagged Ensemble Accuracy: {accuracy}")
    print(f"Best max_depth from grid search: {best_max_depth}")
    return trees, best_max_depth, accuracy


# Seed NumPy's global generator so the bootstrap draws are repeatable.
np.random.seed(500)
num_trees = 50

# Keep the results in named variables; leaving the call as a bare last
# expression would dump the whole list of trees into the cell output.
bagged_trees, bagging_best_max_depth, bagging_val_accuracy = bagging_with_decision_trees(
    X_train, y_train, X_val, y_val, num_trees
)

evaluating bagged ensemble on test data

In [None]:
def evaluate_on_test_set_bagging(ensemble, X_test, y_test):
    """Majority-vote ``ensemble`` on the test set and report its accuracy.

    Parameters
    ----------
    ensemble : iterable
        Fitted trees; each is queried through ``predict_decision_tree``.
    X_test, y_test
        Test features and labels.

    Returns
    -------
    float
        Accuracy of the voted prediction on ``(X_test, y_test)``. The value
        is returned because assigning it to a name inside the function (as
        the original ``a=accuracy`` did) only creates a local variable and
        never reaches the notebook namespace.
    """
    predictions = np.array([predict_decision_tree(tree, X_test) for tree in ensemble])

    # Mean vote rounded to the nearest label (assumes 0/1 labels).
    final_prediction = np.mean(predictions, axis=0).round()
    accuracy = accuracy_score(y_test, final_prediction)
    print(f"Accuracy on the Test Set: {accuracy}")
    return accuracy


# NOTE(review): hard-coded depth, presumably copied from the grid-search
# printout of the previous cell; confirm it still matches after a re-run.
best_max_depth = 5
ensemble_for_test = []
for _ in range(num_trees):
    X_bootstrap, y_bootstrap = create_bootstrap_sample(X_train, y_train)
    tree = train_decision_tree(X_bootstrap, y_bootstrap, max_depth=best_max_depth)
    ensemble_for_test.append(tree)

# `a` is the bagging test accuracy used by the final comparison cell.
a = evaluate_on_test_set_bagging(ensemble_for_test, X_test, y_test)

This section defines helper functions that create a bootstrap sample, train a decision tree on a bootstrap sample with random feature selection at each split, and predict with a trained tree. The random forest function runs a grid search over max depth and the maximum number of features considered per split: for each pair it trains multiple decision trees and lets them vote, then keeps the best pair. Lastly, it evaluates the performance of the ensemble on the validation set.

<h1>Random Forest</h1>

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def create_bootstrap_sample(X, y):
    """Resample ``X`` and ``y`` with replacement to the original size.

    NOTE: a function with the same name and logic is already defined in the
    Bagging section of this notebook; this definition shadows it.

    Row positions come from NumPy's global random generator and are applied
    to both inputs, so each sampled feature row keeps its label.

    Returns
    -------
    tuple
        ``(X_sampled, y_sampled)`` with a fresh 0..n-1 index.
    """
    size = len(X)
    row_positions = np.random.choice(size, size, replace=True)
    resampled = [part.iloc[row_positions].reset_index(drop=True) for part in (X, y)]
    return resampled[0], resampled[1]


def train_decision_tree(X, y, max_depth=None, max_features=None):
    """Fit and return a decision tree with optional depth / feature limits.

    Parameters
    ----------
    X, y
        Training features and labels, passed straight to ``fit``.
    max_depth : int or None, optional
        Forwarded to ``DecisionTreeClassifier(max_depth=...)``.
    max_features : int, str or None, optional
        Forwarded to ``DecisionTreeClassifier(max_features=...)``.

    Returns
    -------
    DecisionTreeClassifier
        The fitted estimator (``fit`` returns the estimator itself).
    """
    settings = {"max_depth": max_depth, "max_features": max_features}
    return DecisionTreeClassifier(**settings).fit(X, y)


def predict_decision_tree(tree, X):
    """Delegate to ``tree.predict`` and hand back its result unchanged.

    NOTE: a function with the same name and logic is already defined in the
    Bagging section of this notebook; this definition shadows it.
    """
    labels = tree.predict(X)
    return labels


def random_forest_with_grid_search(X_train, y_train, X_val, y_val, num_trees):
    """Search ``max_depth`` x ``max_features`` for a hand-rolled random forest.

    For every candidate pair, ``num_trees`` trees are trained on independent
    bootstrap samples, majority-voted on the validation set, and the pair
    with the highest validation accuracy is kept (the first pair wins ties).

    Parameters
    ----------
    X_train, y_train
        Training features / labels (pandas objects; bootstrap sampling uses
        ``.iloc``).
    X_val, y_val
        Validation features / labels used to score each candidate.
    num_trees : int
        Trees per candidate forest; must be at least 1.

    Returns
    -------
    tuple
        ``(best_max_depth, best_max_features, best_accuracy)``. The original
        version only printed these, so the following cell had to hard-code
        them by hand.

    Raises
    ------
    ValueError
        If ``num_trees`` is smaller than 1.
    """
    if num_trees < 1:
        raise ValueError("num_trees must be at least 1")

    max_depth_values = [None, 5, 10, 15, 20]
    max_features_values = ['sqrt', 'log2', 5]

    best_max_depth = None
    best_max_features = None
    best_accuracy = 0.0

    for max_depth in max_depth_values:
        for max_features in max_features_values:
            # Start from an empty forest for every candidate pair.
            trees = []
            for _ in range(num_trees):
                X_bootstrap, y_bootstrap = create_bootstrap_sample(X_train, y_train)
                trees.append(
                    train_decision_tree(
                        X_bootstrap, y_bootstrap,
                        max_depth=max_depth, max_features=max_features,
                    )
                )

            predictions = np.array([predict_decision_tree(tree, X_val) for tree in trees])
            # Mean vote rounded to the nearest label (assumes 0/1 labels).
            final_prediction = np.mean(predictions, axis=0).round()
            accuracy = accuracy_score(y_val, final_prediction)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_max_depth = max_depth
                best_max_features = max_features

    print(f"Best max_depth: {best_max_depth}")
    print(f"Best max_features: {best_max_features}")
    print(f"Best Accuracy: {best_accuracy}")
    return best_max_depth, best_max_features, best_accuracy


# Seed NumPy's global generator so the bootstrap draws are repeatable.
np.random.seed(500)

num_trees = 50
rf_best_max_depth, rf_best_max_features, rf_best_val_accuracy = random_forest_with_grid_search(
    X_train, y_train, X_val, y_val, num_trees
)


evaluating best pair results on test data


In [None]:
def random_forest_with_params(X_train, y_train, X_val, y_val, X_test, y_test, num_trees, best_max_depth, best_max_features):
    """Train a bootstrap forest with fixed settings and score it.

    ``num_trees`` trees are trained on independent bootstrap samples of the
    training set with the given ``max_depth`` / ``max_features``; the
    majority vote is then scored on the validation and the test set, and
    both accuracies are printed.

    Parameters
    ----------
    X_train, y_train
        Training features / labels.
    X_val, y_val
        Validation features / labels.
    X_test, y_test
        Test features / labels.
    num_trees : int
        Number of trees in the forest.
    best_max_depth, best_max_features
        Settings forwarded to ``train_decision_tree``.

    Returns
    -------
    float
        Test-set accuracy. It is returned because assigning it to a name
        inside the function (as the original ``b=accuracy_test`` did) only
        creates a local variable and never reaches the notebook namespace.
    """
    trees = []
    for _ in range(num_trees):
        X_bootstrap, y_bootstrap = create_bootstrap_sample(X_train, y_train)
        trees.append(
            train_decision_tree(
                X_bootstrap, y_bootstrap,
                max_depth=best_max_depth, max_features=best_max_features,
            )
        )

    # Mean vote rounded to the nearest label (assumes 0/1 labels).
    predictions_val = np.array([predict_decision_tree(tree, X_val) for tree in trees])
    final_prediction_val = np.mean(predictions_val, axis=0).round()
    accuracy_val = accuracy_score(y_val, final_prediction_val)
    print(f"Validation Set Accuracy: {accuracy_val}")

    predictions_test = np.array([predict_decision_tree(tree, X_test) for tree in trees])
    final_prediction_test = np.mean(predictions_test, axis=0).round()
    accuracy_test = accuracy_score(y_test, final_prediction_test)
    print(f"Test Set Accuracy: {accuracy_test}")
    return accuracy_test


# NOTE(review): hard-coded settings, presumably copied from the grid-search
# printout of the previous cell; confirm they still match after a re-run.
best_max_depth = 10
best_max_features = 'log2'

# `b` is the random-forest test accuracy used by the final comparison cell.
b = random_forest_with_params(X_train, y_train, X_val, y_val, X_test, y_test, num_trees, best_max_depth, best_max_features)

The main idea of boosting is to train n weak learners one after another, letting each new weak learner focus on the data misclassified by the previous models: the weights of those misclassified samples are increased for the next learner, with the size of the update controlled by the learning rate.
We use randomized search to find the optimum learning rate and, after finding it, we evaluate the model on the test set.


<h1>Boosting</h1>

In [None]:
class SimpleBoosting:
    """AdaBoost-style ensemble of depth-1 decision trees for binary labels.

    The two class labels (e.g. 0/1) are mapped internally to -1/+1, which is
    the encoding the exponential weight update ``exp(-lr * alpha * y * h)``
    requires: ``y * h`` is +1 for a correct prediction and -1 for a mistake.
    With raw 0/1 labels that product is 0 for every misclassified sample, so
    mistakes would never be up-weighted. ``predict`` maps the weighted vote
    back to the original labels.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds (one stump per round).
    learning_rate : float
        Scales the exponent of the sample-weight update. It does not scale
        the per-stump vote weights stored in ``alphas``.
    """

    def __init__(self, n_estimators=30, learning_rate=0.1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.models = []
        self.alphas = []
        # Sorted pair of original labels; set by fit().
        self.classes_ = None

    def fit(self, X, y, weights=None):
        """Fit ``n_estimators`` stumps on re-weighted versions of ``(X, y)``.

        Parameters
        ----------
        X
            Training features.
        y
            Training labels with exactly two distinct values.
        weights : array-like or None, optional
            Initial sample weights. They are copied and normalised to sum to
            one, so the caller's array is never modified. ``None`` means
            uniform weights.

        Returns
        -------
        SimpleBoosting
            ``self``, to allow call chaining.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two classes, or ``weights``
            does not have one entry per sample.
        """
        # Work on a plain array: avoids pandas index alignment against the
        # NumPy predictions.
        labels = np.asarray(y)
        classes = np.unique(labels)
        if classes.size != 2:
            raise ValueError(
                f"SimpleBoosting needs exactly two classes, got {classes.size}"
            )
        y_signed = np.where(labels == classes[1], 1, -1)
        n_samples = len(y_signed)

        if weights is None:
            sample_weights = np.full(n_samples, 1.0 / n_samples)
        else:
            sample_weights = np.array(weights, dtype=float)  # copy, never alias
            if sample_weights.shape != (n_samples,):
                raise ValueError("weights must have one entry per sample")
            sample_weights /= np.sum(sample_weights)

        # Start from scratch so calling fit twice does not stack ensembles.
        self.classes_ = classes
        self.models = []
        self.alphas = []

        for _ in range(self.n_estimators):
            weak_model = DecisionTreeClassifier(max_depth=1)
            weak_model.fit(X, y_signed, sample_weight=sample_weights)
            predictions = weak_model.predict(X)

            weighted_error = np.sum(sample_weights * (predictions != y_signed))
            # Keep the error strictly inside (0, 1) so the log stays finite.
            weighted_error = min(max(weighted_error, 1e-10), 1 - 1e-10)
            alpha = 0.5 * np.log((1 - weighted_error) / weighted_error)

            # Mistakes (y * h == -1) grow, correct samples (y * h == +1) shrink.
            sample_weights = sample_weights * np.exp(
                -self.learning_rate * alpha * y_signed * predictions
            )
            sample_weights /= np.sum(sample_weights)

            self.models.append(weak_model)
            self.alphas.append(alpha)

        return self

    def predict(self, X):
        """Predict original class labels by an alpha-weighted vote.

        Returns the larger of the two labels seen in ``fit`` where the
        weighted sum of -1/+1 stump votes is positive, and the smaller label
        otherwise (including exact ties).

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.models or self.classes_ is None:
            raise ValueError("SimpleBoosting must be fitted before calling predict")
        score = np.sum(
            [alpha * model.predict(X) for alpha, model in zip(self.alphas, self.models)],
            axis=0,
        )
        return np.where(score > 0, self.classes_[1], self.classes_[0])

def evaluate_on_test_set(ensemble, X_test, y_test):
    """Score ``ensemble`` on the test set, print and return the accuracy.

    Parameters
    ----------
    ensemble
        Fitted model exposing ``predict(X)``.
    X_test, y_test
        Test features and labels.

    Returns
    -------
    float
        Test-set accuracy. It is returned because assigning it to a name
        inside the function (as the original ``c=accuracy`` did) only
        creates a local variable and never reaches the notebook namespace.
    """
    predictions_test = ensemble.predict(X_test)
    accuracy = accuracy_score(y_test, predictions_test)
    print("Test Set Accuracy:", accuracy)
    return accuracy


def randomized_search(X_train, y_train, X_val, y_val, n_trials=5):
    """Random search over the boosting learning rate, then test the winner.

    Each trial draws a learning rate uniformly from [0.01, 0.5), fits a
    30-round ``SimpleBoosting`` model from uniform sample weights and scores
    it on the validation set. The best rate is then used to refit a model,
    which is evaluated on the test set.

    NOTE: the test evaluation reads ``X_test`` / ``y_test`` from the
    notebook's global namespace; they are not parameters of this function.

    Parameters
    ----------
    X_train, y_train
        Training features / labels.
    X_val, y_val
        Validation features / labels used to rank learning rates.
    n_trials : int, optional
        Number of learning rates to try; must be at least 1.

    Returns
    -------
    float
        Test-set accuracy of the model refitted with the best learning rate.

    Raises
    ------
    ValueError
        If ``n_trials`` is smaller than 1.
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    # Start below any reachable accuracy so the first trial is always
    # recorded; starting at 0 with a strict '>' could leave the best
    # learning rate unset.
    best_accuracy = -1.0
    best_learning_rate = None

    for _ in range(n_trials):
        learning_rate = np.random.uniform(0.01, 0.5)
        simple_boosting = SimpleBoosting(n_estimators=30, learning_rate=learning_rate)
        # Fresh uniform weights for every trial.
        weights = np.ones(len(X_train)) / len(X_train)
        simple_boosting.fit(X_train, y_train, weights)

        predictions_val = simple_boosting.predict(X_val)
        accuracy_val = accuracy_score(y_val, predictions_val)

        print(f"Learning Rate: {learning_rate}, Validation Set Accuracy: {accuracy_val}")
        if accuracy_val > best_accuracy:
            best_accuracy = accuracy_val
            best_learning_rate = learning_rate

    print("\nBest Learning Rate:", best_learning_rate)

    simple_boosting_best = SimpleBoosting(n_estimators=30, learning_rate=best_learning_rate)
    weights_best = np.ones(len(X_train)) / len(X_train)
    simple_boosting_best.fit(X_train, y_train, weights_best)

    return evaluate_on_test_set(simple_boosting_best, X_test, y_test)


# `c` is the boosting test accuracy used by the final comparison cell.
c = randomized_search(X_train, y_train, X_val, y_val, n_trials=5)

In [None]:
# Pick the method with the highest test accuracy. The previous chained
# comparisons (a < b < c / a > b > c / else) only recognised strictly
# increasing or decreasing orderings and reported "Random forest" for every
# other case, e.g. when bagging was best but boosting beat random forest.
# Expects a, b, c to hold the bagging, random-forest and boosting test
# accuracies respectively.
test_accuracies = {
    "Bagging": a,
    "Random forest": b,
    "Boosting": c,
}
# On an exact tie the method listed first above wins.
best_method = max(test_accuracies, key=test_accuracies.get)
print(f"{best_method} is optimal")


a: accuracy of the bagging ensemble with the best params on the test set<br>
b: accuracy of the random forest with the best params on the test set<br>
c: accuracy of boosting with the best params on the test set