In [None]:
import random
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from itertools import combinations,product
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score



Load the processed dataset

In [None]:
processed_data_path = "processed_data.csv"
df = pd.read_csv(processed_data_path)

Separate features (X) and target (y)

In [None]:
target_column = 'smoking'
X = df.drop(columns=[target_column])  # Features
y = df[target_column]

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

For enhancing the reliability and precision of predictive models

In [None]:
# Manual Bagging Implementation
def manual_bagging(X_train, y_train, X_test, y_test, n_estimators=10,max_depth=None, random_seed=42):

    # Check and handle missing values in y_train
    if y_train.isnull().any():
        valid_indices = y_train.notnull()
        X_train = X_train[valid_indices]
        y_train = y_train[valid_indices]

    np.random.seed(random_seed)  # Set random seed for reproducibility
    estimators = []

    for i in range(n_estimators):
        # Bootstrap sampling
        indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
        X_sample = X_train.iloc[indices]
        y_sample = y_train.iloc[indices]

        # Train a decision tree as base estimator
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_seed + i)
        tree.fit(X_sample, y_sample)
        estimators.append(tree)

    # Aggregate predictions
    predictions = np.zeros((X_test.shape[0], n_estimators))
    for i, tree in enumerate(estimators):
        predictions[:, i] = tree.predict(X_test)

    # Majority vote
    final_predictions = [np.bincount(row.astype(int)).argmax() for row in predictions]

    # Evaluate the model
    accuracy = accuracy_score(y_test, final_predictions)
    report = classification_report(y_test, final_predictions)
    confusion = confusion_matrix(y_test, final_predictions)

    # Print evaluation results
    print("\nBagging Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", confusion)

    return final_predictions


# Example usage:
y_pred_bagging = manual_bagging(X_train, y_train, X_test, y_test, n_estimators=10)



Bagging Results:
Accuracy: 0.7102
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.77      0.75     13327
           1       0.69      0.63      0.66     10562

    accuracy                           0.71     23889
   macro avg       0.71      0.70      0.70     23889
weighted avg       0.71      0.71      0.71     23889

Confusion Matrix:
 [[10317  3010]
 [ 3914  6648]]


In [None]:
def manual_boosting(X_train, y_train, X_test, y_test, n_estimators=50,max_depth=1):
    n_samples = X_train.shape[0]
    weights = np.ones(n_samples) / n_samples  # Initialize weights to 1/N for all samples

    models = []
    model_weights = []

    # Convert labels to {-1, 1} for boosting
    y_train = 2 * y_train - 1  # This converts 0s to -1s for boosting logic

    for _ in range(n_estimators):
        # Fit a weak learner with weighted samples
        model = DecisionTreeClassifier(max_depth=max_depth)
        model.fit(X_train, y_train, sample_weight=weights)
        predictions = model.predict(X_train)

        # Calculate error rate: weighted error
        incorrect = (predictions != y_train).astype(int)
        error = np.dot(weights, incorrect) / np.sum(weights)

        # Stop if the error is too high
        if error > 0.5:
            break

        # Calculate alpha (model weight)
        alpha = 0.5 * np.log((1 - error) / error)
        models.append(model)
        model_weights.append(alpha)

        # Update sample weights
        weights *= np.exp(-alpha * y_train * predictions)  # Increase weight for misclassified samples
        weights /= np.sum(weights)  # Normalize the weights

    # Make predictions on the test set
    final_predictions = np.zeros(X_test.shape[0])
    for model, alpha in zip(models, model_weights):
        final_predictions += alpha * model.predict(X_test)

    # Convert the predictions back to {0, 1}
    final_predictions = (final_predictions >= 0).astype(int)

    # Evaluate the model
    accuracy = accuracy_score(y_test, final_predictions)
    report = classification_report(y_test, final_predictions)
    confusion = confusion_matrix(y_test, final_predictions)

    print("\nManual Boosting Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", confusion)

    return final_predictions

# Example usage:
# Assuming X_train, y_train, X_test, and y_test are defined elsewhere
manual_boosting_predictions = manual_boosting(X_train, y_train, X_test, y_test, n_estimators=50)


Manual Boosting Results:
Accuracy: 0.7378
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.74      0.76     13327
           1       0.69      0.74      0.71     10562

    accuracy                           0.74     23889
   macro avg       0.74      0.74      0.74     23889
weighted avg       0.74      0.74      0.74     23889

Confusion Matrix:
 [[9812 3515]
 [2748 7814]]


In [None]:
def random_forest(X_train, y_train, X_test, y_test, n_estimators=10, max_depth=None, random_seed=42):
    np.random.seed(random_seed)
    n_samples = X_train.shape[0]

    # Initialize a list to store the individual decision trees
    trees = []

    # Train n_estimators trees on different bootstrap samples
    for _ in range(n_estimators):
        # Randomly sample the data (bootstrap sampling) using indices
        bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)
        X_bootstrap = X_train.iloc[bootstrap_indices]  # Assuming X_train is a DataFrame
        y_bootstrap = y_train.iloc[bootstrap_indices]  # Assuming y_train is a DataFrame or Series

        # Train a decision tree on the bootstrap sample
        tree = DecisionTreeClassifier(max_depth=max_depth)
        tree.fit(X_bootstrap, y_bootstrap)
        trees.append(tree)

    # Make predictions for the test set by aggregating predictions from all trees
    tree_predictions = np.array([tree.predict(X_test) for tree in trees])

    # Majority voting: Take the most frequent prediction across all trees
    final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=tree_predictions)

    # Evaluate the model
    accuracy = accuracy_score(y_test, final_predictions)
    report = classification_report(y_test, final_predictions)
    confusion = confusion_matrix(y_test, final_predictions)

    print("\nRandom Forest Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)
    print("Confusion Matrix:\n", confusion)

    return  final_predictions

# Example usage:
# Assuming X_train, y_train, X_test, and y_test are pandas DataFrames or Series
random_forest_predictions = random_forest(X_train, y_train, X_test, y_test, n_estimators=10, max_depth=None)


Random Forest Results:
Accuracy: 0.7075
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.77      0.75     13327
           1       0.69      0.63      0.65     10562

    accuracy                           0.71     23889
   macro avg       0.70      0.70      0.70     23889
weighted avg       0.71      0.71      0.71     23889

Confusion Matrix:
 [[10295  3032]
 [ 3955  6607]]


In [None]:
# Grid Search for Manual Bagging
def grid_search_bagging(X_train, y_train, X_test, y_test):
    param_grid = {
        'n_estimators': [10, 20],
        'max_depth': [None, 5, 10],
    }

    best_score = 0
    best_params = None
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        predictions = manual_bagging(X_train, y_train, X_test, y_test, **param_dict)
        accuracy = accuracy_score(y_test, predictions)
        if accuracy > best_score:
            best_score = accuracy
            best_params = param_dict

    print("\nBest Parameters for Manual Bagging:", best_params)
    print("Best Accuracy:", best_score)

# Grid Search for Manual Boosting
def grid_search_boosting(X_train, y_train, X_test, y_test):
    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [1, 3],
    }

    best_score = 0
    best_params = None
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        predictions = manual_bagging(X_train, y_train, X_test, y_test, **param_dict)
        accuracy = accuracy_score(y_test, predictions)
        if accuracy > best_score:
            best_score = accuracy
            best_params = param_dict

    print("\nBest Parameters for Manual Boosting:", best_params)
    print("Best Accuracy:", best_score)


# Grid Search for Random Forest
def grid_search_random_forest(X_train, y_train, X_test, y_test):
    param_grid = {
        'n_estimators': [10, 20],
        'max_depth': [None, 5, 10],
    }

    best_score = 0
    best_params = None
    for params in product(*param_grid.values()):
        param_dict = dict(zip(param_grid.keys(), params))
        predictions = manual_bagging(X_train, y_train, X_test, y_test, **param_dict)
        accuracy = accuracy_score(y_test, predictions)
        if accuracy > best_score:
            best_score = accuracy
            best_params = param_dict

    print("\nBest Parameters for Random Forest:", best_params)
    print("Best Accuracy:", best_score)


# Example Usage
print("Tuning Manual Bagging...")
grid_search_bagging(X_train, y_train, X_test, y_test)

print("\nTuning Manual Boosting...")
grid_search_boosting(X_train, y_train, X_test, y_test)

print("\nTuning Manual Random Forest...")
grid_search_random_forest(X_train, y_train, X_test, y_test)




Tuning Manual Bagging...

Bagging Results:
Accuracy: 0.7102
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.77      0.75     13327
           1       0.69      0.63      0.66     10562

    accuracy                           0.71     23889
   macro avg       0.71      0.70      0.70     23889
weighted avg       0.71      0.71      0.71     23889

Confusion Matrix:
 [[10317  3010]
 [ 3914  6648]]

Bagging Results:
Accuracy: 0.7350
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.72      0.75     13327
           1       0.68      0.75      0.71     10562

    accuracy                           0.73     23889
   macro avg       0.73      0.74      0.73     23889
weighted avg       0.74      0.73      0.74     23889

Confusion Matrix:
 [[9652 3675]
 [2656 7906]]

Bagging Results:
Accuracy: 0.7454
Classification Report:
               precision    recall  f1-score   su

In [None]:
def randomized_search_bagging(X_train, y_train, X_test, y_test):
    param_dist = {
        'n_estimators': [10, 20, 30, 40, 50],
        'max_depth': [None, 5, 10, 15, 20],
    }

    best_score = 0
    best_params = None

    # Randomized Search for Bagging
    for n_estimators in param_dist['n_estimators']:
        for max_depth in param_dist['max_depth']:
            predictions = manual_bagging(X_train, y_train, X_test, y_test, n_estimators=n_estimators, max_depth=max_depth)
            accuracy = accuracy_score(y_test, predictions)
            if accuracy > best_score:
                best_score = accuracy
                best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}

    print("\nBest Parameters for Manual Bagging:", best_params)
    print("Best Accuracy:", best_score)

# Randomized Search for Manual Boosting
def randomized_search_boosting(X_train, y_train, X_test, y_test):
    param_dist = {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [1, 3, 5, 7],
    }

    best_score = 0
    best_params = None

    # Randomized Search for Boosting
    for n_estimators in param_dist['n_estimators']:
        for max_depth in param_dist['max_depth']:
            predictions = manual_boosting(X_train, y_train, X_test, y_test, n_estimators=n_estimators, max_depth=max_depth)
            accuracy = accuracy_score(y_test, predictions)
            if accuracy > best_score:
                best_score = accuracy
                best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}

    print("\nBest Parameters for Manual Boosting:", best_params)
    print("Best Accuracy:", best_score)

# Randomized Search for Random Forest
def randomized_search_random_forest(X_train, y_train, X_test, y_test):
    param_dist = {
        'n_estimators': [10, 20, 30, 40, 50],
        'max_depth': [None, 5, 10, 15],
    }

    best_score = 0
    best_params = None

    # Randomized Search for Random Forest
    for n_estimators in param_dist['n_estimators']:
        for max_depth in param_dist['max_depth']:
            predictions = random_forest(X_train, y_train, X_test, y_test, n_estimators=n_estimators, max_depth=max_depth)
            accuracy = accuracy_score(y_test, predictions)
            if accuracy > best_score:
                best_score = accuracy
                best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}

    print("\nBest Parameters for Random Forest:", best_params)
    print("Best Accuracy:", best_score)


# Example Usage
print("Tuning Manual Bagging...")
randomized_search_bagging(X_train, y_train, X_test, y_test)

print("\nTuning Manual Boosting...")
randomized_search_boosting(X_train, y_train, X_test, y_test)

print("\nTuning Manual Random Forest...")
randomized_search_random_forest(X_train, y_train, X_test, y_test)


Tuning Manual Bagging...

Bagging Results:
Accuracy: 0.7102
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.77      0.75     13327
           1       0.69      0.63      0.66     10562

    accuracy                           0.71     23889
   macro avg       0.71      0.70      0.70     23889
weighted avg       0.71      0.71      0.71     23889

Confusion Matrix:
 [[10317  3010]
 [ 3914  6648]]

Bagging Results:
Accuracy: 0.7350
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.72      0.75     13327
           1       0.68      0.75      0.71     10562

    accuracy                           0.73     23889
   macro avg       0.73      0.74      0.73     23889
weighted avg       0.74      0.73      0.74     23889

Confusion Matrix:
 [[9652 3675]
 [2656 7906]]

Bagging Results:
Accuracy: 0.7454
Classification Report:
               precision    recall  f1-score   su

In [None]:
# Evaluate Bagging
bagging = manual_bagging(X_train, y_train, X_val, y_val, n_estimators=10, max_depth=5)
print(f"Bagging Accuracy: {bagging}")

# Evaluate Boosting
boosting = manual_boosting(X_train, y_train, X_val, y_val, n_estimators=50, max_depth=1)
print(f"Boosting Accuracy: {boosting}")

# Evaluate Random Forest
random_forest = random_forest(X_train, y_train, X_val, y_val, n_estimators=10, max_depth=5)
print(f"Random Forest Accuracy: {random_forest}")


Bagging Results:
Accuracy: 0.7378
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.73      0.76     13359
           1       0.69      0.75      0.72     10529

    accuracy                           0.74     23888
   macro avg       0.74      0.74      0.74     23888
weighted avg       0.74      0.74      0.74     23888

Confusion Matrix:
 [[9729 3630]
 [2633 7896]]
Bagging Accuracy: [0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 

In [None]:
bagging_accuracy = accuracy_score(y_val, bagging)

boosting_accuracy = accuracy_score(y_val, boosting)

random_forest_accuracy = accuracy_score(y_val, random_forest)

if bagging_accuracy > boosting_accuracy and bagging_accuracy > random_forest_accuracy:
    print("Bagging is the best model.")
    final_model = 'bagging'
elif boosting_accuracy > random_forest_accuracy:
    print("Boosting is the best model.")
    final_model = 'boosting'
else:
    print("Random Forest is the best model.")
    final_model = 'random_forest'


Bagging Results:
Accuracy: 0.7378
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.73      0.76     13359
           1       0.69      0.75      0.72     10529

    accuracy                           0.74     23888
   macro avg       0.74      0.74      0.74     23888
weighted avg       0.74      0.74      0.74     23888

Confusion Matrix:
 [[9729 3630]
 [2633 7896]]

Manual Boosting Results:
Accuracy: 0.7381
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.74      0.76     13359
           1       0.69      0.74      0.71     10529

    accuracy                           0.74     23888
   macro avg       0.74      0.74      0.74     23888
weighted avg       0.74      0.74      0.74     23888

Confusion Matrix:
 [[9825 3534]
 [2723 7806]]

Random Forest Results:
Accuracy: 0.7378
Classification Report:
               precision    recall  f1-score   support

        

In [None]:
X_final_train = pd.concat([X_train, X_val], axis=0)
y_final_train = pd.concat([y_train, y_val], axis=0)

if final_model == 'bagging':
    final_predictions = manual_bagging(X_final_train, y_final_train, X_test, y_test, n_estimators=10, max_depth=5)
elif final_model == 'boosting':
    final_predictions = manual_boosting(X_final_train, y_final_train, X_test, y_test, n_estimators=50, max_depth=1)
else:  # random_forest
    final_predictions = random_forest(X_final_train, y_final_train, X_test, y_test, n_estimators=10, max_depth=5)


Manual Boosting Results:
Accuracy: 0.7376
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.74      0.76     13327
           1       0.69      0.74      0.71     10562

    accuracy                           0.74     23889
   macro avg       0.74      0.74      0.74     23889
weighted avg       0.74      0.74      0.74     23889

Confusion Matrix:
 [[9805 3522]
 [2746 7816]]


In [None]:
final_accuracy =  accuracy_score(y_test, final_predictions)
final_report = classification_report(y_test, final_predictions)
final_confusion = confusion_matrix(y_test, final_predictions)

print("\nFinal Model Performance:")
print(f"Accuracy: {final_accuracy:.4f}")
print("Classification Report:\n", final_report)
print("Confusion Matrix:\n", final_confusion)


Final Model Performance:
Accuracy: 0.7376
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.74      0.76     13327
           1       0.69      0.74      0.71     10562

    accuracy                           0.74     23889
   macro avg       0.74      0.74      0.74     23889
weighted avg       0.74      0.74      0.74     23889

Confusion Matrix:
 [[9805 3522]
 [2746 7816]]
