<a href="https://colab.research.google.com/github/Mohamedragih1/Smoking-Status-Prediction/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing essential libraries

In [None]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.9.7-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.9.7 scikit-optimize-0.9.0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV

# Reading Data

In [None]:
# Feature tables stay as DataFrames, one per split.
X_train = pd.read_csv("X_train.csv")
X_val = pd.read_csv("X_validate.csv")
X_test = pd.read_csv("X_test.csv")

# Label files are flattened to 1-D arrays as soon as they are read.
y_train = np.ravel(pd.read_csv("y_train.csv"))
y_val = np.ravel(pd.read_csv("y_validate.csv"))
y_test = np.ravel(pd.read_csv("y_test.csv"))

# Modeling

### Evaluate function

In [None]:
def evaluate_model(model, X, y):
    """Return the accuracy of ``model.predict(X)`` measured against the labels ``y``."""
    return accuracy_score(y, model.predict(X))

### Bagging model

In [None]:
class BaggingClassifier:
    """Bootstrap-aggregated ensemble of decision trees combined by majority vote.

    Parameters
    ----------
    n_estimators : int
        Number of trees to train; must be at least 1.
    max_features : int, float, str or None
        Forwarded unchanged to ``DecisionTreeClassifier(max_features=...)``.
    max_depth : int or None
        Forwarded unchanged to ``DecisionTreeClassifier(max_depth=...)``.

    Attributes
    ----------
    estimators : list
        The fitted trees; empty until ``fit`` has been called.
    """

    def __init__(self, n_estimators=10, max_features=0.5, max_depth=5):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.estimators = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on a bootstrap resample of ``(X, y)``.

        ``X`` may be a DataFrame (rows selected with ``.iloc``) or anything
        ``np.asarray`` accepts. Returns ``self``.

        Raises
        ------
        ValueError
            If ``n_estimators`` is smaller than 1 or ``X`` and ``y`` differ in length.
        """
        if self.n_estimators < 1:
            raise ValueError("n_estimators must be at least 1")
        y = np.asarray(y)
        n_samples = len(X)
        if n_samples != len(y):
            raise ValueError("X and y must contain the same number of samples")

        # Build into a fresh list so that calling fit() again replaces the
        # previous ensemble instead of appending to it.
        fitted = []
        for _ in range(self.n_estimators):
            indices = np.random.choice(n_samples, n_samples, replace=True)
            X_bootstrap = X.iloc[indices] if hasattr(X, "iloc") else np.asarray(X)[indices]
            tree = DecisionTreeClassifier(max_features=self.max_features, max_depth=self.max_depth)
            fitted.append(tree.fit(X_bootstrap, y[indices]))
        self.estimators = fitted
        return self

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties are resolved in favour of the smallest label.

        Raises
        ------
        ValueError
            If the ensemble holds no fitted estimators.
        """
        if not self.estimators:
            raise ValueError("BaggingClassifier is not fitted; call fit() first")
        # Shape: (n_estimators, n_samples)
        predictions = np.asarray([estimator.predict(X) for estimator in self.estimators])
        # Count votes per distinct label rather than with np.bincount, which
        # only accepts non-negative integers.
        labels = np.unique(predictions)
        votes = np.stack([(predictions == label).sum(axis=0) for label in labels])
        return labels[votes.argmax(axis=0)]

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict (``deep`` is accepted but unused)."""
        return {
            'n_estimators': self.n_estimators,
            'max_features': self.max_features,
            'max_depth': self.max_depth
        }

    def set_params(self, **params):
        """Update constructor arguments and return ``self``.

        Keys written in the nested ``base_estimator__<name>`` form are routed to
        the parameter ``<name>`` of this class. Any other unknown key raises
        ``ValueError`` instead of being stored as a stray, ignored attribute.
        """
        valid = self.get_params(deep=False)
        prefix = 'base_estimator__'
        for param, value in params.items():
            name = param[len(prefix):] if param.startswith(prefix) else param
            if name not in valid:
                raise ValueError(
                    f"Invalid parameter {param!r} for BaggingClassifier; "
                    f"valid parameters are {sorted(valid)}"
                )
            setattr(self, name, value)
        return self

In [None]:
# Baseline: one decision tree built with default arguments, scored on the validation split.
basic_model = DecisionTreeClassifier().fit(X_train, y_train)
prediction = basic_model.predict(X_val)
print(f"Accuarcy : {accuracy_score(y_val,prediction):.2%}")

Accuarcy : 67.10%


In [None]:
# Bagged trees without a depth limit, scored on the validation split.
model = BaggingClassifier(
    n_estimators=15,
    max_features=0.7,
    max_depth=None,
)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy : {accuracy_score(y_val,prediction):.2%}")

Accuarcy : 72.72%


### AdaBoost model

In [None]:
class AdaBoostClassifier:
    """Binary AdaBoost over depth-5 decision trees.

    Labels may be any two distinct values. They are mapped internally to
    -1 / +1 (the larger label is the positive class) for the weight update and
    mapped back to the original labels by ``predict``.

    Parameters
    ----------
    n_estimators : int
        Maximum number of boosting rounds; must be at least 1.
    learning_rate : float
        Multiplier applied to every estimator weight ``alpha``.

    Attributes
    ----------
    estimators : list of (tree, alpha)
        Fitted trees paired with their vote weights.
    weights : ndarray
        Sample weights after the last boosting round.
    classes_ : ndarray
        The two sorted labels seen by ``fit``.
    """

    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.estimators = []
        self.weights = []

    def fit(self, X, y):
        """Run the boosting rounds on ``(X, y)`` and return ``self``.

        Raises
        ------
        ValueError
            If ``n_estimators`` is smaller than 1 or ``y`` does not contain
            exactly two distinct labels.
        """
        if self.n_estimators < 1:
            raise ValueError("n_estimators must be at least 1")
        y = np.asarray(y)
        classes = np.unique(y)
        if len(classes) != 2:
            raise ValueError(
                f"AdaBoostClassifier supports exactly two classes, got {len(classes)}"
            )
        positive = classes[1]
        # The exponential update below needs labels in {-1, +1}.
        y_signed = np.where(y == positive, 1.0, -1.0)

        weights = np.full(len(y), 1.0 / len(y))
        tiny = np.finfo(float).eps
        # Fresh list: refitting replaces the previous ensemble.
        fitted = []
        for _ in range(self.n_estimators):
            estimator = DecisionTreeClassifier(max_depth=5).fit(X, y, sample_weight=weights)

            predicted_signed = np.where(estimator.predict(X) == positive, 1.0, -1.0)
            incorrect = predicted_signed != y_signed
            # Clip away from 0 and 1 so the log-odds below stay finite.
            error_rate = np.clip(np.dot(weights, incorrect), tiny, 1.0 - tiny)

            alpha = self.learning_rate * np.log((1 - error_rate) / error_rate)
            fitted.append((estimator, alpha))

            if not incorrect.any():
                # A perfect learner leaves the weights uniform after
                # normalisation, so further rounds would repeat the same tree.
                break

            weights = weights * np.exp(-alpha * y_signed * predicted_signed)
            weights /= np.sum(weights)

        self.classes_ = classes
        self.estimators = fitted
        self.weights = weights
        return self

    def predict(self, X):
        """Return the alpha-weighted vote of all estimators as original labels.

        A score of exactly zero resolves to the smaller label.

        Raises
        ------
        ValueError
            If the ensemble holds no fitted estimators.
        """
        if not self.estimators:
            raise ValueError("AdaBoostClassifier is not fitted; call fit() first")
        negative, positive = self.classes_
        # Start from zeros: the running sum must not begin with arbitrary memory.
        scores = np.zeros(len(X))
        for estimator, alpha in self.estimators:
            scores += alpha * np.where(estimator.predict(X) == positive, 1.0, -1.0)
        return np.where(scores > 0, positive, negative)

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict (``deep`` is accepted but unused)."""
        return {'n_estimators': self.n_estimators, 'learning_rate': self.learning_rate}

    def set_params(self, **params):
        """Update constructor arguments and return ``self``; unknown keys raise ``ValueError``."""
        valid = self.get_params(deep=False)
        for param, value in params.items():
            if param not in valid:
                raise ValueError(
                    f"Invalid parameter {param!r} for AdaBoostClassifier; "
                    f"valid parameters are {sorted(valid)}"
                )
            setattr(self, param, value)
        return self

In [None]:
# 100 boosting rounds at learning rate 1.0, scored on the validation split.
model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1.0,
)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy : {accuracy_score(y_val,prediction):.2%}")

Accuarcy : 73.84%


### Random Forest model

In [None]:
class RandomForestClassifier:
    """Random forest: bootstrap-resampled decision trees combined by majority vote.

    Parameters
    ----------
    n_estimators : int
        Number of trees to train; must be at least 1.
    max_features : int, float, str or None
        Forwarded to ``DecisionTreeClassifier(max_features=...)``. The legacy
        value ``'auto'`` is passed on as ``'sqrt'``.
    max_depth : int or None
        Forwarded unchanged to ``DecisionTreeClassifier(max_depth=...)``.
    min_samples_split : int or float
        Forwarded unchanged to ``DecisionTreeClassifier(min_samples_split=...)``.

    Attributes
    ----------
    estimators : list
        The fitted trees; empty until ``fit`` has been called.
    """

    def __init__(self, n_estimators=400, max_features='auto', max_depth=10, min_samples_split=5):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.estimators = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on a bootstrap resample of ``(X, y)``.

        ``X`` may be a DataFrame (rows selected with ``.iloc``) or anything
        ``np.asarray`` accepts. Returns ``self``.

        Raises
        ------
        ValueError
            If ``n_estimators`` is smaller than 1 or ``X`` and ``y`` differ in length.
        """
        if self.n_estimators < 1:
            raise ValueError("n_estimators must be at least 1")
        y = np.asarray(y)
        n_samples = len(X)
        if n_samples != len(y):
            raise ValueError("X and y must contain the same number of samples")

        # 'auto' meant 'sqrt' for classification trees and is rejected by newer
        # scikit-learn releases. Translate it here and leave self.max_features
        # untouched so get_params() still reports what the caller passed.
        tree_max_features = 'sqrt' if self.max_features == 'auto' else self.max_features

        # Fresh list: refitting replaces the previous forest instead of growing it.
        fitted = []
        for _ in range(self.n_estimators):
            indices = np.random.choice(n_samples, n_samples, replace=True)
            X_bootstrap = X.iloc[indices] if hasattr(X, "iloc") else np.asarray(X)[indices]
            tree = DecisionTreeClassifier(
                max_features=tree_max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
            )
            fitted.append(tree.fit(X_bootstrap, y[indices]))
        self.estimators = fitted
        return self

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties are resolved in favour of the smallest label.

        Raises
        ------
        ValueError
            If the forest holds no fitted estimators.
        """
        if not self.estimators:
            raise ValueError("RandomForestClassifier is not fitted; call fit() first")
        # Shape: (n_estimators, n_samples)
        predictions = np.asarray([estimator.predict(X) for estimator in self.estimators])
        # Count votes per distinct label rather than with np.bincount, which
        # only accepts non-negative integers.
        labels = np.unique(predictions)
        votes = np.stack([(predictions == label).sum(axis=0) for label in labels])
        return labels[votes.argmax(axis=0)]

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict (``deep`` is accepted but unused)."""
        return {
            'n_estimators': self.n_estimators,
            'max_features': self.max_features,
            'min_samples_split': self.min_samples_split,
            'max_depth': self.max_depth
        }

    def set_params(self, **params):
        """Update constructor arguments and return ``self``; unknown keys raise ``ValueError``."""
        valid = self.get_params(deep=False)
        for param, value in params.items():
            if param not in valid:
                raise ValueError(
                    f"Invalid parameter {param!r} for RandomForestClassifier; "
                    f"valid parameters are {sorted(valid)}"
                )
            setattr(self, param, value)
        return self

In [None]:
# A 200-tree forest, scored on the validation split.
model = RandomForestClassifier(
    n_estimators=200,
    max_features=0.7,
    max_depth=20,
    min_samples_split=10,
)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy : {accuracy_score(y_val,prediction):.2%}")

Accuarcy : 74.98%


# Hyperparameters Tuning

## Grid Search

In [None]:
def gridSearch(model, param_grid, X, y):
    """Exhaustively search ``param_grid`` with 5-fold CV and return the best estimator.

    Candidates are scored by accuracy and evaluated with ``n_jobs=-1``.
    """
    searcher = GridSearchCV(
        model,
        param_grid,
        scoring=make_scorer(accuracy_score),
        n_jobs=-1,
        cv=5,
    )
    return searcher.fit(X, y).best_estimator_

## Bagging

In [None]:
# Keys must match the constructor arguments reported by
# BaggingClassifier.get_params(); the class has no nested base estimator, so
# the tree depth is tuned through the plain `max_depth` key.
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_features': [None, 0.5, 0.7],
    'max_depth': [3, 7, 10]
}

model = BaggingClassifier()
best_model = gridSearch(model, param_grid, X_train, y_train)

# Report the winning combination
print(f"Bagging best parameters in Grid Search: {best_model.get_params()}")

Bagging best parameters in Grid Search: {'n_estimators': 200, 'max_features': 0.7, 'max_depth': 5}


In [None]:
# Refit bagging with the values reported by the grid search; score on validation.
model = BaggingClassifier(
    n_estimators=200,
    max_features=0.7,
    max_depth=5,
)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy after Grid Search: {accuracy_score(y_val,prediction):.2%}")

Accuarcy after Grid Search: 73.50%


## Boosting

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.1, 1.0]
}
# Pass explicit starting values; every grid candidate overrides them.
model = AdaBoostClassifier(n_estimators=100, learning_rate=1.0)
best_model = gridSearch(model, param_grid, X_train, y_train)

# gridSearch already returns the best estimator, so read its parameters directly.
print(f"Boosting best parameters in Grid Search: {best_model.get_params()}")

Boosting best parameters in Grid Search: AdaBoostClassifier(n_estimators=500)


In [None]:
# Refit AdaBoost with the values reported by the grid search; score on validation.
model = AdaBoostClassifier(
    n_estimators=500,
    learning_rate=1.0,
)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy after Grid Search: {accuracy_score(y_val,prediction):.2%}")

Accuarcy after Grid Search: 74.57%


## Random Forest

In [None]:
# 'sqrt' is what 'auto' meant for classification trees; newer scikit-learn
# releases reject 'auto', so the explicit name is used.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 0.5, 0.7],
    'max_depth': [10, 15, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10]
}

model = RandomForestClassifier()
best_model = gridSearch(model, param_grid, X_train, y_train)

# Report the winning combination
print(f"Random Forest best parameters in Grid Search: {best_model.get_params()}")

Random Forest best parameters in Grid Search: {'n_estimators': 500, 'max_features': 0.7, 'min_samples_split': 10, 'max_depth': 20}


In [None]:
# Refit the forest with the values reported by the grid search; score on validation.
model = RandomForestClassifier(
    n_estimators=500,
    max_features=0.7,
    min_samples_split=10,
    max_depth=20,
)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy after Grid Search: {accuracy_score(y_val,prediction):.2%}")

Accuarcy after Grid Search: 74.94%


## Randomized Search

In [None]:
def randomizedSearch(model, param_distributions, X, y, n_iter=10):
    """Sample ``n_iter`` candidates from ``param_distributions`` and return the best estimator.

    Candidates are scored by accuracy with 5-fold cross-validation; sampling is
    seeded with ``random_state=42``.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    param_distributions : dict
        Maps parameter names to candidate lists or distributions.
    X, y : training data forwarded to ``RandomizedSearchCV.fit``.
    n_iter : int, default 10
        Number of sampled candidates. The previous hard-coded value of 1
        evaluated a single random candidate, so nothing was compared.
    """
    score = make_scorer(accuracy_score)
    randomized_search = RandomizedSearchCV(
        model,
        param_distributions,
        n_iter=n_iter,
        cv=5,
        n_jobs=-1,
        random_state=42,
        scoring=score,
    )
    randomized_search.fit(X, y)
    return randomized_search.best_estimator_

## Bagging

In [None]:
# Keys must match the constructor arguments reported by
# BaggingClassifier.get_params(); the tree depth is tuned through `max_depth`.
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_features': [None, 0.5, 0.7],
    'max_depth': [3, 7, 10]
}

model = BaggingClassifier()
best_model = randomizedSearch(model, param_grid, X_train, y_train)

# Report the winning combination
print(f"Bagging best parameters in Randomized Search: {best_model.get_params()}")

Bagging best parameters in Randomized Search: {'n_estimators': 100, 'max_features': 0.7, 'max_depth': 5}


In [None]:
# Refit bagging with the values reported by the randomized search; score on validation.
model = BaggingClassifier(
    n_estimators=100,
    max_features=0.7,
    max_depth=5,
)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy after Randomized Search: {accuracy_score(y_val,prediction):.2%}")

Accuarcy after Randomized Search: 73.62%


## Boosting

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.1, 1.0]
}
# Pass explicit starting values; every sampled candidate overrides them.
model = AdaBoostClassifier(n_estimators=100, learning_rate=1.0)
best_model = randomizedSearch(model, param_grid, X_train, y_train)

# randomizedSearch already returns the best estimator, so read its parameters directly.
print(f"Boosting best parameters in Randomized Search: {best_model.get_params()}")

Boosting best parameters in Randomized Search: AdaBoostClassifier(n_estimators=100)


In [None]:
# Refit AdaBoost with the values reported by the randomized search; score on validation.
model = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=1.0,
)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy after Randomized Search: {accuracy_score(y_val,prediction):.2%}")

Accuarcy after Randomized Search: 74.21%


## Random Forest

In [None]:
# 'sqrt' is what 'auto' meant for classification trees; newer scikit-learn
# releases reject 'auto', so the explicit name is used.
param_grid = {
    'n_estimators': [200, 300, 400, 500, 600],
    'max_features': ['sqrt', 0.5, 0.7],
    'max_depth': [15, 20, 30, 40],
    'min_samples_split': [5, 10, 15, 20]
}

model = RandomForestClassifier()
best_model = randomizedSearch(model, param_grid, X_train, y_train)

# Report the winning combination
print(f"Random Forest best parameters in Randomized Search: {best_model.get_params()}")

In [None]:
# Re-display the parameters of the estimator currently bound to `best_model`.
print(f"Random Forest best parameters in Randomized Search: {best_model.get_params()}")

Random Forest best parameters in Randomized Search: {'n_estimators': 600, 'max_features': 'auto', 'min_samples_split': 20, 'max_depth': 30}


In [None]:
# Refit the forest with the values reported by the randomized search and score
# it on the validation split. max_features='sqrt' is the explicit name for what
# 'auto' meant on classification trees; newer scikit-learn releases reject 'auto'.
model = RandomForestClassifier(n_estimators=600, max_features='sqrt', min_samples_split=20, max_depth=30)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
print(f"Accuarcy after randomized Search: {accuracy_score(y_val,prediction):.2%}")

In [None]:
# Re-display the validation accuracy of the predictions currently bound to `prediction`.
print(f"Accuarcy after randomized Search: {accuracy_score(y_val,prediction):.2%}")

Accuarcy after randomized Search: 74.95%


## Bayes Search

In [None]:
def bayesianSearch(model, search_space, X, y, n_iter=20):
    """Tune ``model`` with ``BayesSearchCV`` and return ``(best_estimator, best_params)``.

    Candidates are scored by accuracy with 5-fold cross-validation; the search
    is seeded with ``random_state=42``.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    search_space : dict
        Maps parameter names to candidate lists or skopt dimensions.
    X, y : training data forwarded to ``BayesSearchCV.fit``.
    n_iter : int, default 20
        Number of parameter settings to evaluate. The previous hard-coded value
        of 1 evaluated a single setting, so nothing was optimised.
        NOTE(review): the optimiser is expected to begin with a batch of random
        points before its surrogate model takes over - confirm the count for
        the installed scikit-optimize version and keep ``n_iter`` above it.
    """
    bayes_search = BayesSearchCV(
        model,
        search_space,
        n_iter=n_iter,
        cv=5,
        random_state=42,
        n_jobs=-1,
        scoring='accuracy',
    )
    bayes_search.fit(X, y)
    return bayes_search.best_estimator_, bayes_search.best_params_

## Bagging

In [None]:
# Keys must match the constructor arguments reported by
# BaggingClassifier.get_params(); the tree depth is tuned through `max_depth`.
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_features': [0.5, 0.7, 0.9],
    'max_depth': [5, 10, 20]
}

In [None]:
# Tune bagging over `param_grid` with the Bayesian search helper.
model = BaggingClassifier()
bayes_model, model_params = bayesianSearch(
    model=model,
    search_space=param_grid,
    X=X_train,
    y=y_train,
)

# Show the parameters the search settled on.
print("Best parameters of Bagging :",model_params)

Best parameters of Bagging : OrderedDict([('base_estimator__max_depth', 20), ('max_features', 0.7), ('n_estimators', 200)])


In [None]:
# Validation accuracy of the estimator returned by the Bayesian search.
score = evaluate_model(model=bayes_model, X=X_val, y=y_val)
print(f"Bagging after Bayes  Search: {score:.2%}")

Bagging after Bayes  Search: 73.58%


## Boosting

In [None]:
# Candidate values for the boosting search in the next cell; the keys are the
# two constructor arguments of AdaBoostClassifier.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.1, 1.0]
}

In [None]:
# Pass explicit starting values; every candidate the search evaluates overrides them.
model = AdaBoostClassifier(n_estimators=100, learning_rate=1.0)

# Perform hyperparameter tuning using Bayesian optimization
bayes_model, model_params = bayesianSearch(model, param_grid, X_train, y_train)

# Print the best parameters found for the boosting classifier
print("Best parameters of Boosting :",model_params)

Best parameters of Boosting : OrderedDict([('learning_rate', 1.0), ('n_estimators', 200)])


In [None]:
# Validation accuracy of the estimator returned by the Bayesian search.
score = evaluate_model(model=bayes_model, X=X_val, y=y_val)
print(f"Boosting after Bayes  Search: {score:.2%}")

Boosting after Bayes  Search: 74.48%


## Random Forest

In [None]:
# Candidate values for the random-forest search in the next cell; the keys are
# the four constructor arguments of RandomForestClassifier.
param_grid = {
    'n_estimators': [200, 300, 400, 500],
    'max_features': [0.5, 0.7, 0.9],
    'max_depth': [15, 20, 30, 40],
    'min_samples_split': [5, 10, 15, 20]
}

In [None]:
model = RandomForestClassifier()

# Perform hyperparameter tuning using Bayesian optimization
bayes_model, model_params = bayesianSearch(model, param_grid, X_train, y_train)

# Print the best parameters found for the random forest, as the bagging and
# boosting cells do for their searches.
print("Best parameters of Random Forest :", model_params)



Best parameters of Bagging : OrderedDict([('max_depth', 30), ('max_features', 0.7), ('min_samples_split', 20), ('n_estimators', 500)])


In [None]:
# Validation accuracy of the estimator returned by the Bayesian search.
score = evaluate_model(model=bayes_model, X=X_val, y=y_val)
print(f"Random Forest after Bayes  Search: {score:.2%}")

Best parameters of Random Forest : OrderedDict([('max_depth', 30), ('max_features', 0.7), ('min_samples_split', 20), ('n_estimators', 500)])
Random Forest after Bayes  Search: 74.65%


# Final System

In [None]:
# Random Forest with the following parameters got the largest validation accuracy.
# max_features='sqrt' is the explicit name for what 'auto' meant on
# classification trees; newer scikit-learn releases reject 'auto'.
model = RandomForestClassifier(n_estimators=600, max_features='sqrt', min_samples_split=20, max_depth=30)
model.fit(X_train, y_train)

In [None]:
# Score the final model on the test split.
prediction = model.predict(X_test)
print(f"Final System Accuarcy: {accuracy_score(y_test,prediction):.2%}")

Final System Accuarcy: 75.02%
