In [11]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load dataset
data = pd.read_csv('C://Users//Jayashrinidhi V//OneDrive//Documents//VScode//AlgoMaster//encrypted_messages_dataset12.csv')
print(data.head())

  algorithm                                          plaintext  \
0       AES  The volunteerism is a key component of communi...   
1       AES  +"f&R*~?GE3%-'}}i5bMx"IB:dqN0?2/R)sHWm;\.:SP1$...   
2       AES  The educational technology is a key component ...   
3       AES                        [GmcvmxtqF4}`}wOs NyXORrP{*   
4       AES  The encryption is a key component of global po...   

                                                 key  \
0  06f50db57c46117664333a82aa4628adf24baf21367d26...   
1  5bbff7147513d5e3624560a71214653e3ebe1b330c41e1...   
2  e229117d280cafcec62d4cbf6555bdee64da661d5226c1...   
3  74c9b7ebe8cc4cf50c99c34ede237bf48cf3813c3f9569...   
4  2226799dab3df3b994d8ee4f86398ba251c99c25ac4b6a...   

                                 iv  \
0  8855be909cbf4c1cf3f646fcfe0bb375   
1  bfa2fe6369565e48493aecfe3c625f78   
2  d95b59ff2e2911759b468342e537eb26   
3  237b47f8047ba7480f365e62b16487f6   
4  b7c4f3f628754f520f27655145ed4d7c   

                               

In [None]:
# Define base models
base_models = [
    ('logreg', LogisticRegression(max_iter=1000)),
    ('svc', SVC(probability=True)),
    ('decision_tree', DecisionTreeClassifier()),
    ('random_forest', RandomForestClassifier()),
    ('gbm', GradientBoostingClassifier()),
    ('xgboost', xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)),
    ('catboost', CatBoostClassifier(silent=True)),
    ('knn', KNeighborsClassifier()),
    ('naive_bayes', MultinomialNB())
]

In [None]:
# Define hyperparameters for tuning
param_grids = {
    'logreg': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet'],  # Remove 'elasticnet' if not using 'saga'
        'solver': ['liblinear', 'saga'],  # Only use 'saga' for 'elasticnet' penalty
    },
    'svc': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': ['scale', 'auto'],
    },
    'decision_tree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5, 10],
    },
    'random_forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    },
    'gbm': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'xgboost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
    },
    'catboost': {
        'iterations': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [3, 5, 7],
        'l2_leaf_reg': [1, 3, 5, 7],
    },
    'knn': {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski'],
    },
    'naive_bayes': {
        'alpha': [0.01, 0.1, 1, 10],
        'fit_prior': [True, False],
    }
}


In [None]:
# Encode the target variable
le = LabelEncoder()
data['algorithm_encoded'] = le.fit_transform(data['algorithm'])

# Define features and target
X = data['encrypted_message']
y = data['algorithm_encoded']

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_features = vectorizer.fit_transform(X)

# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)


# Hyperparameter Tuning

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Logistic Regression
logreg_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

# Initialize Logistic Regression model
logreg = LogisticRegression(max_iter=500)

# Perform GridSearchCV
logreg_grid_search = GridSearchCV(estimator=logreg, param_grid=logreg_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
logreg_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_logreg = logreg_grid_search.best_estimator_
print(f"Best parameters for Logistic Regression: {logreg_grid_search.best_params_}")


Best parameters for Logistic Regression: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}


In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
logreg = LogisticRegression(max_iter=1000)

# Train the model
logreg.fit(X_train, y_train)
# Save the trained model
dump(logreg, 'logistic_regression_model.joblib')


['logistic_regression_model.joblib']

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the test set
y_pred = logreg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        26
           1       1.00      1.00      1.00        19

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Confusion Matrix:
[[26  0]
 [ 0 19]]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load

# Define parameter grid for Logistic Regression with stronger regularization
logreg_param_grid = {
    'C': [0.001, 0.01, 0.1],  # Stronger regularization
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [50]
}

# Initialize Logistic Regression model
logreg = LogisticRegression(max_iter=1000)

# Perform GridSearchCV
logreg_grid_search = GridSearchCV(estimator=logreg, param_grid=logreg_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
logreg_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_logreg = logreg_grid_search.best_estimator_
print(f"Best parameters for Logistic Regression: {logreg_grid_search.best_params_}")

# Save the trained model
dump(best_logreg, 'logistic_regression_model_regularized.joblib')

# Predict on the test set
y_pred = best_logreg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Best parameters for Logistic Regression: {'C': 0.1, 'max_iter': 50, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        26
           1       1.00      1.00      1.00        19

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Confusion Matrix:
[[26  0]
 [ 0 19]]


### SVC model

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Define parameter grid for SVC
svc_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Initialize SVC model
svc = SVC(probability=True)

# Perform GridSearchCV
svc_grid_search = GridSearchCV(estimator=svc, param_grid=svc_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
svc_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_svc = svc_grid_search.best_estimator_
print(f"Best parameters for SVC: {svc_grid_search.best_params_}")


Best parameters for SVC: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Decision Tree
decision_tree_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Decision Tree model
decision_tree = DecisionTreeClassifier()

# Perform GridSearchCV
decision_tree_grid_search = GridSearchCV(estimator=decision_tree, param_grid=decision_tree_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
decision_tree_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_decision_tree = decision_tree_grid_search.best_estimator_
print(f"Best parameters for Decision Tree: {decision_tree_grid_search.best_params_}")


Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [None]:
from joblib import dump, load
# Save the trained model to a file
dump(decision_tree, 'decision_tree_model.joblib')

['decision_tree_model.joblib']

In [None]:
from sklearn.utils.validation import check_is_fitted

# This will raise a NotFittedError if gbm is not fitted
check_is_fitted(gbm, 'estimators_')
# Assuming X_test is your test feature set
y_pred = gbm_loaded.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report (includes precision, recall, F1-score)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))



### Random Forest Classifier

In [None]:
"""from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Random Forest
random_forest_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest model
random_forest = RandomForestClassifier()

# Perform GridSearchCV
random_forest_grid_search = GridSearchCV(estimator=random_forest, param_grid=random_forest_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
random_forest_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_random_forest = random_forest_grid_search.best_estimator_
print(f"Best parameters for Random Forest: {random_forest_grid_search.best_params_}")
"""
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


# Convert encrypted message from hexadecimal to a format usable for ML models
def hex_to_bytes(hex_string):
    return bytes.fromhex(hex_string)

data['encrypted_message_bytes'] = data['encrypted_message'].apply(hex_to_bytes)

# Create features (e.g., byte frequency histogram)
def create_features(byte_string):
    histogram = np.zeros(256, dtype=int)
    for byte in byte_string:
        histogram[byte] += 1
    return histogram

X = np.array([create_features(message) for message in data['encrypted_message_bytes']])
y = data['algorithm']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(n_estimators=100, class_weight='balanced')
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

        3DES       0.14      0.24      0.18        49
         AES       0.17      0.21      0.19        56
    Blowfish       0.12      0.09      0.10        66
    ChaCha20       0.29      0.25      0.27        64
         DES       0.35      0.22      0.27        65

    accuracy                           0.20       300
   macro avg       0.21      0.20      0.20       300
weighted avg       0.22      0.20      0.20       300

[[12  9 13 11  4]
 [12 12 14  9  9]
 [23 18  6 14  5]
 [15 17  8 16  8]
 [22 16  8  5 14]]


### Gradient Boosting

In [None]:
#this runs for 40 mins DO NOT RUN MORE THAN ONCE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Gradient Boosting
gbm_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Gradient Boosting model
gbm = GradientBoostingClassifier()

# Perform GridSearchCV
gbm_grid_search = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
gbm_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_gbm = gbm_grid_search.best_estimator_
print(f"Best parameters for Gradient Boosting: {gbm_grid_search.best_params_}")


In [None]:
from joblib import dump, load
# Save the trained model to a file
dump(gbm, 'gradient_boosting_model.joblib')

['gradient_boosting_model.joblib']

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from joblib import dump, load

# Instantiate and fit the model
gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)  # Train the model on your training data

# Save the trained model
dump(gbm, 'gradient_boosting_model.joblib')

# Load the model
gbm_loaded = load('gradient_boosting_model.joblib')

# Now make predictions with the loaded model
y_pred = gbm_loaded.predict(X_test)


In [None]:
from sklearn.utils.validation import check_is_fitted

# This will raise a NotFittedError if gbm is not fitted
check_is_fitted(gbm, 'estimators_')
# Assuming X_test is your test feature set
y_pred = gbm_loaded.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report (includes precision, recall, F1-score)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.1800
Classification Report:
              precision    recall  f1-score   support

        3DES       0.17      0.22      0.19        49
         AES       0.16      0.20      0.18        56
    Blowfish       0.21      0.17      0.19        66
    ChaCha20       0.22      0.23      0.23        64
         DES       0.13      0.09      0.11        65

    accuracy                           0.18       300
   macro avg       0.18      0.18      0.18       300
weighted avg       0.18      0.18      0.18       300

Confusion Matrix:
[[11  8  7 13 10]
 [17 11  6 14  8]
 [13 14 11 15 13]
 [12 16 11 15 10]
 [13 19 17 10  6]]


### XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define parameter grid for XGBoost
xgboost_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize XGBoost model
xgboost = xgb.XGBClassifier(eval_metric='mlogloss')

# Perform GridSearchCV
xgboost_grid_search = GridSearchCV(estimator=xgboost, param_grid=xgboost_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
xgboost_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_xgboost = xgboost_grid_search.best_estimator_
print(f"Best parameters for XGBoost: {xgboost_grid_search.best_params_}")


ValueError: 
All the 720 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
720 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Jayashrinidhi V\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Jayashrinidhi V\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "c:\Users\Jayashrinidhi V\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4], got ['3DES' 'AES' 'Blowfish' 'ChaCha20' 'DES']


### Catboost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for CatBoost
catboost_param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [4, 6, 8]
}

# Initialize CatBoost model
catboost = CatBoostClassifier(silent=True)

# Perform GridSearchCV
catboost_grid_search = GridSearchCV(estimator=catboost, param_grid=catboost_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
catboost_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_catboost = catboost_grid_search.best_estimator_
print(f"Best parameters for CatBoost: {catboost_grid_search.best_params_}")


### KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for KNN
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize KNN model
knn = KNeighborsClassifier()

# Perform GridSearchCV
knn_grid_search = GridSearchCV(estimator=knn, param_grid=knn_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
knn_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_knn = knn_grid_search.best_estimator_
print(f"Best parameters for KNN: {knn_grid_search.best_params_}")


### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Naive Bayes
naive_bayes_param_grid = {
    'alpha': [0.1, 0.5, 1.0]
}

# Initialize Naive Bayes model
naive_bayes = MultinomialNB()

# Perform GridSearchCV
naive_bayes_grid_search = GridSearchCV(estimator=naive_bayes, param_grid=naive_bayes_param_grid, cv=5, n_jobs=-1, scoring='accuracy')
naive_bayes_grid_search.fit(X_train, y_train)

# Best estimator and parameters
best_naive_bayes = naive_bayes_grid_search.best_estimator_
print(f"Best parameters for Naive Bayes: {naive_bayes_grid_search.best_params_}")


### Evaluate model

In [None]:
# Evaluate the tuned models
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.2f}")


# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# Use the hyperparameter-tuned models
tuned_models = [
    ('logreg', LogisticRegression(C=0.01, penalty='l1', solver='liblinear', max_iter=1000)),
    ('svc', SVC(C=0.1, gamma='scale', kernel='linear', probability=True)),
    ('decision_tree', DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=10)),
    ('random_forest', RandomForestClassifier(n_estimators=200, max_depth=5, min_samples_split=10, random_state=42)),
    ('gbm', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)),
    ('xgboost', xgb.XGBClassifier(eval_metric='mlogloss', n_estimators=100, learning_rate=0.1, max_depth=3)),
    ('catboost', CatBoostClassifier(silent=True, iterations=100, learning_rate=0.1, depth=3)),
    ('knn', KNeighborsClassifier(n_neighbors=3)),
    ('naive_bayes', MultinomialNB(alpha=0.1))
]

# Create the voting classifier
voting_clf = VotingClassifier(estimators=tuned_models, voting='soft')

# Fit the voting classifier
voting_clf.fit(X_train, y_train)

# Evaluate the voting classifier
voting_accuracy = voting_clf.score(X_test, y_test)
print(f'Voting Classifier Accuracy: {voting_accuracy}')


# Stacking

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Create the stacking classifier
stacking_clf = StackingClassifier(estimators=tuned_models, final_estimator=LogisticRegression())

# Fit the stacking classifier
stacking_clf.fit(X_train, y_train)

# Evaluate the stacking classifier
stacking_accuracy = stacking_clf.score(X_test, y_test)
print(f'Stacking Classifier Accuracy: {stacking_accuracy}')


# Bagging and Boosting

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging using DecisionTreeClassifier
bagging_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=10), n_estimators=50, random_state=42)

# Fit the bagging classifier
bagging_clf.fit(X_train, y_train)

# Evaluate the bagging classifier
bagging_accuracy = bagging_clf.score(X_test, y_test)
print(f'Bagging Classifier Accuracy: {bagging_accuracy}')


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

# Gradient Boosting
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
gbm.fit(X_train, y_train)
gbm_accuracy = gbm.score(X_test, y_test)
print(f'Gradient Boosting Classifier Accuracy: {gbm_accuracy}')

# XGBoost
xgb_clf = xgb.XGBClassifier(eval_metric='mlogloss', n_estimators=100, learning_rate=0.1, max_depth=3)
xgb_clf.fit(X_train, y_train)
xgb_accuracy = xgb_clf.score(X_test, y_test)
print(f'XGBoost Classifier Accuracy: {xgb_accuracy}')


# Compare Voting, Stacking, Bag & Boost

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.model_selection import cross_val_score
import numpy as np

# Define function to evaluate and compare models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    return accuracy

# Voting Classifier
print("Evaluating Voting Classifier...")
voting_accuracy = evaluate_model(voting_clf, X_train, y_train, X_test, y_test)

# Stacking Classifier
print("\nEvaluating Stacking Classifier...")
stacking_accuracy = evaluate_model(stacking_clf, X_train, y_train, X_test, y_test)

# Bagging and Boosting Classifiers
print("\nEvaluating Bagging/Boosting Classifiers...")
bagging_boosting_accuracies = []
for model_name, model in bagging_boosting_classifiers.items():
    print(f"\nEvaluating {model_name}...")
    accuracy = evaluate_model(model, X_train, y_train, X_test, y_test)
    bagging_boosting_accuracies.append((model_name, accuracy))

# Comparison Summary
print("\nComparison Summary:")
print(f"Voting Classifier Accuracy: {voting_accuracy:.4f}")
print(f"Stacking Classifier Accuracy: {stacking_accuracy:.4f}")
for model_name, accuracy in bagging_boosting_accuracies:
    print(f"{model_name} Accuracy: {accuracy:.4f}")

# Compute Average Accuracy for Bagging/Boosting
average_bagging_boosting_accuracy = np.mean([acc[1] for acc in bagging_boosting_accuracies])
print(f"Average Bagging/Boosting Accuracy: {average_bagging_boosting_accuracy:.4f}")

# Additional Metrics (optional)
print("\nAdditional Metrics:")
print("ROC AUC Score for Voting Classifier:", roc_auc_score(y_test, voting_clf.predict_proba(X_test), multi_class='ovr'))
print("ROC AUC Score for Stacking Classifier:", roc_auc_score(y_test, stacking_clf.predict_proba(X_test), multi_class='ovr'))
