# **Preparing Packages**

In [1]:
# Loading Packages:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV is read from a path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **DataSet Loading**

In [2]:
# Loading DataSet:
# Read magic_data.csv from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Assignment_02/DataSet/magic_data.csv"
)
# Bare expression: lets the notebook render the frame as the cell output.
df

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.0110,-8.2027,40.0920,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.2610,g
2,162.0520,136.0310,4.0612,0.0374,0.0187,116.7410,-64.8580,-45.2160,76.9600,256.7880,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.4490,116.7370,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.6480,356.4620,g
...,...,...,...,...,...,...,...,...,...,...,...
19015,21.3846,10.9170,2.6161,0.5857,0.3934,15.2618,11.5245,2.8766,2.4229,106.8258,h
19016,28.9452,6.7020,2.2672,0.5351,0.2784,37.0816,13.1853,-2.9632,86.7975,247.4560,h
19017,75.4455,47.5305,3.4483,0.1417,0.0549,-9.3561,41.0562,-9.4662,30.2987,256.5166,h
19018,120.5135,76.9018,3.9939,0.0944,0.0683,5.8043,-93.5224,-63.8389,84.6874,408.3166,h


# **DataSet Balancing & Splitting**

In [3]:
# Balancing The DataSet:
# Split the rows by label: 'g' (gamma) and 'h' (hadron).
gamma = df[df['class'] == 'g']
hadron = df[df['class'] == 'h']

# Undersample gamma down to the hadron row count so both classes are equally
# represented. Sampling is without replacement, so this assumes gamma has at
# least as many rows as hadron (pandas raises ValueError otherwise).
gamma = gamma.sample(n=len(hadron), random_state=42)

# Recombine and shuffle the rows. The shuffle is seeded as well: without a
# random_state here the row order changes on every run, which makes the
# seeded train/test split (and every reported metric) non-reproducible.
balanced_df = pd.concat([gamma, hadron]).sample(frac=1, random_state=42)

In [4]:
# Splitting The DataSet:
# The label is the 'class' column; every other column is used as a feature.
y = balanced_df['class']
X = balanced_df.drop(columns='class')

# Hold out 30% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# **Decision Tree Classifier**

In [5]:
# Implementing Decision Tree Classifier:
# Fit a single seeded tree on the training split, then label the held-out split.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# 'g' is the positive class for precision/recall/F1; the confusion matrix
# rows and columns are ordered ['g', 'h'].
dt_results = {
    'accuracy': accuracy_score(y_test, dt_pred),
    **{
        metric: score(y_test, dt_pred, pos_label='g')
        for metric, score in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
    'confusion_matrix': confusion_matrix(y_test, dt_pred, labels=['g', 'h']).tolist(),
}

# One line per metric, scores rounded to four decimals.
print(
    "DecisionTree:",
    f"Accuracy: {dt_results['accuracy']:.4f}",
    f"Precision: {dt_results['precision']:.4f}",
    f"Recall: {dt_results['recall']:.4f}",
    f"F1-Score: {dt_results['f1']:.4f}",
    f"Confusion Matrix: {dt_results['confusion_matrix']}",
    sep="\n",
)

DecisionTree:
Accuracy: 0.7907
Precision: 0.7854
Recall: 0.7987
F1-Score: 0.7920
Confusion Matrix: [[1599, 403], [437, 1574]]


# **Naive Bayes Classifier**

In [6]:
# Implementing Naive Bayes Classifier:
# Fit Gaussian Naive Bayes on the training split, then label the held-out split.
nb_model = GaussianNB().fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

# 'g' is the positive class for precision/recall/F1; the confusion matrix
# rows and columns are ordered ['g', 'h'].
nb_results = {
    'accuracy': accuracy_score(y_test, nb_pred),
    **{
        metric: score(y_test, nb_pred, pos_label='g')
        for metric, score in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
    'confusion_matrix': confusion_matrix(y_test, nb_pred, labels=['g', 'h']).tolist(),
}

# One line per metric, scores rounded to four decimals.
print(
    "NaiveBayes:",
    f"Accuracy: {nb_results['accuracy']:.4f}",
    f"Precision: {nb_results['precision']:.4f}",
    f"Recall: {nb_results['recall']:.4f}",
    f"F1-Score: {nb_results['f1']:.4f}",
    f"Confusion Matrix: {nb_results['confusion_matrix']}",
    sep="\n",
)

NaiveBayes:
Accuracy: 0.6534
Precision: 0.6030
Recall: 0.8936
F1-Score: 0.7201
Confusion Matrix: [[1789, 213], [1178, 833]]


# **Random Forest Classifier**

In [7]:
# Implementing Random Forest Classifier:
# Tune the number of trees with a 5-fold cross-validated grid search scored
# on accuracy, using only the training split.
rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid={'n_estimators': [50, 100, 200]},
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

# best_estimator_ is the winning configuration as exposed by GridSearchCV;
# it is the model used to label the held-out split.
rf_best = rf_grid.best_estimator_
rf_pred = rf_best.predict(X_test)

# 'g' is the positive class for precision/recall/F1; the confusion matrix
# rows and columns are ordered ['g', 'h'].
rf_results = {
    'accuracy': accuracy_score(y_test, rf_pred),
    **{
        metric: score(y_test, rf_pred, pos_label='g')
        for metric, score in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
    'confusion_matrix': confusion_matrix(y_test, rf_pred, labels=['g', 'h']).tolist(),
}

# One line per metric, scores rounded to four decimals.
print(
    "RandomForest:",
    f"Accuracy: {rf_results['accuracy']:.4f}",
    f"Precision: {rf_results['precision']:.4f}",
    f"Recall: {rf_results['recall']:.4f}",
    f"F1-Score: {rf_results['f1']:.4f}",
    f"Confusion Matrix: {rf_results['confusion_matrix']}",
    sep="\n",
)

RandomForest:
Accuracy: 0.8592
Precision: 0.8362
Recall: 0.8926
F1-Score: 0.8635
Confusion Matrix: [[1787, 215], [350, 1661]]


# **AdaBoost Classifier**

In [8]:
# Implementing AdaBoost Classifier:
# Tune the number of boosting rounds with a 5-fold cross-validated grid
# search scored on accuracy, using only the training split.
ab_model = AdaBoostClassifier(random_state=42)
ab_grid = GridSearchCV(
    estimator=ab_model,
    param_grid={'n_estimators': [50, 100, 200]},
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

# best_estimator_ is the winning configuration as exposed by GridSearchCV;
# it is the model used to label the held-out split.
ab_best = ab_grid.best_estimator_
ab_pred = ab_best.predict(X_test)

# 'g' is the positive class for precision/recall/F1; the confusion matrix
# rows and columns are ordered ['g', 'h'].
ab_results = {
    'accuracy': accuracy_score(y_test, ab_pred),
    **{
        metric: score(y_test, ab_pred, pos_label='g')
        for metric, score in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
    'confusion_matrix': confusion_matrix(y_test, ab_pred, labels=['g', 'h']).tolist(),
}

# One line per metric, scores rounded to four decimals.
print(
    "AdaBoost:",
    f"Accuracy: {ab_results['accuracy']:.4f}",
    f"Precision: {ab_results['precision']:.4f}",
    f"Recall: {ab_results['recall']:.4f}",
    f"F1-Score: {ab_results['f1']:.4f}",
    f"Confusion Matrix: {ab_results['confusion_matrix']}",
    sep="\n",
)

AdaBoost:
Accuracy: 0.8114
Precision: 0.8089
Recall: 0.8142
F1-Score: 0.8116
Confusion Matrix: [[1630, 372], [385, 1626]]


# **Comparison of the Models**

In [9]:
# Models Comparison:
# Emit a blank line, a heading, and one fixed qualitative remark per model.
# The remarks are hard-coded text; they are not computed from the results above.
print(
    "\nComparison:",
    "Decision Tree: Simple model, prone to overfitting, moderate performance.",
    "Naive Bayes: Assumes feature independence, fast but lower accuracy.",
    "Random Forest: Ensemble of trees, robust, high accuracy after tuning.",
    "AdaBoost: Boosting technique, good performance, sensitive to noise.",
    sep="\n",
)


Comparison:
Decision Tree: Simple model, prone to overfitting, moderate performance.
Naive Bayes: Assumes feature independence, fast but lower accuracy.
Random Forest: Ensemble of trees, robust, high accuracy after tuning.
AdaBoost: Boosting technique, good performance, sensitive to noise.
