# 1. Data Balancing 

In [48]:
import pandas as pd
from sklearn.utils import shuffle

In [8]:
# Load dataset
columns = ["feature1", "feature2", "feature3", "feature4", "feature5", "feature6",
           "feature7", "feature8", "feature9", "feature10", "class"]
data=pd.read_csv("magic04.data",names=columns)
print(data.head())

   feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0   28.7967   16.0021    2.6449    0.3918    0.1982   27.7004   22.0110   
1   31.6036   11.7235    2.5185    0.5303    0.3773   26.2722   23.8238   
2  162.0520  136.0310    4.0612    0.0374    0.0187  116.7410  -64.8580   
3   23.8172    9.5728    2.3385    0.6147    0.3922   27.2107   -6.4633   
4   75.1362   30.9205    3.1611    0.3168    0.1832   -5.5277   28.5525   

   feature8  feature9  feature10 class  
0   -8.2027   40.0920    81.8828     g  
1   -9.9574    6.3609   205.2610     g  
2  -45.2160   76.9600   256.7880     g  
3   -7.1513   10.4490   116.7370     g  
4   21.8393    4.6480   356.4620     g  


In [10]:
# Separate classes
gamma_data = data[data["class"] == "g"]
hadron_data = data[data["class"] == "h"]
# Match the size of the smaller class (hadron: 6688)
gamma_sampled = gamma_data.sample(n=len(hadron_data), random_state=42)
# Concatenate balanced data
data_balanced=pd.concat([gamma_sampled, hadron_data], axis=0)
# Shuffle the rows
data_balanced = shuffle(data_balanced, random_state=42).reset_index(drop=True)
# Encode class label (gamma: 1, hadron: 0)
from sklearn.preprocessing import LabelEncoder
data_balanced["class"] = LabelEncoder().fit_transform(data_balanced["class"])

print(data_balanced["class"].value_counts())


class
1    6688
0    6688
Name: count, dtype: int64


# 2. Data Split

In [13]:
from sklearn.model_selection import train_test_split
 
# Features (X) and target (y)
x=data_balanced.drop("class", axis=1)
y=data_balanced["class"]

# Split the dataset (70% train, 30% test)
X_train, X_test, y_train, y_test =train_test_split(x,y, test_size=0.3,random_state=42,stratify=y)

# Verify the shape of the split data
print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")

Training set: (9363, 10), Testing set: (4013, 10)


# 3. Classification 

 (a) Decision Tree 

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initialize the Decision Tree Classifier
df_model=DecisionTreeClassifier(random_state=42)

# Train the model
df_model.fit(X_train,y_train)

# Predict and evaluate
dt_predictions = df_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, dt_predictions)

print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

Decision Tree Accuracy: 0.7982


(b) AdaBoost 

In [20]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
# Initialize the AdaBoost Classifier
ada_model = AdaBoostClassifier(random_state=42)

# Set up the parameter grid for n_estimators
param_grid_ada = {'n_estimators': [50, 100, 150, 200]}
# Grid search for hyperparameter tuning
grid_search_ada = GridSearchCV(ada_model, param_grid_ada, cv=5, scoring='accuracy')
grid_search_ada.fit(X_train, y_train)

# Best model and performance
best_ada_model = grid_search_ada.best_estimator_
ada_predictions = best_ada_model.predict(X_test)
ada_accuracy = accuracy_score(y_test, ada_predictions)
print(f"AdaBoost Accuracy: {ada_accuracy:.4f}")
print(f"Best n_estimators for AdaBoost: {grid_search_ada.best_params_['n_estimators']}")



AdaBoost Accuracy: 0.8243
Best n_estimators for AdaBoost: 200


(c) Random Forests 

In [30]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Set up the parameter grid for n_estimators
param_grid_rf = {'n_estimators': [50, 100, 150, 200]}

# Grid search for hyperparameter tuning
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Best model and performance
best_rf_model = grid_search_rf.best_estimator_
rf_predictions = best_rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Best n_estimators for Random Forest: {grid_search_rf.best_params_['n_estimators']}")

Random Forest Accuracy: 0.8634
Best n_estimators for Random Forest: 200


(d) Na¨ıve Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB
# Initialize the Naive Bayes Classifier
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train)

# Predict and evaluate
nb_predictions = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")

Naive Bayes Accuracy: 0.6566


Compare Performance

In [36]:
from sklearn.metrics import classification_report
print("Model Comparison:\n")
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
print(f"AdaBoost Accuracy: {ada_accuracy:.4f}")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")

# Optionally, print detailed classification reports for each model
print("\nDecision Tree Classification Report:\n", classification_report(y_test, dt_predictions))
print("\nAdaBoost Classification Report:\n", classification_report(y_test, ada_predictions))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_predictions))
print("\nNaive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))

Model Comparison:

Decision Tree Accuracy: 0.7982
AdaBoost Accuracy: 0.8243
Random Forest Accuracy: 0.8634
Naive Bayes Accuracy: 0.6566

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.80      0.80      2007
           1       0.80      0.80      0.80      2006

    accuracy                           0.80      4013
   macro avg       0.80      0.80      0.80      4013
weighted avg       0.80      0.80      0.80      4013


AdaBoost Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.83      0.83      2007
           1       0.83      0.82      0.82      2006

    accuracy                           0.82      4013
   macro avg       0.82      0.82      0.82      4013
weighted avg       0.82      0.82      0.82      4013


Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87

# 4. Model Parameter Tuning

a- Decision Tree Using cross val. Tune

In [40]:
from sklearn.model_selection import GridSearchCV  

dt_param_grid = {
    'max_depth': [5, 15, 25, None],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

dt_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42), 
                       dt_param_grid, cv=5, scoring='accuracy')
dt_tuned.fit(X_train, y_train)
dt_tuned_pred = dt_tuned.predict(X_test)
dt_tuned_acc = accuracy_score(y_test, dt_tuned_pred)

print(f"[New] Tuned Decision Tree Accuracy: {dt_tuned_acc:.4f}")
print(f"[New] Best DT Params: {dt_tuned.best_params_}\n")

[New] Tuned Decision Tree Accuracy: 0.7967
[New] Best DT Params: {'criterion': 'entropy', 'max_depth': 15, 'min_samples_split': 2}



b- Naive Bayes - Hyperparameter Tuning

In [50]:
import numpy as np
nb_param_grid = {
    'var_smoothing': np.logspace(0, -12, num=100)
}

nb_tuned = GridSearchCV(
    GaussianNB(),
    nb_param_grid,
    cv=5,
    scoring='accuracy'
)
nb_tuned.fit(X_train, y_train)
print(f"[New] Tuned Naive Bayes Accuracy: {accuracy_score(y_test, nb_tuned.predict(X_test)):.4f}")
print(f"[New] Best NB Params: {nb_tuned.best_params_}\n")

[New] Tuned Naive Bayes Accuracy: 0.6628
[New] Best NB Params: {'var_smoothing': 0.0004037017258596554}



c- AdaBoost - Expanded Tuning

In [54]:
ada_param_grid_ext = {
    'n_estimators': [200, 300],  # Extends original [50, 100, 150, 200]
    'learning_rate': [0.01, 0.1, 1.0]  # New parameter
}

ada_tuned_ext = GridSearchCV(
    AdaBoostClassifier(random_state=42),
    ada_param_grid_ext,
    cv=5,
    scoring='accuracy'
)
ada_tuned_ext.fit(X_train, y_train)
print(f"[New] Extended AdaBoost Accuracy: {accuracy_score(y_test, ada_tuned_ext.predict(X_test)):.4f}")
print(f"[New] Best AdaBoost Params: {ada_tuned_ext.best_params_}\n")




[New] Extended AdaBoost Accuracy: 0.8243
[New] Best AdaBoost Params: {'learning_rate': 1.0, 'n_estimators': 200}



d - Random Forest - Expanded Tuning

In [61]:
rf_param_grid_ext = {
    'n_estimators': [200, 300],  # Extends original [50, 100, 150, 200]
    'max_depth': [15, 30, None],  # New values
    'max_features': ['sqrt', 'log2']  # New parameter
}

rf_tuned_ext = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid_ext,
    cv=5,
    scoring='accuracy'
)
rf_tuned_ext.fit(X_train, y_train)
print(f"[New] Extended Random Forest Accuracy: {accuracy_score(y_test, rf_tuned_ext.predict(X_test)):.4f}")
print(f"[New] Best RF Params: {rf_tuned_ext.best_params_}\n")

[New] Extended Random Forest Accuracy: 0.8580
[New] Best RF Params: {'max_depth': 15, 'max_features': 'sqrt', 'n_estimators': 300}

