<a href="https://colab.research.google.com/github/GuyMeron1/ML-MAGIC-Gamma-Telescope/blob/main/first_try_in_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports:

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

Load and Preview Dataset:

In [None]:
cols = ["fLength", "fWidth", "fSize", "fConc", "fConcl", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]

df = pd.read_csv("magic04.data", names=cols)
df.head()

Encode Target Variable:

In [None]:
df["class"] = (df["class"] == "g").astype(int)
df.head()

Feature Distributions by Class:

In [None]:
for label in cols[:-1]:
  plt.hist(df[df["class"]==1][label], color="blue", label="gamma", alpha=0.7, density=True)
  plt.hist(df[df["class"]==0][label], color="red", label="hadron", alpha=0.7, density=True)
  plt.title(label)
  plt.ylabel("Probability")
  plt.xlabel(label)
  plt.legend()
  plt.show()

Split Dataset into Train, Validation, and Test Sets:

In [None]:
train, valid, test = np.split(df.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))])
"""shuffled_df = df.sample(frac=1, random_state=42)  # optional: set seed for reproducibility
train = shuffled_df.iloc[:int(0.6*len(df))]
valid = shuffled_df.iloc[int(0.6*len(df)):int(0.8*len(df))]
test  = shuffled_df.iloc[int(0.8*len(df)):]"""


Scale and Optionally Oversample Dataset:

In [8]:
def scale_dataset(dataframe, oversample=False):
  X = dataframe[dataframe.columns[:-1]].values
  Y = dataframe[dataframe.columns[-1]].values

  scaler = StandardScaler()
  X = scaler.fit_transform(X)

  if oversample:
    ros = RandomOverSampler()
    X, Y = ros.fit_resample(X, Y)

  data = np.hstack((X, np.reshape(Y, (-1, 1))))

  return data, X, Y

Apply Scaling and Oversampling to Splits:

In [9]:
train, X_train, Y_train = scale_dataset(train, oversample=True)
valid, X_valid, Y_valid = scale_dataset(valid, oversample=False)
test, X_test, Y_test = scale_dataset(test, oversample=False)

# **KNN:**

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [25]:
param_grid = {
    'n_neighbors': range(1, 102, 10),
    'p': [1, 2]  # p=1 for Manhattan distance, p=2 for Euclidean distance
}
knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Best parameters: {'n_neighbors': 1, 'p': 1}
Best cross-validation score: 0.8952


In [26]:
best_model = grid_search.best_estimator_

In [27]:
Y_pred = best_model.predict(X_test)

In [28]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.76      0.68      0.71      1313
           1       0.84      0.89      0.86      2491

    accuracy                           0.81      3804
   macro avg       0.80      0.78      0.79      3804
weighted avg       0.81      0.81      0.81      3804



# **Naive Bayes**:

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [11]:
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]  # Smoothing parameter
}

# Create and train the Naive Bayes model
nb = GaussianNB()

# Perform grid search with cross-validation
grid_search = GridSearchCV(nb, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

# Print the best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Best parameters: {'var_smoothing': 1e-09}
Best cross-validation score: 0.6477


In [12]:
best_model = grid_search.best_estimator_

In [13]:
Y_pred = best_model.predict(X_test)

In [14]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.68      0.41      0.51      1313
           1       0.74      0.90      0.81      2491

    accuracy                           0.73      3804
   macro avg       0.71      0.65      0.66      3804
weighted avg       0.72      0.73      0.71      3804



# **Desecion tree:**

In [19]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [20]:
param_grid = {
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': [None, 3,4,5,6],  # Maximum depth of the tree
    'min_samples_split': [4, 5, 6],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [2, 3, 4]  # Minimum number of samples required to be at a leaf node
}

dt = DecisionTreeClassifier()

grid_search = GridSearchCV(dt, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Best parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 4}
Best cross-validation score: 0.8794


In [21]:
best_model = grid_search.best_estimator_

In [22]:
Y_pred = best_model.predict(X_test)

In [23]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.71      0.75      0.73      1313
           1       0.86      0.84      0.85      2491

    accuracy                           0.81      3804
   macro avg       0.79      0.79      0.79      3804
weighted avg       0.81      0.81      0.81      3804



# **SVM:**

In [36]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [37]:
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 0.01],
    'kernel': ['linear', 'rbf']
}

svm_model = SVC()

grid_search = GridSearchCV(svm_model, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, Y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best cross-validation score: 0.8606


In [42]:
best_svm = grid_search.best_estimator_

In [43]:
Y_pred = best_svm.predict(X_test)

In [44]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.81      0.82      0.81      1313
           1       0.90      0.90      0.90      2491

    accuracy                           0.87      3804
   macro avg       0.86      0.86      0.86      3804
weighted avg       0.87      0.87      0.87      3804



# **Neural Network:**

In [35]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [38]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive']
}

mlp = MLPClassifier(max_iter=1000)

grid_search = GridSearchCV(mlp, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, Y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'solver': 'adam'}
Best cross-validation score: 0.8587


In [39]:
best_mlp = grid_search.best_estimator_

In [40]:
Y_pred = best_model.predict(X_test)

In [41]:
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.76      0.68      0.71      1313
           1       0.84      0.89      0.86      2491

    accuracy                           0.81      3804
   macro avg       0.80      0.78      0.79      3804
weighted avg       0.81      0.81      0.81      3804

