<a href="https://colab.research.google.com/github/GuyMeron1/ML-MAGIC-Gamma-Telescope/blob/main/first_try_in_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports:

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

Load and Preview Dataset:

In [None]:
# Column names applied to the data file; because names= is supplied, the first
# line of the file is read as data rather than as a header. The last column
# holds the class label.
# NOTE(review): the UCI attribute list appears to spell the fifth column
# "fConc1" (digit one); "fConcl" is kept unchanged here — confirm before renaming.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConcl",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]

# header=None spells out what passing names= already implies.
df = pd.read_csv("magic04.data", header=None, names=cols)
df.head()

Encode Target Variable:

In [None]:
# Encode the label column as integers: 1 for gamma ("g"), 0 for everything else.
# The dtype guard keeps the cell idempotent: without it, running the cell a
# second time would compare the already-encoded integers with "g" and silently
# set every label to 0.
if not pd.api.types.is_integer_dtype(df["class"]):
  df["class"] = (df["class"] == "g").astype(int)
df.head()

Feature Distributions by Class:

In [None]:
# Split the rows by class once, outside the loop, instead of re-filtering the
# frame for every feature.
gamma_rows = df[df["class"] == 1]
hadron_rows = df[df["class"] == 0]

# (rows, colour, legend entry) for each overlaid histogram, in drawing order.
class_styles = (
    (gamma_rows, "blue", "gamma"),
    (hadron_rows, "red", "hadron"),
)

# One figure per feature column (every column except the trailing label),
# with both classes drawn as normalised, semi-transparent histograms.
for label in cols[:-1]:
  for class_rows, hist_color, class_name in class_styles:
    plt.hist(class_rows[label], color=hist_color, label=class_name, alpha=0.7, density=True)
  plt.title(label)
  plt.ylabel("Probability")
  plt.xlabel(label)
  plt.legend()
  plt.show()

Split Dataset into Train, Validation, and Test Sets:

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split (and every metric derived
# from it) is reproducible across kernel restarts.
shuffled_df = df.sample(frac=1, random_state=42)

# Positional slicing replaces np.split, which relies on DataFrame behaviour
# that recent pandas releases deprecate.
train_end = int(0.6 * len(shuffled_df))
valid_end = int(0.8 * len(shuffled_df))

train = shuffled_df.iloc[:train_end]
valid = shuffled_df.iloc[train_end:valid_end]
test = shuffled_df.iloc[valid_end:]

# Every row must land in exactly one split.
assert len(train) + len(valid) + len(test) == len(df)


Scale and Optionally Oversample Dataset:

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None):
  """Standardize the features of a split and optionally balance its classes.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Feature columns followed by the label in the last column.
  oversample : bool, default False
      When True, the scaled features and labels are resampled with
      RandomOverSampler before being returned. Intended for the training
      split only.
  scaler : object with a ``transform`` method, optional
      An already-fitted scaler (for example a StandardScaler fitted on the
      training features). When given, it is applied with ``transform`` so
      that validation/test data are scaled with the training statistics.
      When omitted, a new StandardScaler is fitted on ``dataframe`` itself,
      which is the original behaviour.

  Returns
  -------
  data : numpy.ndarray
      Scaled features with the label appended as the last column.
  X : numpy.ndarray
      Scaled feature matrix.
  Y : numpy.ndarray
      Label vector.
  """
  X = dataframe.iloc[:, :-1].values
  Y = dataframe.iloc[:, -1].values

  if scaler is None:
    # No scaler supplied: fit one on this split (original behaviour).
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
  else:
    # Reuse the caller's fitted scaler; never refit on held-out data.
    X = scaler.transform(X)

  if oversample:
    ros = RandomOverSampler()
    X, Y = ros.fit_resample(X, Y)

  # Re-attach the labels as a trailing column to mirror the input layout.
  data = np.hstack((X, np.reshape(Y, (-1, 1))))

  return data, X, Y

Apply Scaling and Oversampling to Splits:

In [None]:
# Only the training split is oversampled; validation and test keep their
# original class balance.
# NOTE: `train`, `valid` and `test` are rebound from DataFrames to NumPy arrays
# here, so this cell cannot be re-run without re-running the split cell first.
# NOTE(review): each call fits its own scaler, so the validation and test
# features are standardized with their own statistics rather than the training
# ones — confirm this is intended.
train, X_train, Y_train = scale_dataset(dataframe=train, oversample=True)
valid, X_valid, Y_valid = scale_dataset(dataframe=valid, oversample=False)
test, X_test, Y_test = scale_dataset(dataframe=test, oversample=False)

# **KNN:**

In [45]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [47]:
# Hyper-parameter grid: neighbourhood sizes 1, 11, ..., 101 crossed with the
# Minkowski exponent (p=1 for Manhattan distance, p=2 for Euclidean distance).
param_grid = dict(
    n_neighbors=range(1, 102, 10),
    p=[1, 2],
)
knn = KNeighborsClassifier()

# Exhaustive 5-fold cross-validated search over the grid.
# NOTE(review): X_train was oversampled before this point, so duplicated rows
# may end up on both sides of a fold and inflate the CV score — confirm.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid_search.fit(X_train, Y_train)

print(
    f"Best parameters: {grid_search.best_params_}",
    f"Best cross-validation score: {grid_search.best_score_:.4f}",
    sep="\n",
)

Best parameters: {'n_neighbors': 1, 'p': 1}
Best cross-validation score: 0.8971


In [50]:
# Keep the KNN estimator that GridSearchCV refit with the best-found parameters
# (available because the search above uses the default refit behaviour).
best_model = grid_search.best_estimator_

In [51]:
# Predict labels for the held-out test features with the selected KNN model.
Y_pred = best_model.predict(X=X_test)

In [52]:
# Per-class precision / recall / F1 of the KNN predictions on the test split.
print(classification_report(y_true=Y_test, y_pred=Y_pred))

              precision    recall  f1-score   support

           0       0.77      0.69      0.73      1309
           1       0.85      0.89      0.87      2495

    accuracy                           0.82      3804
   macro avg       0.81      0.79      0.80      3804
weighted avg       0.82      0.82      0.82      3804



# **Naive Bayes:**

In [43]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [44]:
# Candidate values for GaussianNB's variance-smoothing term.
param_grid = dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6])

# Estimator whose hyper-parameter is being tuned.
nb = GaussianNB()

# Exhaustive 5-fold cross-validated search over the grid.
# NOTE(review): X_train was oversampled before this point, so duplicated rows
# may end up on both sides of a fold — confirm the CV score is still meaningful.
grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5)
grid_search.fit(X_train, Y_train)

# Report the winning configuration and its mean cross-validation score.
print(
    f"Best parameters: {grid_search.best_params_}",
    f"Best cross-validation score: {grid_search.best_score_:.4f}",
    sep="\n",
)

Best parameters: {'var_smoothing': 1e-09}
Best cross-validation score: 0.6566


In [39]:
# Keep the Naive Bayes estimator that GridSearchCV refit with the best-found
# parameters. `grid_search` is reused across model sections, so this must run
# right after the Naive Bayes search cell.
best_model = grid_search.best_estimator_

In [40]:
# Predict labels for the held-out test features with the selected Naive Bayes model.
Y_pred = best_model.predict(X=X_test)

In [41]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the test split.
print(classification_report(y_true=Y_test, y_pred=Y_pred))

              precision    recall  f1-score   support

           0       0.65      0.40      0.50      1309
           1       0.74      0.89      0.81      2495

    accuracy                           0.72      3804
   macro avg       0.70      0.65      0.65      3804
weighted avg       0.71      0.72      0.70      3804



# **Decision tree:**

In [82]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [101]:
# Hyper-parameter grid for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': [None, 3, 4, 5, 6],  # Maximum depth of the tree (None = unlimited)
    'min_samples_split': [4, 5, 6],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [2, 3, 4]  # Minimum number of samples required to be at a leaf node
}

# A fixed random_state makes tree construction, and therefore the selected
# parameters and scores, reproducible across runs; without it the estimator
# breaks ties between equally good splits at random.
dt = DecisionTreeClassifier(random_state=42)

# Exhaustive 5-fold cross-validated search over the grid.
# NOTE(review): X_train was oversampled before this point, so duplicated rows
# may end up on both sides of a fold and inflate the CV score — confirm.
grid_search = GridSearchCV(dt, param_grid, cv=5)
grid_search.fit(X_train, Y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Best parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 4}
Best cross-validation score: 1.0000


In [98]:
# Keep the decision tree that GridSearchCV refit with the best-found
# parameters. `grid_search` is reused across model sections, so this must run
# right after the decision-tree search cell.
best_model = grid_search.best_estimator_

In [99]:
# Predict labels for the held-out test features with the selected decision tree.
Y_pred = best_model.predict(X=X_test)

In [100]:
# Per-class precision / recall / F1 of the decision-tree predictions on the test split.
# NOTE(review): the saved output for this cell reports 45 test samples, unlike
# the 3804 shown for the other models, so it appears to come from a different
# kernel state — TODO re-run the notebook top to bottom to refresh it.
print(classification_report(y_true=Y_test, y_pred=Y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        26

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

