<a href="https://colab.research.google.com/github/IoanPJ/Final_year_project_IMPJ/blob/main/ML/imbalanced_CL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Standard library
import os
import random
from collections import Counter
from os.path import basename, exists

# Third-party
import imblearn
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import BaggingClassifier
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, log_loss, roc_auc_score, RocCurveDisplay, roc_curve
from sklearn.model_selection import train_test_split, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from tqdm import tqdm

# Project-local (currently disabled)
#from IMPJ import DataProcessor

#dp = DataProcessor()

def download(url):
    """Fetch `url` into the current working directory, once.

    The local file name is the last path component of `url`. If a file with
    that name already exists nothing is downloaded and nothing is printed;
    otherwise the file is retrieved and a confirmation line is printed.

    Parameters
    ----------
    url : str
        Address of the file to retrieve.

    Returns
    -------
    None
    """
    target = basename(url)

    # Already cached locally: leave the existing copy untouched.
    if exists(target):
        return

    from urllib.request import urlretrieve
    saved_path, _headers = urlretrieve(url, target)
    print('Downloaded ' + saved_path)

# Fetch the two-column paper style sheet from the AllenDowney/AstronomicalData
# repository (skipped when a local copy already exists) and make it the active
# matplotlib style.
style_sheet_url = 'https://github.com/AllenDowney/AstronomicalData/raw/main/' + 'az-paper-twocol.mplstyle'
download(style_sheet_url)
plt.style.use('./az-paper-twocol.mplstyle')

In [3]:
# Colab-only step: mount Google Drive at /content/drive. The dataset read by the
# later cells and the result folders they write to all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Repeated 10-fold cross-validation of the bagged MLP on the labelled sources.
#
# In each of the 10 runs every fold is standardised and resampled (SMOTE
# over-sampling followed by random under-sampling) using its training part
# only; the held-out predictions are written to one CSV per fold.
#
# NOTE(review): KFold is used without shuffling, so all runs share the same
# splits. Run-to-run differences come only from the unseeded network
# initialisation and bagging bootstrap - confirm this is intended.

# --- Load the data once: it is identical for every run and fold -------------
filepath = "/content/drive/My Drive/Colab Notebooks/Fermi-LAT Data/imputed_wrappered_12_withbcus.csv"
fulldata = pd.read_csv(filepath, index_col=0).dropna()
# A former `fulldata['CLASS1'].replace(0,7)` statement was removed here: its
# result was never assigned, so it had no effect.

# Keep only labelled sources; rows with CLASS1 == 0 are the BCUs that the
# BCU-prediction cell further down classifies.
data = fulldata.loc[fulldata['CLASS1'] != 0]
Y = np.array(data['CLASS1'])
X = np.array(data.drop(labels='CLASS1', axis=1))

# Collapse the labels to a binary problem.
Y[Y == 2] = 1  # non radio galaxies
Y[Y == 3] = 0  # radio galaxies

# Class ratio (radio galaxies / non radio galaxies) before any resampling.
init_ratio = len(Y[Y == 0]) / len(Y[Y == 1])

# --- Preprocessing objects and model (refitted from scratch on every fold) --
# SMOTE raises the minority/majority ratio to five times its initial value
# (assuming radio galaxies are the minority class). imblearn only accepts
# float ratios in (0, 1], hence the cap at 1.0.
sm = SMOTE(random_state=42, sampling_strategy=min(1.0, init_ratio * 5))
# The majority class is then randomly under-sampled until both classes have
# the same number of samples.
ru = RandomUnderSampler(random_state=42, sampling_strategy=1)
scaler = StandardScaler()

NN = MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50), learning_rate='adaptive',
                   solver='sgd', max_iter=5000)
BNN = BaggingClassifier(estimator=NN, n_estimators=5, bootstrap=True, verbose=0, n_jobs=-1)
kf = KFold(n_splits=10)

for run in range(1, 11):
    # Create the output folder up front so a missing directory cannot
    # discard a fold's worth of training at save time.
    run_dir = '/content/drive/My Drive/Colab Notebooks/imbalanced/Run_' + str(run)
    os.makedirs(run_dir, exist_ok=True)

    # kf.split() is a generator without a length, so tell tqdm the fold count.
    folds = tqdm(kf.split(X), total=kf.get_n_splits())
    for iteration, (train_indices, test_indices) in enumerate(folds, start=1):
        x_train, y_train = X[train_indices], Y[train_indices]
        x_test, y_test = X[test_indices], Y[test_indices]

        # Fit the scaler on the training fold only, then apply it to both parts.
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        # Only the training fold is resampled; the test fold keeps its
        # original class balance.
        x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)
        x_train_res, y_train_res = ru.fit_resample(x_train_sm, y_train_sm)

        # Machine learning training and testing
        BNN.fit(x_train_res, y_train_res)
        y_pred = BNN.predict(x_test)
        y_proba = BNN.predict_proba(x_test)

        # Saving data: one row per held-out sample. `testind` is the row
        # position in X/Y, so folds can be stitched back together later.
        y_results = pd.DataFrame({
            'y_test': y_test,
            'y_pred': y_pred,
            'y_proba_0': y_proba[:, 0],
            'y_proba_1': y_proba[:, 1],
            'testind': test_indices,
        })
        y_results.to_csv(run_dir + '/y_results_cvsplit' + str(iteration) + '.csv')


10it [07:57, 47.76s/it]
10it [08:32, 51.27s/it]
10it [07:39, 45.95s/it]
10it [08:25, 50.58s/it]
10it [08:47, 52.75s/it]
10it [07:54, 47.47s/it]
10it [07:47, 46.74s/it]
10it [08:32, 51.28s/it]
10it [07:16, 43.62s/it]
10it [07:58, 47.89s/it]


In [1]:
# Final models: train on every labelled source and classify the BCUs.
#
# This cell builds its own scaler and resamplers, so it only needs the imports
# cell and the Drive mount - not leftover variables from the cross-validation
# cell. Fixes relative to the previous version:
#   * the BCU features are now standardised with the scaler fitted on the
#     training data (the model was trained on scaled inputs but was being
#     applied to raw BCU features);
#   * the model is fitted on the resampled training set, matching the pipeline
#     evaluated in the cross-validation cell (the resampled arrays were
#     computed but never used);
#   * the stray re-scaling of `x_test`, a leftover from the cross-validation
#     cell, is gone.

# --- Load the data once: it is identical for every run ----------------------
filepath = "/content/drive/My Drive/Colab Notebooks/Fermi-LAT Data/imputed_wrappered_12_withbcus.csv"
fulldata = pd.read_csv(filepath, index_col=0).dropna()

# Labelled sources (CLASS1 != 0) form the training set (we now use the entire
# labelled dataset for training).
data = fulldata.loc[fulldata['CLASS1'] != 0]
Y = np.array(data['CLASS1'])
X = np.array(data.drop(labels='CLASS1', axis=1))

Y[Y == 2] = 1  # non radio galaxies
Y[Y == 3] = 0  # radio galaxies

# BCU-only dataset (CLASS1 == 0) to apply the trained model to.
bcus = fulldata.loc[fulldata['CLASS1'] == 0]
x_bcus = np.array(bcus.drop('CLASS1', axis=1))

# --- Scaling and resampling (deterministic, so done once for all runs) ------
init_ratio = len(Y[Y == 0]) / len(Y[Y == 1])
scaler = StandardScaler()
# Same resampling recipe as the cross-validation cell: SMOTE up to five times
# the initial class ratio (capped at 1.0, the largest float ratio imblearn
# accepts), then under-sample the majority class to a 1:1 balance.
sm = SMOTE(random_state=42, sampling_strategy=min(1.0, init_ratio * 5))
ru = RandomUnderSampler(random_state=42, sampling_strategy=1)

x_train = scaler.fit_transform(X)
y_train = Y
# The BCUs must go through the scaler fitted on the training data.
x_bcus_scaled = scaler.transform(x_bcus)

x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)
x_train_res, y_train_res = ru.fit_resample(x_train_sm, y_train_sm)

# --- Model -------------------------------------------------------------------
NN = MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50), learning_rate='adaptive',
                   solver='sgd', max_iter=5000)
BNN = BaggingClassifier(estimator=NN, n_estimators=20, bootstrap=True)

for run in range(1, 11):
    # Training and applying the ML model. No random_state is set on the
    # network or the bagging ensemble, so each run yields a different model.
    BNN.fit(x_train_res, y_train_res)
    y_pred = BNN.predict(x_bcus_scaled)
    y_proba = BNN.predict_proba(x_bcus_scaled)

    # Saving the results: one row per BCU, in the same order as `bcus`.
    y_results = pd.DataFrame({
        'y_pred': y_pred,
        'y_proba_0': y_proba[:, 0],
        'y_proba_1': y_proba[:, 1],
    })
    run_dir = '/content/drive/My Drive/Colab Notebooks/imbalanced_bcu/Run ' + str(run)
    os.makedirs(run_dir, exist_ok=True)
    y_results.to_csv(run_dir + '/y_results_BCU.csv')


NameError: name 'pd' is not defined