<a href="https://colab.research.google.com/github/IoanPJ/Final_year_project_IMPJ/blob/main/ML/imbalanced_CL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
import imblearn
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import BaggingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, log_loss, roc_auc_score, RocCurveDisplay, roc_curve
import numpy as np
import pandas as pd
#from IMPJ import DataProcessor
import random
import matplotlib.pyplot as plt
from os.path import basename, exists
import matplotlib as mpl
import random
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.multiclass import OneVsRestClassifier
from tqdm import tqdm

#dp = DataProcessor()

def download(url):
    filename = basename(url)
    if not exists(filename):
        from urllib.request import urlretrieve
        local, _ = urlretrieve(url, filename)
        print('Downloaded ' + local)

download('https://github.com/AllenDowney/AstronomicalData/raw/main/' + 'az-paper-twocol.mplstyle')
plt.style.use('./az-paper-twocol.mplstyle')

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
test_size = 0.3

filepath = "/content/drive/My Drive/Colab Notebooks/chiaro_12.csv"
data = pd.read_csv(filepath, index_col=0)
data = data.dropna()
Y = np.array(data['CLASS1'])
X = np.array(data.drop(labels='CLASS1',axis=1))
#X,Y = dp.choose_2_vars('CLASS1',data,1,2)

In [36]:
Y[Y==2]=1 # non radio galaxies
Y[Y==3]=0 # radio galaxies

init_ratio = len(Y[Y==0])/len(Y[Y==1])

sm = SMOTE(random_state=42, sampling_strategy=init_ratio*5)
ru = RandomUnderSampler(random_state=42, sampling_strategy=1)

scaler = StandardScaler()


In [53]:
NN = MLPClassifier(activation='tanh',hidden_layer_sizes=(50,50),learning_rate='constant',solver='sgd',
                   max_iter=5000,random_state=5)
BNN = BaggingClassifier(estimator=NN,n_estimators=5,bootstrap=True,verbose=0,n_jobs=-1)
#ovr = OneVsRestClassifier(estimator=BNN, n_jobs=-1,verbose=10)
iteration=0
kf = KFold(n_splits=10)

y_pred=pd.DataFrame()
y_proba=pd.DataFrame()
y_test_df=pd.DataFrame()


for train_indices, test_indices in tqdm(kf.split(X)):
    iteration+=1
    x_train=X[train_indices]
    y_train = Y[train_indices]
    x_test=X[test_indices]
    y_test=Y[test_indices]
    y_test_df['iteration '+str(iteration)] = y_test
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    #print(f'Initial Ratio: {len(y_train[y_train==0])/len(y_train[y_train==1])}')
    #print(f'Original Dataset Shape: {Counter(y_train)}')
    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)
    x_train_res, y_train_res = ru.fit_resample(x_train_sm,y_train_sm)
    #print(f'New Ratio: {len(y_train_res[y_train_res==0])/len(y_train_res[y_train_res==1])}')
    #print(f'Resampled dataset shape {Counter(y_train_res)}')
    BNN.fit(x_train_res, y_train_res)

    y_pred['iteration '+str(iteration)] = BNN.predict(x_test)
    y_proba['class' + str(0) + ' iteration '+ str(iteration)] = BNN.predict_proba(x_test).T[0]
    y_proba['class'+str(1)+' iteration '+str(iteration)]= BNN.predict_proba(x_test).T[1]
#BNN.fit(x_train_res,y_train_res)
y_pred.to_csv("/content/drive/My Drive/Colab Notebooks/imbalanced_trials/y_pred.csv")
y_proba.to_csv("/content/drive/My Drive/Colab Notebooks/imbalanced_trials/y_proba.csv")
y_test_df.to_csv("/content/drive/My Drive/Colab Notebooks/imbalanced_trials/y_test.csv")

10it [09:21, 56.11s/it]


In [49]:
print(np.unique(y_train))

''' SCORING METRICS '''

accuracy = accuracy_score(y_test, y_pred)*100
confusion = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test,y_pred)
roc_auc_weightedavg = roc_auc_score(y_test,y_pred,average='weighted')
logloss = log_loss(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
fpr, tpr, thresholds = roc_curve(y_test,y_proba.T[1])

print('The Neural Network accuracy is ' + str(accuracy))
print('The Neural Network ROC AUC Scores are: '+str(roc_auc))
print("The Neural Network's Weighted Average ROC AUC Score is: " + str(roc_auc_weightedavg))
print("The Neural Network's Logarithmic Loss Score is: " + str(logloss))
print('The Neural Network F1 Score is: '+str(f1))
print('The Neural Network Confusion Matrix is:')
print(confusion)

resultcols = ['Accuracy', 'ROC AUC', 'ROC Weighted Av', 'Logarithmic Loss',
            'F1 Score', 'CMatrix11','CMatrix12', 'CMatrix21', 'CMatrix22']
resultarray = np.array((accuracy,roc_auc,roc_auc_weightedavg,logloss,f1,
                        confusion[0,0],confusion[0,1],confusion[1,0],confusion[1,1]))
rocresultcols = ['FPR','TPR','Thresholds']
rocresultarray = np.array([fpr,tpr,thresholds])
#print(rocresultarray)
results = pd.DataFrame([resultarray],columns=resultcols)
rocresults = pd.DataFrame(np.array([fpr,tpr,thresholds]).transpose(),columns=rocresultcols)

results.to_csv("/content/drive/My Drive/Colab Notebooks/imbalanced_trials/results1.csv")
rocresults.to_csv("/content/drive/My Drive/Colab Notebooks/imbalanced_trials/rocresults1.csv")


fig, ax = plt.subplots(figsize=(5,5))

ax.plot(fpr,tpr,linestyle='dashed',marker='x')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')

[0 1]


ValueError: Classification metrics can't handle a mix of binary and multilabel-indicator targets