In [1]:
import os
import tempfile
from itertools import product

# Must be set before TensorFlow is imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sns
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm



In [2]:
def balanced_accuracy(y_true,y_pred):
    """Return the balanced accuracy: the unweighted mean of the per-class recall (TPR).

    For a two-class problem this is ``0.5 * (TPR_class_0 + TPR_class_1)``.
    The classes are the distinct values found in ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Any shape is accepted; the values are flattened.
    y_pred : array-like
        Predicted labels with the same number of elements as ``y_true``
        (e.g. a ``(n, 1)`` column of rounded probabilities against ``(n,)`` labels).

    Returns
    -------
    float
        Mean over classes of (correct predictions of the class) / (true members of the class).

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of elements.
    """
    # np.asarray lets plain lists through; ravel makes (n, 1) and (n,) comparable.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size == 0:
        raise ValueError("balanced_accuracy needs at least one sample")
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true has {y_true.size} elements but y_pred has {y_pred.size}"
        )

    clases = np.unique(y_true)
    tpr_total = 0.0
    for clase in clases:
        is_clase = y_true == clase
        # Fraction of the true members of this class that were predicted as this class.
        tpr_total += np.mean(y_pred[is_clase] == clase)

    return float(tpr_total/len(clases))

In [3]:
# Read the embeddings table and discard the "Unnamed: 0" column
# (presumably an index saved with the CSV -- confirm), then store the
# school identifier ("Escuela") as an integer.
total_embeddings = pd.read_csv("total_embeddings_with_bf_p1q10_courses.csv").drop(columns="Unnamed: 0")
total_embeddings = total_embeddings.assign(Escuela=total_embeddings["Escuela"].astype(int))

In [4]:
# Rows whose course label is neither "Missing" nor "Intergroup", reduced to the
# (school, course) pair; the number of distinct pairs is the number of courses.
course_rows = total_embeddings.loc[
    ~total_embeddings["class_classif"].isin(["Missing", "Intergroup"]),
    ["Escuela", "class_classif"],
]
unique_courses = len(course_rows.value_counts())
unique_schools = total_embeddings["Escuela"].nunique()

# Number of simulations: `factor` runs per distinct course.
factor = 10
n_sim = unique_courses * factor

In [5]:
# Callback that writes model weights (not the full model) to `checkpoint_filepath`,
# keeping only the epoch with the highest monitored 'val_auc'.
checkpoint_filepath = '/tmp/checkpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_auc',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

### Standard prediction: random train/test split of the edges within each high school

In [6]:
# Balanced accuracy of the Random Forest and of the neural network on a random
# 80/20 split of one school's rows; one entry per simulation.
# (The "auc" in the names is historical: the stored metric is balanced_accuracy.)
acc_clf_auc = np.zeros((n_sim))
acc_ann_auc = np.zeros((n_sim))

# Cycle over the school labels that actually occur in the data instead of
# assuming they are exactly 0..unique_schools-1.
school_labels = np.sort(total_embeddings["Escuela"].unique())

for i in range(n_sim):
    # A new Keras model is built in every iteration; clearing the session keeps
    # Keras' global state from growing across the many models built here.
    tf.keras.backend.clear_session()

    tr_label = school_labels[i % len(school_labels)]
    school_rows = total_embeddings[total_embeddings["Escuela"] == tr_label]
    X = school_rows.drop(["Escuela","weight",'class_classif'],axis=1).values
    y = school_rows["weight"].values  # presumably a binary label (sigmoid output below) -- confirm

    # NOTE(review): random_state is fixed, so every repetition of the same school
    # reuses the same split; only the models' own randomness varies between
    # repetitions. Confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale with statistics from the training part only.
    sc = MinMaxScaler()
    emb_x_train = sc.fit_transform(X_train)
    emb_y_train = y_train
    emb_x_test = sc.transform(X_test)
    emb_y_test = y_test

    # Oversample the minority class of the training part only.
    ros = SMOTE(random_state=0,sampling_strategy="minority")
    emb_x_resampled, emb_y_resampled = ros.fit_resample(emb_x_train, emb_y_train)

    ####### Random Forest
    clf = RandomForestClassifier(max_depth=7,class_weight="balanced")
    clf.fit(emb_x_resampled,emb_y_resampled)
    acc_clf_auc[i] = balanced_accuracy(emb_y_test,clf.predict(emb_x_test))

    ####### Neural network
    # A private checkpoint directory per model: a checkpoint left behind by a
    # previous model (or another run) can never be loaded by mistake, and a
    # missing checkpoint fails loudly in load_weights.
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        checkpoint_filepath = os.path.join(checkpoint_dir, "checkpoint")
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=True,
            monitor='val_auc',
            mode='max',
            save_best_only=True)

        model = tf.keras.Sequential([
            tf.keras.layers.Dense(128,activation="relu",input_shape=(emb_x_train.shape[1],)),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(64,activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(32,activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(8,activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(1,activation="sigmoid")
        ])
        # The metric is named explicitly so that its validation value is logged
        # as 'val_auc', the key the checkpoint callback monitors.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                      loss="binary_crossentropy",
                      metrics=[tf.keras.metrics.AUC(name="auc")])

        # NOTE(review): the held-out split is also the validation data that picks
        # the checkpoint, so the NN score below is selected on the test labels.
        model_history = model.fit(emb_x_resampled,emb_y_resampled,epochs=500,verbose=0,batch_size=128,
                                  validation_data = (emb_x_test,emb_y_test),
                                  callbacks=[model_checkpoint_callback])

        # Restore the weights of the best epoch before scoring.
        model.load_weights(checkpoint_filepath)

        # np.round turns the sigmoid outputs into 0/1 predictions.
        acc_ann_auc[i] = balanced_accuracy(emb_y_test,np.round(model.predict(emb_x_test,verbose=0)))

    print(f" Epoch : {int(i)}, \n RF: {acc_clf_auc[int(i)]:.3f} NN: {acc_ann_auc[int(i)]:.3f}")



 Epoch : 0, 
 RF: 0.752 NN: 0.750
 Epoch : 1, 
 RF: 0.740 NN: 0.647
 Epoch : 2, 
 RF: 0.734 NN: 0.716
 Epoch : 3, 
 RF: 0.753 NN: 0.777
 Epoch : 4, 
 RF: 0.744 NN: 0.771
 Epoch : 5, 
 RF: 0.761 NN: 0.758
 Epoch : 6, 
 RF: 0.792 NN: 0.810
 Epoch : 7, 
 RF: 0.763 NN: 0.730
 Epoch : 8, 
 RF: 0.710 NN: 0.725
 Epoch : 9, 
 RF: 0.747 NN: 0.699
 Epoch : 10, 
 RF: 0.728 NN: 0.720
 Epoch : 11, 
 RF: 0.729 NN: 0.710
 Epoch : 12, 
 RF: 0.742 NN: 0.763
 Epoch : 13, 
 RF: 0.741 NN: 0.719
 Epoch : 14, 
 RF: 0.732 NN: 0.670
 Epoch : 15, 
 RF: 0.741 NN: 0.729
 Epoch : 16, 
 RF: 0.762 NN: 0.780
 Epoch : 17, 
 RF: 0.737 NN: 0.766
 Epoch : 18, 
 RF: 0.768 NN: 0.743
 Epoch : 19, 
 RF: 0.781 NN: 0.807
 Epoch : 20, 
 RF: 0.754 NN: 0.756
 Epoch : 21, 
 RF: 0.716 NN: 0.739
 Epoch : 22, 
 RF: 0.757 NN: 0.780
 Epoch : 23, 
 RF: 0.746 NN: 0.728
 Epoch : 24, 
 RF: 0.711 NN: 0.707
 Epoch : 25, 
 RF: 0.751 NN: 0.783
 Epoch : 26, 
 RF: 0.740 NN: 0.717
 Epoch : 27, 
 RF: 0.745 NN: 0.693
 Epoch : 28, 
 RF: 0.728 NN: 0

 Epoch : 231, 
 RF: 0.724 NN: 0.721
 Epoch : 232, 
 RF: 0.725 NN: 0.701
 Epoch : 233, 
 RF: 0.754 NN: 0.771
 Epoch : 234, 
 RF: 0.749 NN: 0.724
 Epoch : 235, 
 RF: 0.759 NN: 0.689
 Epoch : 236, 
 RF: 0.733 NN: 0.734
 Epoch : 237, 
 RF: 0.767 NN: 0.773
 Epoch : 238, 
 RF: 0.741 NN: 0.758
 Epoch : 239, 
 RF: 0.770 NN: 0.751
 Epoch : 240, 
 RF: 0.781 NN: 0.789
 Epoch : 241, 
 RF: 0.740 NN: 0.727
 Epoch : 242, 
 RF: 0.708 NN: 0.712
 Epoch : 243, 
 RF: 0.764 NN: 0.803
 Epoch : 244, 
 RF: 0.752 NN: 0.718
 Epoch : 245, 
 RF: 0.720 NN: 0.709
 Epoch : 246, 
 RF: 0.748 NN: 0.791
 Epoch : 247, 
 RF: 0.752 NN: 0.770
 Epoch : 248, 
 RF: 0.740 NN: 0.663
 Epoch : 249, 
 RF: 0.730 NN: 0.739
 Epoch : 250, 
 RF: 0.765 NN: 0.772
 Epoch : 251, 
 RF: 0.741 NN: 0.768
 Epoch : 252, 
 RF: 0.761 NN: 0.754
 Epoch : 253, 
 RF: 0.775 NN: 0.771
 Epoch : 254, 
 RF: 0.739 NN: 0.759
 Epoch : 255, 
 RF: 0.714 NN: 0.766
 Epoch : 256, 
 RF: 0.774 NN: 0.770
 Epoch : 257, 
 RF: 0.730 NN: 0.738
 Epoch : 258, 
 RF: 0.719 NN

### Saving the balanced-accuracy results

In [7]:
# Dump both score arrays to one text file: a header line with the array name,
# followed by one value per line.
names = ["acc_clf_auc","acc_ann_auc"]
score_arrays = [acc_clf_auc, acc_ann_auc]
with open("p1q10_accuracies_global_nsim_randomized_miguelmetric_2.txt", "w") as f:
    for name, scores in zip(names, score_arrays):
        f.write(name + "\n")
        f.writelines(str(value) + "\n" for value in scores)

### Summary of the balanced-accuracy scores (mean and standard deviation)

In [8]:
# Report mean and standard deviation of the per-simulation scores of each model.
print("************************")
for model_name, scores in (("Random Forest", acc_clf_auc), ("Neural network", acc_ann_auc)):
    print(f"The mean is {np.mean(scores) :2.3f} for the {model_name} and the std is {np.std(scores) :2.3f}")

************************
The mean is 0.746 for the Random Forest and the std is 0.020
The mean is 0.745 for the Neural network and the std is 0.033


In [6]:
# Balanced accuracy of the Random Forest and of the neural network when one
# course ("class_classif" group) of a school is held out as the test set and
# the rest of that school's rows are used for training.
# (The "auc" in the names is historical: the stored metric is balanced_accuracy.)
acc_clf_auc_classes = np.zeros((n_sim))
acc_ann_auc_classes = np.zeros((n_sim))

# Cycle over the school labels that actually occur in the data instead of
# assuming they are exactly 0..unique_schools-1.
school_labels = np.sort(total_embeddings["Escuela"].unique())

k_group = 0  # running index into the result arrays (one slot per school/course/repetition)
for i in range(len(school_labels)*factor):
    tr_label = school_labels[i % len(school_labels)]

    # Everything that depends only on the school is computed once per school.
    school_rows = total_embeddings[total_embeddings["Escuela"] == tr_label]
    school_features = school_rows.drop(["Escuela","weight","class_classif"],axis=1)
    school_targets = school_rows["weight"].values  # 1-D; presumably a binary label -- confirm

    # Courses to hold out: every label except the two placeholder values.
    # dropna() keeps this list consistent with the value_counts() call that
    # sized n_sim, which skips rows with a missing label by default.
    grupos = [
        grupo for grupo in school_rows["class_classif"].dropna().unique()
        if grupo not in ("Intergroup", "Missing")
    ]

    for grupo in grupos:
        # A new Keras model is built for every held-out course; clearing the
        # session keeps Keras' global state from growing across them.
        tf.keras.backend.clear_session()

        is_test = (school_rows["class_classif"] == grupo).values
        X_train = school_features[~is_test].values
        X_test = school_features[is_test].values
        y_train = school_targets[~is_test]
        y_test = school_targets[is_test]

        # Scale with statistics from the training part only.
        sc = MinMaxScaler()
        emb_x_train = sc.fit_transform(X_train)
        emb_y_train = y_train
        emb_x_test = sc.transform(X_test)
        emb_y_test = y_test

        # Oversample the minority class of the training part only.
        ros = SMOTE(random_state=0,sampling_strategy="minority")
        emb_x_resampled, emb_y_resampled = ros.fit_resample(emb_x_train, emb_y_train)

        ####### Random Forest
        clf = RandomForestClassifier(max_depth=7,class_weight="balanced")
        clf.fit(emb_x_resampled,emb_y_resampled)
        try:
            acc_clf_auc_classes[k_group] = balanced_accuracy(emb_y_test,clf.predict(emb_x_test))
        except Exception as err:
            # Best effort: keep the simulation running, but report the failure.
            # -1 is a sentinel and must be filtered out before averaging.
            print(f" RF scoring failed for school {tr_label}, group {grupo}: {err}")
            acc_clf_auc_classes[k_group] = -1

        ####### Neural network
        # A private checkpoint directory per model: a checkpoint left behind by
        # a previous model (or another run) can never be loaded by mistake, and
        # a missing checkpoint fails loudly in load_weights.
        with tempfile.TemporaryDirectory() as checkpoint_dir:
            checkpoint_filepath = os.path.join(checkpoint_dir, "checkpoint")
            model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
                filepath=checkpoint_filepath,
                save_weights_only=True,
                monitor='val_auc',
                mode='max',
                save_best_only=True)

            model = tf.keras.Sequential([
                tf.keras.layers.Dense(128,activation="relu",input_shape=(emb_x_train.shape[1],)),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(64,activation="relu"),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(32,activation="relu"),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(8,activation="relu"),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(1,activation="sigmoid")
            ])
            # The metric is named explicitly so that its validation value is
            # logged as 'val_auc', the key the checkpoint callback monitors.
            model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                          loss="binary_crossentropy",
                          metrics=[tf.keras.metrics.AUC(name="auc")])

            # NOTE(review): the held-out course is also the validation data that
            # picks the checkpoint, so the NN score is selected on the test labels.
            model_history = model.fit(emb_x_resampled,emb_y_resampled,epochs=500,verbose=0,batch_size=128,
                                      validation_data = (emb_x_test,emb_y_test),
                                      callbacks=[model_checkpoint_callback])

            # Restore the weights of the best epoch before scoring.
            model.load_weights(checkpoint_filepath)

            try:
                # np.round turns the sigmoid outputs into 0/1 predictions.
                acc_ann_auc_classes[k_group] = balanced_accuracy(emb_y_test,np.round(model.predict(emb_x_test,verbose=0)))
            except Exception as err:
                # Same best-effort policy and -1 sentinel as for the Random Forest.
                print(f" NN scoring failed for school {tr_label}, group {grupo}: {err}")
                acc_ann_auc_classes[k_group] = -1

        print(f" Epoch : {k_group}, \n RF: {acc_clf_auc_classes[k_group]:.3f} NN: {acc_ann_auc_classes[k_group]:.3f}")

        k_group += 1

 Epoch : 0, 
 RF: 0.506 NN: 0.539
 Epoch : 1, 
 RF: 0.560 NN: 0.541
 Epoch : 2, 
 RF: 0.571 NN: 0.557
 Epoch : 3, 
 RF: 0.516 NN: 0.521
 Epoch : 4, 
 RF: 0.628 NN: 0.631
 Epoch : 5, 
 RF: 0.500 NN: 0.500
 Epoch : 6, 
 RF: 0.515 NN: 0.518
 Epoch : 7, 
 RF: 0.497 NN: 0.519
 Epoch : 8, 
 RF: 0.554 NN: 0.513
 Epoch : 9, 
 RF: 0.521 NN: 0.571
 Epoch : 10, 
 RF: 0.657 NN: 0.646
 Epoch : 11, 
 RF: 0.618 NN: 0.652
 Epoch : 12, 
 RF: 0.653 NN: 0.660
 Epoch : 13, 
 RF: 0.659 NN: 0.655
 Epoch : 14, 
 RF: 0.675 NN: 0.679
 Epoch : 15, 
 RF: 0.713 NN: 0.693
 Epoch : 16, 
 RF: 0.637 NN: 0.610
 Epoch : 17, 
 RF: 0.612 NN: 0.571
 Epoch : 18, 
 RF: 0.428 NN: 0.491
 Epoch : 19, 
 RF: 0.723 NN: 0.505
 Epoch : 20, 
 RF: 0.525 NN: 0.500
 Epoch : 21, 
 RF: 0.631 NN: 0.593
 Epoch : 22, 
 RF: 0.602 NN: 0.599
 Epoch : 23, 
 RF: 0.540 NN: 0.602
 Epoch : 24, 
 RF: 0.582 NN: 0.562
 Epoch : 25, 
 RF: 0.631 NN: 0.645
 Epoch : 26, 
 RF: 0.623 NN: 0.630
 Epoch : 27, 
 RF: 0.638 NN: 0.595
 Epoch : 28, 
 RF: 0.598 NN: 0

 Epoch : 231, 
 RF: 0.522 NN: 0.599
 Epoch : 232, 
 RF: 0.647 NN: 0.680
 Epoch : 233, 
 RF: 0.533 NN: 0.646
 Epoch : 234, 
 RF: 0.513 NN: 0.542
 Epoch : 235, 
 RF: 0.559 NN: 0.576
 Epoch : 236, 
 RF: 0.586 NN: 0.560
 Epoch : 237, 
 RF: 0.531 NN: 0.511
 Epoch : 238, 
 RF: 0.621 NN: 0.499
 Epoch : 239, 
 RF: 0.497 NN: 0.521
 Epoch : 240, 
 RF: 0.478 NN: 0.503
 Epoch : 241, 
 RF: 0.500 NN: 0.502
 Epoch : 242, 
 RF: 0.536 NN: 0.568
 Epoch : 243, 
 RF: 0.532 NN: 0.588
 Epoch : 244, 
 RF: 0.658 NN: 0.646
 Epoch : 245, 
 RF: 0.601 NN: 0.698
 Epoch : 246, 
 RF: 0.665 NN: 0.655
 Epoch : 247, 
 RF: 0.630 NN: 0.648
 Epoch : 248, 
 RF: 0.675 NN: 0.682
 Epoch : 249, 
 RF: 0.714 NN: 0.678
 Epoch : 250, 
 RF: 0.644 NN: 0.660
 Epoch : 251, 
 RF: 0.594 NN: 0.506
 Epoch : 252, 
 RF: 0.483 NN: 0.496
 Epoch : 253, 
 RF: 0.614 NN: 0.601
 Epoch : 254, 
 RF: 0.536 NN: 0.527
 Epoch : 255, 
 RF: 0.641 NN: 0.624
 Epoch : 256, 
 RF: 0.614 NN: 0.621
 Epoch : 257, 
 RF: 0.542 NN: 0.620
 Epoch : 258, 
 RF: 0.570 NN

In [8]:
# Dump both per-course score arrays to one text file: a header line with the
# array name, followed by one value per line.
# NOTE(review): this file name starts with "p1q0" while the input CSV and the
# other results file use "p1q10" -- confirm the name is intended.
names = ["acc_clf_auc_classes","acc_ann_auc_classes"]
score_arrays = [acc_clf_auc_classes, acc_ann_auc_classes]
with open("p1q0_accuracies_courses_nsim_randomized_miguelmetric_2.txt", "w") as f:
    for name, scores in zip(names, score_arrays):
        f.write(name + "\n")
        f.writelines(str(value) + "\n" for value in scores)