In [1]:
import numpy as np
import pandas as pd 

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score,classification_report,confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from itertools import product
from tqdm.notebook import tqdm

2022-05-27 16:50:12.464752: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-27 16:50:12.464774: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
total_embeddings = pd.read_csv("total_embeddings_with_bf_p1q10_courses.csv")
total_embeddings.drop("Unnamed: 0",axis=1,inplace=True)
total_embeddings["Escuela"] = total_embeddings["Escuela"].astype(int)

In [3]:
unique_courses = len(total_embeddings.loc[(total_embeddings["class_classif"]!="Missing")&(total_embeddings["class_classif"]!="Intergroup"),["Escuela","class_classif"]].value_counts())
unique_schools = total_embeddings["Escuela"].nunique()
factor = 10
n_sim = unique_courses*factor #

### Standard prediction, random chosen at the edges for each high school

In [4]:
acc_clf_auc = np.zeros((n_sim))
acc_ann_auc = np.zeros((n_sim))
for i in tqdm(range(n_sim)):
    tr_label = i%unique_schools
    X = total_embeddings[total_embeddings["Escuela"] == tr_label].drop(["Escuela","weight",'class_classif'],axis=1).values
    y = total_embeddings[total_embeddings["Escuela"] == tr_label]["weight"].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    sc = MinMaxScaler()
    sc.fit(X_train)
    emb_x_train = sc.transform(X_train)
    emb_y_train = y_train
    emb_x_test = sc.transform(X_test)
    emb_y_test = y_test
    ros = SMOTE(random_state=0,sampling_strategy="minority")
    emb_x_resampled, emb_y_resampled = ros.fit_resample(emb_x_train, emb_y_train)
    clf = RandomForestClassifier(max_depth=7,class_weight="balanced")
    clf.fit(emb_x_resampled,emb_y_resampled)
    acc_clf_auc[int(i)] = roc_auc_score(emb_y_test,clf.predict(emb_x_test))
    #######
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128,activation="relu",input_shape=(emb_x_train.shape[1],)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64,activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(32,activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(8,activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1,activation="sigmoid")
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=10e-5),
             loss="binary_crossentropy",
                 metrics=["AUC"])
    model_history = model.fit(emb_x_resampled,emb_y_resampled,epochs=250,verbose=0,batch_size=128,
                             #callbacks=[tf.keras.callbacks.EarlyStopping(monitor="auc",patience=50,)])
                             )
    #######º
    acc_ann_auc[int(i)] = roc_auc_score(emb_y_test,model.predict(emb_x_test))

  0%|          | 0/390 [00:00<?, ?it/s]

2022-05-27 16:50:18.903783: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-27 16:50:18.903811: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-27 16:50:18.903834: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iT28200): /proc/driver/nvidia/version does not exist
2022-05-27 16:50:18.904027: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Saving the AUC result

In [5]:
names = ["acc_clf_auc","acc_ann_auc"]
with open("p1q10_accuracies_global_nsim_randomized.txt", "w") as f:
    for i,lista in enumerate([acc_clf_auc,acc_ann_auc]):
        f.write(names[i]+"\n")
        for elem in lista:
            f.write(str(elem) +"\n")

### Visualization of auc

In [6]:
print("************************")
print(f"The mean is {np.mean(acc_clf_auc) :2.3f} for the Random Forest and the std is {np.std(acc_clf_auc) :2.3f}")
print(f"The mean is {np.mean(acc_ann_auc) :2.3f} for the Random Forest and the std is {np.std(acc_ann_auc) :2.3f}")

************************
The mean is 0.747 for the Random Forest and the std is 0.019
The mean is 0.772 for the Random Forest and the std is 0.039


In [7]:
acc_clf_auc_classes = np.zeros((n_sim))
acc_ann_auc_classes = np.zeros((n_sim))
k_group = 0
for i in tqdm(range(unique_schools*factor)):
    tr_label = i%unique_schools
    grupos = list(total_embeddings[total_embeddings["Escuela"] == tr_label]["class_classif"].unique())
    if "Intergroup" in grupos:
        grupos.remove("Intergroup")
    if "Missing" in grupos:
        grupos.remove("Missing")
    for grupo in grupos:
        X = total_embeddings[total_embeddings["Escuela"] == tr_label].drop(["Escuela","weight"],axis=1)
        y = total_embeddings[total_embeddings["Escuela"] == tr_label][["weight","class_classif"]]
        X_train = X[X["class_classif"] != grupo].drop("class_classif",axis=1).values
        X_test = X[X["class_classif"] == grupo].drop("class_classif",axis=1).values
        y_train = y[y["class_classif"] != grupo].drop("class_classif",axis=1).values
        y_test = y[y["class_classif"] == grupo].drop("class_classif",axis=1).values
        sc = MinMaxScaler()
        sc.fit(X_train)
        emb_x_train = sc.transform(X_train)
        emb_y_train = y_train
        emb_x_test = sc.transform(X_test)
        emb_y_test = y_test
        ros = SMOTE(random_state=0,sampling_strategy="minority")
        emb_x_resampled, emb_y_resampled = ros.fit_resample(emb_x_train, emb_y_train)
        clf = RandomForestClassifier(max_depth=7,class_weight="balanced")
        clf.fit(emb_x_resampled,emb_y_resampled)
        try:
            acc_clf_auc_classes[k_group] = roc_auc_score(emb_y_test,clf.predict(emb_x_test))
        except:
            acc_clf_auc_classes[k_group] = "Homogeneous"
        #######
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(128,activation="relu",input_shape=(emb_x_train.shape[1],)),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(64,activation="relu"),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(32,activation="relu"),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(8,activation="relu"),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(1,activation="sigmoid")
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=10e-5),
                 loss="binary_crossentropy",
                     metrics=["AUC"])
        model_history = model.fit(emb_x_resampled,emb_y_resampled,epochs=250,verbose=0,batch_size=128,
                                 #callbacks=[tf.keras.callbacks.EarlyStopping(patience=50)])
                                 )
        #######º
        try:
            acc_ann_auc_classes[k_group] = roc_auc_score(emb_y_test,model.predict(emb_x_test))
        except:
            acc_ann_auc_classes[k_group] = "Homogeneous"
        k_group += 1
        

  0%|          | 0/130 [00:00<?, ?it/s]

total_elem = []
for lista in acc_clf_auc_classes:
    for elem in lista:
        if elem != "Homogeneous":
            total_elem.append(elem)
print(f"La media es {np.mean(total_elem) :3.2f} y la std es {np.std(total_elem):3.2f} ")

total_elem_2 = []
for lista in acc_ann_auc_classes:
    for elem in lista:
        if elem != "Homogeneous":
            total_elem_2.append(elem)
print(f"La media es {np.mean(total_elem_2) :3.2f} y la std es {np.std(total_elem_2):3.2f} ")

In [8]:
names = ["acc_clf_auc_classes","acc_ann_auc_classes"]
with open("p1q0_accuracies_courses_nsim_randomized.txt", "w") as f:
    for i,lista in enumerate([acc_clf_auc_classes,acc_ann_auc_classes]):
        f.write(names[i]+"\n")
        for elem in lista:
            f.write(str(elem) +"\n")