In [None]:
# Core libraries for the data-preparation cells below.
import pandas as pd
import numpy as np
import warnings
# No category or module filter is given, so this silences every warning
# raised for the rest of the session.
warnings.filterwarnings("ignore")

In [ ]:
# Load the raw dataset; later cells scale, split and export it.
data = pd.read_csv(filepath_or_buffer="./Dataset/law.csv")

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns standardised to zero mean / unit variance.
scaled_cols = ['LSAT', 'UGPA']

# Split BEFORE scaling so the held-out rows play no part in estimating the
# scaler's mean and standard deviation (avoids train/test leakage).
# random_state is fixed, so the row partition is reproducible.
train, test = train_test_split(data, test_size=0.3, random_state=0)

# Explicit copies: the column assignments below must act on independent
# frames, not on slices of `data`.
train = train.copy()
test = test.copy()

# Fit on the training rows only, then apply that same transform to the
# test rows.
scaler = StandardScaler()
train[scaled_cols] = scaler.fit_transform(train[scaled_cols])
test[scaled_cols] = scaler.transform(test[scaled_cols])

In [ ]:
# Counterfactual twin of the test split: every column is unchanged except
# `gender`, which is replaced by 1 - gender.
# NOTE(review): this presumes `gender` is encoded as 0/1 — confirm against
# the dataset.
test_CT = test.assign(gender=1 - test['gender'])

In [ ]:
# Write the three frames without their pandas index; these files are what
# gets uploaded to H2O further down.
for _frame, _csv_path in (
    (train, './Dataset/law_train.csv'),
    (test, './Dataset/law_test.csv'),
    (test_CT, './Dataset/law_test_CT.csv'),
):
    _frame.to_csv(_csv_path, index=False)

# Extra copy of the test split that keeps the DataFrame index as its first
# column, so each test row can later be traced back to its index label.
test.to_csv('./Dataset/law_test_index.csv', index=True)

In [ ]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

# Upload the exported CSV splits into the H2O cluster.
train_all = h2o.upload_file("./Dataset/law_train.csv")
test_all = h2o.upload_file("./Dataset/law_test.csv")
test_all_CT = h2o.upload_file("./Dataset/law_test_CT.csv")

# Response column, and every remaining column as a predictor.
y = "ZFYA"
x = list(train_all.columns)
x.remove(y)

# Cast the response to numeric in all three frames.
for _h2o_frame in (train_all, test_all, test_all_CT):
    _h2o_frame[y] = _h2o_frame[y].asnumeric()

# AutoML run restricted to the listed algorithm families, with limits on
# both the number of models and the wall-clock runtime.
aml_all = H2OAutoML(
    nfolds=10,
    max_models=20,
    seed=1,
    max_runtime_secs=120,
    include_algos=["GLM", "DeepLearning", "DRF", "GBM", "StackedEnsemble"],
)
aml_all.train(x=x, y=y, training_frame=train_all)

# Show every row of the leaderboard (cell output).
lb_all = aml_all.leaderboard
lb_all.head(rows=lb_all.nrows)

In [ ]:
# Write the leaderboard to CSV, overwriting any earlier export; later cells
# read the model IDs back from this file.
h2o.export_file(lb_all, path="./leadboard_law_all.csv", force=True)

In [ ]:
import pandas as pd

# The first column of the exported leaderboard holds the model names.
file_path = './leadboard_law_all.csv'
data = pd.read_csv(file_path)
model_names = data.iloc[:, 0].tolist()

# For each leaderboard model: fetch it from the cluster, save it to disk,
# then score both the original and the counterfactual test frame and export
# each set of predictions under the model's name.
for model_name in model_names:
    model = h2o.get_model(model_name)
    h2o.save_model(model=model, path="./Law_Model/", force=True)

    # Predictions on the unmodified test set.
    preds = model.predict(test_all)
    h2o.export_file(preds, path=f"./Law_Res/{model_name}.csv", force=True)

    # Predictions on the gender-flipped test set.
    preds_CT = model.predict(test_all_CT)
    h2o.export_file(preds_CT, path=f"./Law_Res/{model_name}_CT.csv", force=True)

In [ ]:
import pandas as pd

# The first column of the exported leaderboard holds the model names.
file_path = './leadboard_law_all.csv'
data = pd.read_csv(file_path)
model_names = data.iloc[:, 0].tolist()

# First column of the indexed test export; used as the row identifier.
df = pd.read_csv('./Dataset/law_test_index.csv')
first_column = df.iloc[:, 0]

# One output column per model, one row per test instance.
CT_res_all = pd.DataFrame()

# For each model, load its predictions on the original and counterfactual
# test sets and store the absolute relative change per row.
for model_name in model_names:
    base_preds = pd.read_csv(f'./Law_Res/{model_name}.csv')
    ct_preds = pd.read_csv(f'./Law_Res/{model_name}_CT.csv')

    # |(original - counterfactual) / original|, rounded to 5 decimals.
    # NOTE(review): an original prediction of exactly 0 produces inf/NaN
    # here — confirm whether that needs special handling.
    relative_shift = (base_preds['predict'] - ct_preds['predict']) / base_preds['predict']
    CT_res_all[f'{model_name}'] = relative_shift.abs().round(5)

# Put the row identifier in front of the per-model columns.
CT_res_all.insert(0, 'ID', first_column)

output_path = './Law_Pij.csv'
CT_res_all.to_csv(output_path, index=False)