In [None]:
import pandas as pd
import numpy as np
import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation / FutureWarning messages
# emitted by the pandas calls in later cells are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
# Read the raw dataset; all following preprocessing cells operate on `data`.
data = pd.read_csv(filepath_or_buffer="./Dataset/adult.csv")

In [None]:
# Cells holding the literal string '?' are turned into NaN so that pandas
# treats them as missing values.
data = data.mask(data == '?', np.nan)

In [None]:
# Fill the gaps in these categorical columns with each column's most frequent
# value (first entry of `mode()`).
#
# The filled Series is assigned back to the frame on purpose: calling
# `data[col].fillna(..., inplace=True)` mutates an intermediate Series obtained
# through chained indexing, which is not guaranteed to write through to `data`
# (it silently stops doing so under pandas Copy-on-Write).
for col in ['workclass', 'occupation', 'native-country']:
    most_frequent = data[col].mode()[0]
    data[col] = data[col].fillna(most_frequent)

In [None]:
# Remove the textual 'education' column. The numeric 'educational-num' column is
# kept and scaled later; presumably it carries the same information -- TODO confirm.
data = data.drop(columns=['education'])

# Collapse every non-'White' race label into the single category 'Other'.
# The result is assigned back instead of using `inplace=True` on `data['race']`:
# an in-place call on a chained selection is not guaranteed to modify `data`
# (and does not under pandas Copy-on-Write).
data['race'] = data['race'].replace(
    ['Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Black', 'Other'],
    'Other',
)

In [None]:
# Two-valued columns are encoded as 0/1 via explicit lookup tables.
# Any value missing from a table becomes NaN (behaviour of `Series.map`).
binary_codes = {
    'income': {'<=50K': 0, '>50K': 1},
    'gender': {'Female': 0, 'Male': 1},
    'race': {'Other': 0, 'White': 1},
}
for binary_column, codes in binary_codes.items():
    data[binary_column] = data[binary_column].map(codes)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns converted to integer codes. 'race' is already 0/1 at this point and is
# simply re-encoded along with the others.
categorical = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'native-country',
]

# One encoder object is re-fitted per column, so after the loop it only
# remembers the classes of the last column.
label_encoder = LabelEncoder()
for categorical_column in categorical:
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split repeatable.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that are standardised. 'gender' and 'income' are not in this list;
# they are copied over unscaled at the end of the cell.
features_to_scale = [
    'workclass', 'educational-num', 'marital-status', 'occupation', 'relationship',
    'race', 'native-country', 'age', 'fnlwgt', 'capital-gain', 'capital-loss',
    'hours-per-week',
]

# Counterfactual test set: identical rows to `test`, with the 0/1 gender flag inverted.
test_CT = test.copy()
test_CT['gender'] = 1 - test_CT['gender']

# The scaler is fitted on the training rows only and then applied to both test sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train[features_to_scale])
X_test_scaled, X_test_CT_scaled = (
    scaler.transform(unscaled[features_to_scale]) for unscaled in (test, test_CT)
)

# Wrap the scaled arrays in DataFrames that keep the original row index.
train_scaled, test_scaled, test_CT_scaled = (
    pd.DataFrame(scaled_values, columns=features_to_scale, index=unscaled.index)
    for scaled_values, unscaled in (
        (X_train_scaled, train),
        (X_test_scaled, test),
        (X_test_CT_scaled, test_CT),
    )
)

# Re-attach the unscaled protected attribute and the label (gender first, then income).
for scaled_frame, unscaled in (
    (train_scaled, train),
    (test_scaled, test),
    (test_CT_scaled, test_CT),
):
    scaled_frame['gender'] = unscaled['gender']
    scaled_frame['income'] = unscaled['income']

In [None]:
# "all" variant: every scaled feature plus gender, with the income label last.
all_export_columns = [
    'workclass', 'educational-num', 'marital-status', 'occupation', 'relationship',
    'race', 'gender', 'native-country', 'age', 'fnlwgt', 'capital-gain',
    'capital-loss', 'hours-per-week', 'income',
]

# Write the train, test and counterfactual-test splits without the row index.
for split_frame, csv_path in (
    (train_scaled, './Dataset/adult_all_train.csv'),
    (test_scaled, './Dataset/adult_all_test.csv'),
    (test_CT_scaled, './Dataset/adult_all_test_CT.csv'),
):
    split_frame[all_export_columns].to_csv(csv_path, index=False)

In [None]:
# "-1" variant: the full column set without 'educational-num', 'native-country'
# and 'fnlwgt'.
minus1_export_columns = [
    'workclass', 'marital-status', 'occupation', 'relationship',
    'race', 'gender', 'age', 'capital-gain', 'capital-loss', 'hours-per-week',
    'income',
]

# Write the train, test and counterfactual-test splits without the row index.
for split_frame, csv_path in (
    (train_scaled, './Dataset/adult_-1_train.csv'),
    (test_scaled, './Dataset/adult_-1_test.csv'),
    (test_CT_scaled, './Dataset/adult_-1_test_CT.csv'),
):
    split_frame[minus1_export_columns].to_csv(csv_path, index=False)

In [None]:
# "-2" variant: the "-1" column set without 'occupation', 'capital-gain' and
# 'capital-loss'.
minus2_export_columns = [
    'workclass', 'marital-status', 'relationship',
    'race', 'gender', 'age', 'hours-per-week', 'income',
]

# Write the train, test and counterfactual-test splits without the row index.
for split_frame, csv_path in (
    (train_scaled, './Dataset/adult_-2_train.csv'),
    (test_scaled, './Dataset/adult_-2_test.csv'),
    (test_CT_scaled, './Dataset/adult_-2_test_CT.csv'),
):
    split_frame[minus2_export_columns].to_csv(csv_path, index=False)

In [None]:
# "-3" variant: only marital status, relationship, gender and weekly hours
# remain as predictors, followed by the income label.
minus3_export_columns = [
    'marital-status', 'relationship',
    'gender', 'hours-per-week', 'income',
]

# Write the train, test and counterfactual-test splits without the row index.
for split_frame, csv_path in (
    (train_scaled, './Dataset/adult_-3_train.csv'),
    (test_scaled, './Dataset/adult_-3_test.csv'),
    (test_CT_scaled, './Dataset/adult_-3_test_CT.csv'),
):
    split_frame[minus3_export_columns].to_csv(csv_path, index=False)

In [None]:
# Unlike the other exports this one keeps the row index (written as the first
# column); the final cell of the notebook reads that column back as the 'ID'.
test_scaled.to_csv(path_or_buf='./Dataset/adult_test_index.csv', index=True)

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Connect to an H2O server (h2o.init starts a local one when none is reachable).
h2o.init()

# Full-feature variant: training split, test split and gender-flipped test split.
train_all, test_all, test_all_CT = (
    h2o.upload_file(csv_path)
    for csv_path in (
        "./Dataset/adult_all_train.csv",
        "./Dataset/adult_all_test.csv",
        "./Dataset/adult_all_test_CT.csv",
    )
)

# Target column and predictor list (every column except the target).
y = "income"
x = train_all.columns
x.remove(y)

# The 0/1 label has to be categorical so AutoML treats this as classification.
for h2o_frame in (train_all, test_all, test_all_CT):
    h2o_frame[y] = h2o_frame[y].asfactor()

aml_all = H2OAutoML(
    nfolds=10,
    max_models=10,
    seed=1,
    max_runtime_secs=120,
    include_algos=["GLM", "DeepLearning", "DRF", "GBM", "StackedEnsemble"],
)
aml_all.train(x=x, y=y, training_frame=train_all)

# Leaderboard of every model produced by this run.
lb_all = aml_all.leaderboard
lb_all.head(rows=lb_all.nrows)

###################

# "-1" feature variant: training split, test split and gender-flipped test split.
train_1 = h2o.upload_file("./Dataset/adult_-1_train.csv")
test_1 = h2o.upload_file("./Dataset/adult_-1_test.csv")
test_1_CT = h2o.upload_file("./Dataset/adult_-1_test_CT.csv")

# Target column and predictor list (every column except the target).
y_1 = "income"
x_1 = train_1.columns
x_1.remove(y_1)

# Convert the label to a categorical column. This section's own target name
# (y_1) is used here; the earlier code indexed with `y` from the full-feature
# section, which only worked because both happen to be "income".
for h2o_frame in (train_1, test_1, test_1_CT):
    h2o_frame[y_1] = h2o_frame[y_1].asfactor()

aml_1 = H2OAutoML(
    nfolds=10,
    max_models=10,
    seed=1,
    max_runtime_secs=120,
    include_algos=["GLM", "DeepLearning", "DRF", "GBM", "StackedEnsemble"],
)
aml_1.train(x=x_1, y=y_1, training_frame=train_1)

# Leaderboard of every model produced by this run.
lb_1 = aml_1.leaderboard
lb_1.head(rows=lb_1.nrows)

###################

# "-2" feature variant: training split, test split and gender-flipped test split.
train_2 = h2o.upload_file("./Dataset/adult_-2_train.csv")
test_2 = h2o.upload_file("./Dataset/adult_-2_test.csv")
test_2_CT = h2o.upload_file("./Dataset/adult_-2_test_CT.csv")

# Target column and predictor list (every column except the target).
y_2 = "income"
x_2 = train_2.columns
x_2.remove(y_2)

# Convert the label to a categorical column. This section's own target name
# (y_2) is used here; the earlier code indexed with `y` from the full-feature
# section, which only worked because both happen to be "income".
for h2o_frame in (train_2, test_2, test_2_CT):
    h2o_frame[y_2] = h2o_frame[y_2].asfactor()

aml_2 = H2OAutoML(
    nfolds=10,
    max_models=10,
    seed=1,
    max_runtime_secs=120,
    include_algos=["GLM", "DeepLearning", "DRF", "GBM", "StackedEnsemble"],
)
aml_2.train(x=x_2, y=y_2, training_frame=train_2)

# Leaderboard of every model produced by this run.
lb_2 = aml_2.leaderboard
lb_2.head(rows=lb_2.nrows)

###################

# "-3" feature variant: training split, test split and gender-flipped test split.
train_3 = h2o.upload_file("./Dataset/adult_-3_train.csv")
test_3 = h2o.upload_file("./Dataset/adult_-3_test.csv")
test_3_CT = h2o.upload_file("./Dataset/adult_-3_test_CT.csv")

# Target column and predictor list (every column except the target).
y_3 = "income"
x_3 = train_3.columns
x_3.remove(y_3)

# Convert the label to a categorical column. This section's own target name
# (y_3) is used here; the earlier code indexed with `y` from the full-feature
# section, which only worked because both happen to be "income".
for h2o_frame in (train_3, test_3, test_3_CT):
    h2o_frame[y_3] = h2o_frame[y_3].asfactor()

aml_3 = H2OAutoML(
    nfolds=10,
    max_models=10,
    seed=1,
    max_runtime_secs=120,
    include_algos=["GLM", "DeepLearning", "DRF", "GBM", "StackedEnsemble"],
)
aml_3.train(x=x_3, y=y_3, training_frame=train_3)

# Leaderboard of every model produced by this run; as the last expression of
# the cell it is also what the notebook displays.
lb_3 = aml_3.leaderboard
lb_3.head(rows=lb_3.nrows)

In [None]:
# Persist each AutoML leaderboard as CSV, overwriting files from earlier runs.
for leaderboard_frame, csv_path in (
    (lb_all, "./leadboard_all.csv"),
    (lb_1, "./leadboard_-1.csv"),
    (lb_2, "./leadboard_-2.csv"),
    (lb_3, "./leadboard_-3.csv"),
):
    h2o.export_file(leaderboard_frame, path=csv_path, force=True)

In [None]:
import pandas as pd

# Re-read the four exported leaderboards with pandas.
# NOTE: this rebinds lb_all / lb_1 / lb_2 / lb_3 from H2O frames to pandas
# DataFrames, exactly as before.
lb_all, lb_1, lb_2, lb_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./leadboard_all.csv",
        "./leadboard_-1.csv",
        "./leadboard_-2.csv",
        "./leadboard_-3.csv",
    )
)

# Stack them into one table with a fresh 0..n-1 index and save it.
combined_df = pd.concat([lb_all, lb_1, lb_2, lb_3], ignore_index=True)
combined_df.to_csv("./leadboard.csv", index=False)

In [None]:
import pandas as pd

# Leaderboard of the AutoML run trained on the full feature set.
# NOTE: `data` is rebound to this leaderboard table here.
file_path = './leadboard_all.csv'
data = pd.read_csv(file_path)

# The first leaderboard column lists the model names known to the H2O session.
model_names = data[data.columns[0]].tolist()

# For every model: save it under ./Adult_Model/, score the original and the
# gender-flipped test frames, and export both prediction tables.
for model_name in model_names:
    model = h2o.get_model(model_name)
    h2o.save_model(model=model, path="./Adult_Model/", force=True)

    preds, preds_CT = model.predict(test_all), model.predict(test_all_CT)

    h2o.export_file(preds, path = f"./Adult_Res/{model_name}.csv", force = True)
    h2o.export_file(preds_CT, path = f"./Adult_Res/{model_name}_CT.csv", force = True)

In [None]:
import pandas as pd

# Leaderboard of the AutoML run trained on the "-1" feature set.
# NOTE: `data` is rebound to this leaderboard table here.
file_path = './leadboard_-1.csv'
data = pd.read_csv(file_path)

# The first leaderboard column lists the model names known to the H2O session.
model_names = data[data.columns[0]].tolist()

# For every model: save it under ./Adult_Model/, score the original and the
# gender-flipped test frames, and export both prediction tables.
for model_name in model_names:
    model = h2o.get_model(model_name)
    h2o.save_model(model=model, path="./Adult_Model/", force=True)

    preds, preds_CT = model.predict(test_1), model.predict(test_1_CT)

    h2o.export_file(preds, path = f"./Adult_Res/{model_name}.csv", force = True)
    h2o.export_file(preds_CT, path = f"./Adult_Res/{model_name}_CT.csv", force = True)

In [None]:
import pandas as pd

# Leaderboard of the AutoML run trained on the "-2" feature set.
# NOTE: `data` is rebound to this leaderboard table here.
file_path = './leadboard_-2.csv'
data = pd.read_csv(file_path)

# The first leaderboard column lists the model names known to the H2O session.
model_names = data[data.columns[0]].tolist()

# For every model: save it under ./Adult_Model/, score the original and the
# gender-flipped test frames, and export both prediction tables.
for model_name in model_names:
    model = h2o.get_model(model_name)
    h2o.save_model(model=model, path="./Adult_Model/", force=True)

    preds, preds_CT = model.predict(test_2), model.predict(test_2_CT)

    h2o.export_file(preds, path = f"./Adult_Res/{model_name}.csv", force = True)
    h2o.export_file(preds_CT, path = f"./Adult_Res/{model_name}_CT.csv", force = True)

In [None]:
import pandas as pd

# Leaderboard of the AutoML run trained on the "-3" feature set.
# NOTE: `data` is rebound to this leaderboard table here.
file_path = './leadboard_-3.csv'
data = pd.read_csv(file_path)

# The first leaderboard column lists the model names known to the H2O session.
model_names = data[data.columns[0]].tolist()

# For every model: save it under ./Adult_Model/, score the original and the
# gender-flipped test frames, and export both prediction tables.
for model_name in model_names:
    model = h2o.get_model(model_name)
    h2o.save_model(model=model, path="./Adult_Model/", force=True)

    preds, preds_CT = model.predict(test_3), model.predict(test_3_CT)

    h2o.export_file(preds, path = f"./Adult_Res/{model_name}.csv", force = True)
    h2o.export_file(preds_CT, path = f"./Adult_Res/{model_name}_CT.csv", force = True)

In [None]:
import pandas as pd

# Combined leaderboard of all four AutoML runs; its first column holds the model names.
file_path = './leadboard.csv'
data = pd.read_csv(file_path)
model_names = data.iloc[:, 0].tolist()

# First column of the saved test split, i.e. the row index that was written
# together with the test frame; used below as the sample ID.
df = pd.read_csv('./Dataset/adult_test_index.csv')
first_column = df.iloc[:, 0]

# One column per model: 1 - |p0(gender-flipped) - p0(original)| for every test
# row, rounded to 5 decimals. A value of 1 means the flip left p0 unchanged.
CT_res_all = pd.DataFrame()
for model_name in model_names:
    flipped_preds = pd.read_csv(f'./Adult_Res/{model_name}_CT.csv')
    original_preds = pd.read_csv(f'./Adult_Res/{model_name}.csv')
    p0_gap = (flipped_preds['p0'] - original_preds['p0']).abs()
    CT_res_all[f'{model_name}'] = (1 - p0_gap).round(5)

# Put the sample IDs in front of the per-model score columns.
CT_res_all.insert(0, 'ID', first_column)

# NOTE(review): "Audlt" looks like a typo for "Adult"; the file name is kept
# unchanged because other code may read this exact path.
output_path = './Audlt_Pij.csv'
CT_res_all.to_csv(output_path, index=False)
