### Importing libraries

In [1]:
import pandas
import seaborn
import sklearn.preprocessing
import sklearn.impute
import sklearn.feature_selection
import joblib
import json
%matplotlib inline

### Loading the test data

In [2]:
# Read the test feature table and use its "id" column as the row index.
df = pandas.read_csv('./data/high_salary.test_features.csv', index_col="id")

### Dropping irrelevant columns

In [3]:
# Columns that are discarded before any preprocessing takes place.
columns_to_drop = [
    'social-security-number',
    'house-number',
    'fnlwgt',
    'native-country-code',
]
df = df.drop(columns=columns_to_drop)
df.head(2)

Unnamed: 0_level_0,age-group,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6111,0.0,private,11th,7.0,married-civ-spouse,protective-serv,husband,white,male,0.0,0.0,2.0,united-states
11214,1.0,private,hs-grad,9.0,never-married,sales,unmarried,white,female,0.0,0.0,1.0,mexico


In [4]:
# Show (rows, columns) of the test frame as a quick sanity check.
df.shape

(6967, 13)

In [5]:
# Every column fed into preprocessing, in the order the imputer will see them.
features = [
    # text-valued columns
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
    # numeric-coded columns
    'age-group', 'education-num', 'capitalgain', 'capitalloss', 'hoursperweek',
]


In [6]:
# Keep only the modelling columns, in the order given by `features`.
# Despite the variable name, this selection also contains the numeric-coded columns.
data_test_categorical_features = df.loc[:, features]

In [7]:
# Count the missing entries in each selected column before imputation.
data_test_categorical_features.isnull().sum()

workclass         345
education           7
marital-status      9
occupation        347
relationship       13
race                7
sex                 7
native-country    123
age-group          13
education-num       8
capitalgain         8
capitalloss        10
hoursperweek       13
dtype: int64

In [8]:
# Imputer that fills each missing entry with its column's most frequent value.
# The same imputer is applied to every column in `features`, text and numeric alike.
categorical_imputer = sklearn.impute.SimpleImputer(strategy='most_frequent')
categorical_imputer

In [9]:
# Learn the fill value for each column from this frame.
# NOTE(review): the imputer is fitted on the test features themselves, so the fill
# values may differ from the ones used when the loaded models were trained. Confirm
# whether the training-time imputer should be persisted and reused here instead.
categorical_imputer.fit(data_test_categorical_features)

In [10]:
# Fill the gaps, then wrap the transformed values back into a labelled frame
# that keeps the original row index and the imputer's column names.
imputed_values = categorical_imputer.transform(data_test_categorical_features)
data_test_imputed = pandas.DataFrame(
    imputed_values,
    index=data_test_categorical_features.index,
    columns=categorical_imputer.get_feature_names_out(),
)

In [11]:
# Verify that imputation left no missing values in any column.
data_test_imputed.isna().sum()

workclass         0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
native-country    0
age-group         0
education-num     0
capitalgain       0
capitalloss       0
hoursperweek      0
dtype: int64

In [12]:
# Columns whose values have a natural order and are therefore integer-encoded.
ordinal_features = [
    'education', 'age-group', 'education-num',
    'capitalgain', 'capitalloss', 'hoursperweek',
]

In [13]:
# Ordered category lists, one per ordinal column, given lowest to highest.
# The position of each list must match the column order used when encoding.
ordinal_mapping = [
    # education levels
    [
        'preschool', '1st-4th', '5th-6th', '7th-8th',
        '9th', '10th', '11th', '12th',
        'hs-grad', 'some-college', 'assoc-voc', 'assoc-acdm',
        'bachelors', 'masters', 'prof-school', 'doctorate',
    ],
    # five integer codes 0..4
    list(range(5)),
    # sixteen float codes 1.0..16.0
    [float(level) for level in range(1, 17)],
    # three more five-code columns
    list(range(5)),
    list(range(5)),
    list(range(5)),
]

In [14]:
# Encoder that maps each ordinal value to its position in the matching explicit category list.
ordinal_encoder = sklearn.preprocessing.OrdinalEncoder(categories=ordinal_mapping)

In [15]:
# Fit on the imputed ordinal columns. The category order comes from the explicit
# `categories` lists given to the encoder, not from the data seen here.
ordinal_encoder.fit(data_test_imputed[ordinal_features])

In [16]:
# Encode the ordinal columns and rebuild a frame that shares the imputed frame's index.
ordinal_codes = ordinal_encoder.transform(data_test_imputed[ordinal_features])
data_test_ordinal_features = pandas.DataFrame(
    ordinal_codes,
    index=data_test_imputed.index,
    columns=ordinal_encoder.get_feature_names_out(),
)
data_test_ordinal_features

Unnamed: 0_level_0,education,age-group,education-num,capitalgain,capitalloss,hoursperweek
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6111,6.0,0.0,6.0,0.0,0.0,2.0
11214,8.0,1.0,8.0,0.0,0.0,1.0
5554,8.0,4.0,8.0,0.0,0.0,3.0
25131,13.0,2.0,13.0,0.0,0.0,2.0
14324,8.0,3.0,8.0,0.0,0.0,2.0
...,...,...,...,...,...,...
25998,12.0,4.0,12.0,0.0,0.0,3.0
8375,8.0,4.0,8.0,0.0,0.0,4.0
13888,6.0,1.0,6.0,0.0,0.0,1.0
4159,13.0,1.0,13.0,0.0,0.0,2.0


In [17]:
# Unordered categorical columns; these are one-hot encoded.
nominal_features = [
    'workclass', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country',
]

In [18]:
# Explicit category lists for the one-hot encoder, one list per nominal column.
# The order inside each list fixes the order of the generated indicator columns.
nominal_mapping = [
    # workclass
    [
        'self-emp-not-inc', 'private', 'self-emp-inc', 'local-gov',
        'federal-gov', 'state-gov', 'never-worked', 'without-pay',
    ],
    # marital-status
    [
        'never-married', 'married-civ-spouse', 'separated', 'divorced',
        'married-af-spouse', 'married-spouse-absent', 'widowed',
    ],
    # occupation
    [
        'prof-specialty', 'sales', 'exec-managerial', 'craft-repair',
        'machine-op-inspct', 'other-service', 'handlers-cleaners',
        'tech-support', 'transport-moving', 'farming-fishing',
        'priv-house-serv', 'adm-clerical', 'protective-serv', 'armed-forces',
    ],
    # relationship
    ['not-in-family', 'husband', 'other-relative', 'wife', 'own-child', 'unmarried'],
    # race
    ['white', 'amer-indian-eskimo', 'black', 'asian-pac-islander', 'other'],
    # sex
    ['male', 'female'],
    # native-country
    [
        'united-states', 'mexico', 'italy', 'el-salvador', 'canada',
        'germany', 'jamaica', 'ecuador', 'japan', 'dominican-republic',
        'ireland', 'philippines', 'england', 'cuba', 'vietnam',
        'guatemala', 'taiwan', 'hungary', 'france', 'hong',
        'cambodia', 'iran', 'china', 'columbia', 'yugoslavia',
        'poland', 'greece', 'trinadad&tobago', 'india', 'nicaragua',
        'south', 'portugal', 'puerto-rico', 'outlying-us(guam-usvi-etc)', 'thailand',
        'laos', 'peru', 'holand-netherlands', 'haiti', 'scotland',
        'honduras',
    ],
]


In [19]:
# One-hot encoder with fixed category lists; dense output so the result can be
# placed directly into a DataFrame.
onehot_options = {
    'categories': nominal_mapping,
    'sparse_output': False,
}
onehot_encoder = sklearn.preprocessing.OneHotEncoder(**onehot_options)
onehot_encoder

In [20]:
# Fit on the imputed nominal columns. The indicator columns are determined by the
# explicit `categories` lists passed to the encoder, not by the values present here.
onehot_encoder.fit(data_test_imputed[nominal_features])

In [21]:
# Number of indicator columns the one-hot encoder will produce.
onehot_encoder.get_feature_names_out().shape

(83,)

In [22]:
# One-hot encode the nominal columns and rebuild a frame aligned to the imputed frame's index.
onehot_values = onehot_encoder.transform(data_test_imputed[nominal_features])
data_test_nominal_features = pandas.DataFrame(
    onehot_values,
    index=data_test_imputed.index,
    columns=onehot_encoder.get_feature_names_out(),
)
data_test_nominal_features

Unnamed: 0_level_0,workclass_self-emp-not-inc,workclass_private,workclass_self-emp-inc,workclass_local-gov,workclass_federal-gov,workclass_state-gov,workclass_never-worked,workclass_without-pay,marital-status_never-married,marital-status_married-civ-spouse,...,native-country_portugal,native-country_puerto-rico,native-country_outlying-us(guam-usvi-etc),native-country_thailand,native-country_laos,native-country_peru,native-country_holand-netherlands,native-country_haiti,native-country_scotland,native-country_honduras
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6111,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11214,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5554,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25131,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14324,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8375,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13888,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4159,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Combine both encoded blocks side by side on the shared "id" index:
# ordinal columns first, then the one-hot indicator columns.
data_test = data_test_ordinal_features.join(data_test_nominal_features)
data_test

Unnamed: 0_level_0,education,age-group,education-num,capitalgain,capitalloss,hoursperweek,workclass_self-emp-not-inc,workclass_private,workclass_self-emp-inc,workclass_local-gov,...,native-country_portugal,native-country_puerto-rico,native-country_outlying-us(guam-usvi-etc),native-country_thailand,native-country_laos,native-country_peru,native-country_holand-netherlands,native-country_haiti,native-country_scotland,native-country_honduras
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6111,6.0,0.0,6.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11214,8.0,1.0,8.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5554,8.0,4.0,8.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25131,13.0,2.0,13.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14324,8.0,3.0,8.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25998,12.0,4.0,12.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8375,8.0,4.0,8.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13888,6.0,1.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4159,13.0,1.0,13.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Disabled: this cell built a backward SequentialFeatureSelector around `model`,
# but no `model` is defined anywhere in this notebook, so the cell raised
# NameError on Restart & Run All. It was never executed (no execution count) and
# appears to be left over from a training notebook. Feature selection needs an
# estimator and labelled training data, neither of which this inference notebook
# has, so nothing is run here.

In [None]:
# Disabled: this cell fitted a feature selector on `data_train`, which is not
# defined in this notebook. It also called `data_train.drop()` with no arguments
# and used a 'death_event' target that is not a column of this dataset.
# The cell could not run on a fresh kernel and was never executed, so nothing is
# run here.

In [None]:
# Disabled: this cell read the output of a feature selector that is never
# successfully created or fitted in this notebook, so it failed on a fresh
# kernel. The prediction cells below use the full encoded feature set instead.

In [None]:
# Disabled: this cell sliced `data_train` by `selected_features` plus a
# 'death_event' column. No training frame is loaded in this notebook, so the
# cell raised NameError on Restart & Run All and was never executed.

### Loading models and predicting

In [24]:
# Give each model its own independent copy of the encoded test frame, so that
# adding columns to one copy does not affect the other or `data_test` itself.
data_test_ann_drop, data_test_ann_imputed = data_test.copy(), data_test.copy()

In [25]:
# Load the persisted ANN model (presumably the one trained on imputed data, going by the file name).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code,
# so only load model artifacts from a trusted source.
ann_imputed = joblib.load('model/model_ANN_imputed.joblib')
ann_imputed

In [26]:
# Load the second persisted ANN model (presumably trained with missing rows dropped, going by the file name).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code,
# so only load model artifacts from a trusted source.
# NOTE(review): this model is not used for prediction in the cells visible here — confirm whether it is still needed.
ann_drop_na = joblib.load('model/model_ANN_drop_na.joblib')
ann_drop_na

In [27]:
# Predict with the loaded model and store the result next to the features.
# The 'prediction' column is excluded from the model input. On a first run it
# does not exist yet (errors='ignore' makes the drop a no-op). On a re-run it
# would otherwise be passed to predict() as an extra feature, which the model
# was not trained on.
ann_imputed_inputs = data_test_ann_imputed.drop(columns=['prediction'], errors='ignore')
data_test_ann_imputed['prediction'] = ann_imputed.predict(ann_imputed_inputs)
data_test_ann_imputed

Unnamed: 0_level_0,education,age-group,education-num,capitalgain,capitalloss,hoursperweek,workclass_self-emp-not-inc,workclass_private,workclass_self-emp-inc,workclass_local-gov,...,native-country_puerto-rico,native-country_outlying-us(guam-usvi-etc),native-country_thailand,native-country_laos,native-country_peru,native-country_holand-netherlands,native-country_haiti,native-country_scotland,native-country_honduras,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6111,6.0,0.0,6.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
11214,8.0,1.0,8.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5554,8.0,4.0,8.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
25131,13.0,2.0,13.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
14324,8.0,3.0,8.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25998,12.0,4.0,12.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8375,8.0,4.0,8.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
13888,6.0,1.0,6.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4159,13.0,1.0,13.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [28]:
# Extract the predictions as a one-column frame indexed by "id".
prediction = data_test_ann_imputed['prediction'].to_frame()
prediction

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
6111,0
11214,0
5554,1
25131,1
14324,1
...,...
25998,1
8375,0
13888,0
4159,1


In [29]:
# Display the prediction frame once more before it is written to disk.
prediction

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
6111,0
11214,0
5554,1
25131,1
14324,1
...,...
25998,1
8375,0
13888,0
4159,1


In [30]:
# Persist the predictions (the "id" index plus the 'prediction' column) as CSV.
prediction.to_csv(path_or_buf='./results/1_predictions.test.csv')