In [1]:
import pandas
import seaborn
import sklearn.preprocessing
import sklearn.impute
import sklearn.neural_network
import sklearn.feature_selection
from sklearn.feature_selection import SequentialFeatureSelector
%matplotlib inline

In [2]:
import warnings

# Suppress all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# while fitting the estimators below will not be shown either — confirm intended.
warnings.filterwarnings("ignore")

# Instructions

- Read both training and test data from CSV files
- Then, identify which columns are numerical and which columns are categorical
- Assess whether the data is balanced or imbalanced

In [3]:
# Load the training split; the `id` column becomes the row index.
data_train = (
    pandas.read_csv("./data/train_data.csv")
    .set_index("id")
)
data_train.head(n=2)

Unnamed: 0_level_0,social-security-number,house-number,age-group,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country-code,native-country,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2103,522215230.0,9013.0,2.0,self-emp-not-inc,335549.0,prof-school,15.0,never-married,prof-specialty,not-in-family,white,male,0.0,4.0,2.0,USA,united-states,1.0
14649,318423385.0,9914.0,0.0,private,98287.0,hs-grad,9.0,married-civ-spouse,sales,husband,white,male,0.0,0.0,3.0,USA,united-states,0.0


In [4]:
# Remove the columns that are not part of the model inputs listed in `features`.
data_train = data_train.drop(
    columns=[
        'social-security-number',
        'house-number',
        'fnlwgt',
        'native-country-code',
    ]
)
data_train.head(n=2)

Unnamed: 0_level_0,age-group,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2103,2.0,self-emp-not-inc,prof-school,15.0,never-married,prof-specialty,not-in-family,white,male,0.0,4.0,2.0,united-states,1.0
14649,0.0,private,hs-grad,9.0,married-civ-spouse,sales,husband,white,male,0.0,0.0,3.0,united-states,0.0


In [5]:
# Load the test split; the `id` column becomes the row index.
data_test = (
    pandas.read_csv("./data/test_data.csv")
    .set_index("id")
)
data_test.head(n=2)

Unnamed: 0_level_0,social-security-number,house-number,age-group,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country-code,native-country,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
12849,150854577.0,249.0,1.0,private,509500.0,bachelors,13.0,never-married,tech-support,not-in-family,white,female,2.0,0.0,2.0,USA,united-states,1.0
1460,437726609.0,1112.0,0.0,private,243368.0,preschool,1.0,never-married,farming-fishing,not-in-family,white,male,0.0,0.0,3.0,MEX,mexico,0.0


In [6]:
# Drop the same four columns from the test split as were dropped from the training split.
data_test = data_test.drop(
    columns=[
        'social-security-number',
        'house-number',
        'fnlwgt',
        'native-country-code',
    ]
)
data_test.head(n=2)

Unnamed: 0_level_0,age-group,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
12849,1.0,private,bachelors,13.0,never-married,tech-support,not-in-family,white,female,2.0,0.0,2.0,united-states,1.0
1460,0.0,private,preschool,1.0,never-married,farming-fishing,not-in-family,white,male,0.0,0.0,3.0,mexico,0.0


In [7]:
# Per-column dtypes: `object` columns hold strings, `float64` columns hold numbers.
data_train.dtypes

age-group         float64
workclass          object
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capitalgain       float64
capitalloss       float64
hoursperweek      float64
native-country     object
label             float64
dtype: object

In [8]:
# Every model input column (the target `label` is deliberately excluded).
features = [
    # string-valued columns (dtype `object` in the raw frame)
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
    # numeric columns (dtype `float64` in the raw frame)
    'age-group', 'education-num', 'capitalgain', 'capitalloss', 'hoursperweek',
]

features

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'age-group',
 'education-num',
 'capitalgain',
 'capitalloss',
 'hoursperweek']

In [9]:
# Model input columns of the training split.
# NOTE(review): despite the variable name, `features` also lists the numeric columns.
data_train_categorical_features = data_train.loc[:, features]

In [10]:
# Model input columns of the test split.
# NOTE(review): despite the variable name, `features` also lists the numeric columns.
data_test_categorical_features = data_test.loc[:, features]

In [11]:
# Class balance check: number of training rows per target value.
data_train['label'].value_counts()

label
0.0    9102
1.0    6573
Name: count, dtype: int64

In [12]:
# Class balance check: number of test rows per target value.
data_test['label'].value_counts()

label
0.0    3033
1.0    2192
Name: count, dtype: int64

# Instructions
- Investigate whether the dataset contains any missing data

In [13]:
# Number of missing values in each column of the training split.
data_train.isna().sum()

age-group           6
workclass         792
education          15
education-num      12
marital-status     20
occupation        791
relationship       14
race               13
sex                14
capitalgain        17
capitalloss        14
hoursperweek       18
native-country    302
label               0
dtype: int64

In [14]:
# Number of missing values in each column of the test split.
data_test.isna().sum()

age-group           7
workclass         265
education           5
education-num       2
marital-status      3
occupation        263
relationship        7
race                9
sex                 8
capitalgain         6
capitalloss         7
hoursperweek        3
native-country     96
label               0
dtype: int64

# Instructions
- Use `.dropna()` to discard all missing data from datasets
- Investigate whether there are still any missing data in datasets after using the `.dropna()`

In [15]:
# Imputer that fills each missing entry with the most frequent value of its column.
# NOTE(review): the instructions above ask for `.dropna()`; this notebook imputes instead.
categorical_imputer = sklearn.impute.SimpleImputer(
    strategy='most_frequent',
)
categorical_imputer

In [16]:
# Fit the imputer on the training split only; the same fitted imputer is
# applied to the test split further down.
categorical_imputer.fit(data_train_categorical_features)

In [17]:
# Apply the fitted imputer to the training split and wrap the result back into a
# DataFrame that keeps the original row index and column names.
data_train_imputed = pandas.DataFrame(
    categorical_imputer.transform(data_train_categorical_features),
    index=data_train_categorical_features.index,
    columns=categorical_imputer.get_feature_names_out(),
)

In [18]:
# Verify that no missing values remain in the imputed training split.
data_train_imputed.isna().sum()

workclass         0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
native-country    0
age-group         0
education-num     0
capitalgain       0
capitalloss       0
hoursperweek      0
dtype: int64

In [19]:
# (rows, columns) of the imputed training split.
data_train_imputed.shape

(15675, 13)

In [20]:
# Impute the test split with the imputer that was fitted on the training split,
# keeping the test row index and the imputer's column names.
data_test_imputed = pandas.DataFrame(
    categorical_imputer.transform(data_test_categorical_features),
    index=data_test_categorical_features.index,
    columns=categorical_imputer.get_feature_names_out(),
)

In [21]:
# Verify that no missing values remain in the imputed test split.
data_test_imputed.isna().sum()

workclass         0
education         0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
native-country    0
age-group         0
education-num     0
capitalgain       0
capitalloss       0
hoursperweek      0
dtype: int64

In [22]:
# (rows, columns) of the imputed test split.
data_test_imputed.shape

(5225, 13)

# Ordinal features

In [23]:
# Columns whose values have a natural order and are encoded as integer ranks.
ordinal_features = [
    'education',
    'age-group', 'education-num',
    'capitalgain', 'capitalloss', 'hoursperweek',
]

# Instructions
- Use `.join` to reconstruct the processed training and test data

In [24]:
# One category list per entry of `ordinal_features`, in the same order.
# The position of a value inside its list is the code the encoder assigns to it.
ordinal_mapping = [
    # education
    [
        'preschool', '1st-4th', '5th-6th', '7th-8th',
        '9th', '10th', '11th', '12th',
        'hs-grad', 'some-college', 'assoc-voc', 'assoc-acdm',
        'bachelors', 'masters', 'prof-school', 'doctorate',
    ],
    # age-group: 0..4
    list(range(5)),
    # education-num: 1.0..16.0
    [float(level) for level in range(1, 17)],
    # capitalgain: 0..4
    list(range(5)),
    # capitalloss: 0..4
    list(range(5)),
    # hoursperweek: 0..4
    list(range(5)),
]

In [25]:
# Ordinal encoder driven by the explicit category orders in `ordinal_mapping`.
ordinal_encoder = sklearn.preprocessing.OrdinalEncoder(categories=ordinal_mapping)

In [26]:
# Fit the ordinal encoder on the ordinal columns of the imputed training split.
ordinal_encoder.fit(data_train_imputed[ordinal_features])

In [27]:
# Ordinal codes for the training split, re-attached to the training row index.
data_train_ordinal_features = pandas.DataFrame(
    ordinal_encoder.transform(data_train_imputed[ordinal_features]),
    index=data_train_imputed.index,
    columns=ordinal_encoder.get_feature_names_out(),
)
data_train_ordinal_features

Unnamed: 0_level_0,education,age-group,education-num,capitalgain,capitalloss,hoursperweek
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2103,14.0,2.0,14.0,0.0,4.0,2.0
14649,8.0,0.0,8.0,0.0,0.0,3.0
7379,8.0,2.0,8.0,0.0,0.0,4.0
24479,9.0,2.0,9.0,0.0,0.0,3.0
19532,1.0,3.0,1.0,0.0,0.0,2.0
...,...,...,...,...,...,...
8695,9.0,0.0,9.0,0.0,0.0,2.0
2192,9.0,2.0,9.0,2.0,0.0,2.0
8250,8.0,0.0,8.0,0.0,0.0,2.0
18511,13.0,2.0,13.0,0.0,0.0,0.0


In [28]:
# Ordinal codes for the test split, produced by the encoder fitted on the training split.
data_test_ordinal_features = pandas.DataFrame(
    ordinal_encoder.transform(data_test_imputed[ordinal_features]),
    index=data_test_imputed.index,
    columns=ordinal_encoder.get_feature_names_out(),
)
data_test_ordinal_features

Unnamed: 0_level_0,education,age-group,education-num,capitalgain,capitalloss,hoursperweek
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12849,12.0,1.0,12.0,2.0,0.0,2.0
1460,0.0,0.0,0.0,0.0,0.0,3.0
13594,8.0,2.0,8.0,0.0,0.0,2.0
14400,8.0,3.0,8.0,0.0,0.0,2.0
14333,8.0,2.0,8.0,0.0,0.0,3.0
...,...,...,...,...,...,...
981,11.0,1.0,11.0,0.0,0.0,3.0
2952,11.0,2.0,11.0,0.0,0.0,2.0
9470,15.0,4.0,15.0,0.0,0.0,2.0
16559,8.0,4.0,8.0,0.0,4.0,2.0


## For nominal features
- Define a subset of nominal features
- Define a list of nominal maps which sort the feature values in appropriate orders
- Use `sklearn.preprocessing.OneHotEncoder` to create a nominal encoder, then fit it using the training data
- Create a new `pandas.DataFrame` of only nominally encoded features for both training and test data

In [29]:
# Columns without a natural order; these are one-hot encoded.
nominal_features = [
    'workclass', 'marital-status', 'occupation',
    'relationship', 'race', 'sex',
    'native-country',
]
# Print the distinct values of every nominal column of the imputed training split.
for column_name in nominal_features:
    print(data_train_imputed[column_name].unique())

['self-emp-not-inc' 'private' 'self-emp-inc' 'local-gov' 'federal-gov'
 'state-gov' 'never-worked' 'without-pay']
['never-married' 'married-civ-spouse' 'separated' 'divorced'
 'married-af-spouse' 'widowed' 'married-spouse-absent']
['prof-specialty' 'sales' 'exec-managerial' 'craft-repair'
 'machine-op-inspct' 'other-service' 'handlers-cleaners' 'tech-support'
 'transport-moving' 'farming-fishing' 'priv-house-serv' 'adm-clerical'
 'protective-serv' 'armed-forces']
['not-in-family' 'husband' 'other-relative' 'wife' 'own-child' 'unmarried']
['white' 'amer-indian-eskimo' 'black' 'asian-pac-islander' 'other']
['male' 'female']
['united-states' 'mexico' 'england' 'italy' 'el-salvador' 'canada'
 'germany' 'jamaica' 'ecuador' 'japan' 'dominican-republic' 'ireland'
 'philippines' 'cuba' 'vietnam' 'guatemala' 'taiwan' 'hungary' 'france'
 'outlying-us(guam-usvi-etc)' 'hong' 'cambodia' 'iran' 'china' 'columbia'
 'puerto-rico' 'yugoslavia' 'poland' 'greece' 'trinadad&tobago' 'india'
 'nicaragua' 's

In [30]:
# One category list per entry of `nominal_features`, in the same order.
# The order inside each list fixes the order of the generated one-hot columns.
nominal_mapping = [
    # workclass
    [
        'self-emp-not-inc', 'private', 'self-emp-inc', 'local-gov',
        'federal-gov', 'state-gov', 'never-worked', 'without-pay',
    ],
    # marital-status
    [
        'never-married', 'married-civ-spouse', 'separated', 'divorced',
        'married-af-spouse', 'married-spouse-absent', 'widowed',
    ],
    # occupation
    [
        'prof-specialty', 'sales', 'exec-managerial', 'craft-repair',
        'machine-op-inspct', 'other-service', 'handlers-cleaners', 'tech-support',
        'transport-moving', 'farming-fishing', 'priv-house-serv', 'adm-clerical',
        'protective-serv', 'armed-forces',
    ],
    # relationship
    ['not-in-family', 'husband', 'other-relative', 'wife', 'own-child', 'unmarried'],
    # race
    ['white', 'amer-indian-eskimo', 'black', 'asian-pac-islander', 'other'],
    # sex
    ['male', 'female'],
    # native-country
    [
        'united-states', 'mexico', 'italy', 'el-salvador', 'canada',
        'germany', 'jamaica', 'ecuador', 'japan', 'dominican-republic',
        'ireland', 'philippines', 'england', 'cuba', 'vietnam',
        'guatemala', 'taiwan', 'hungary', 'france', 'hong',
        'cambodia', 'iran', 'china', 'columbia', 'yugoslavia',
        'poland', 'greece', 'trinadad&tobago', 'india', 'nicaragua',
        'south', 'portugal', 'puerto-rico', 'outlying-us(guam-usvi-etc)', 'thailand',
        'laos', 'peru', 'holand-netherlands', 'haiti', 'scotland',
        'honduras',
    ],
]


In [31]:
# One-hot encoder with fixed category lists; returns a dense array rather than a sparse matrix.
# NOTE(review): `handle_unknown` is left at its default, so a value that is not
# listed in `nominal_mapping` is expected to raise — confirm this is intended.
onehot_encoder = sklearn.preprocessing.OneHotEncoder(
    sparse_output=False,
    categories=nominal_mapping,
)
onehot_encoder

In [32]:
# Fit the one-hot encoder on the nominal columns of the imputed training split.
onehot_encoder.fit(data_train_imputed[nominal_features])

In [33]:
# Generated one-hot column names, in `<feature>_<category>` form.
onehot_encoder.get_feature_names_out()

array(['workclass_self-emp-not-inc', 'workclass_private',
       'workclass_self-emp-inc', 'workclass_local-gov',
       'workclass_federal-gov', 'workclass_state-gov',
       'workclass_never-worked', 'workclass_without-pay',
       'marital-status_never-married',
       'marital-status_married-civ-spouse', 'marital-status_separated',
       'marital-status_divorced', 'marital-status_married-af-spouse',
       'marital-status_married-spouse-absent', 'marital-status_widowed',
       'occupation_prof-specialty', 'occupation_sales',
       'occupation_exec-managerial', 'occupation_craft-repair',
       'occupation_machine-op-inspct', 'occupation_other-service',
       'occupation_handlers-cleaners', 'occupation_tech-support',
       'occupation_transport-moving', 'occupation_farming-fishing',
       'occupation_priv-house-serv', 'occupation_adm-clerical',
       'occupation_protective-serv', 'occupation_armed-forces',
       'relationship_not-in-family', 'relationship_husband',
       'r

In [34]:
# Number of one-hot columns produced by the encoder.
onehot_encoder.get_feature_names_out().shape

(83,)

In [35]:
# One-hot columns for the training split, re-attached to the training row index.
data_train_nominal_features = pandas.DataFrame(
    onehot_encoder.transform(data_train_imputed[nominal_features]),
    index=data_train_imputed.index,
    columns=onehot_encoder.get_feature_names_out(),
)
data_train_nominal_features

Unnamed: 0_level_0,workclass_self-emp-not-inc,workclass_private,workclass_self-emp-inc,workclass_local-gov,workclass_federal-gov,workclass_state-gov,workclass_never-worked,workclass_without-pay,marital-status_never-married,marital-status_married-civ-spouse,...,native-country_portugal,native-country_puerto-rico,native-country_outlying-us(guam-usvi-etc),native-country_thailand,native-country_laos,native-country_peru,native-country_holand-netherlands,native-country_haiti,native-country_scotland,native-country_honduras
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2103,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14649,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7379,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24479,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19532,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8695,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2192,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8250,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18511,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# One-hot columns for the test split, produced by the encoder fitted on the training split.
data_test_nominal_features = pandas.DataFrame(
    onehot_encoder.transform(data_test_imputed[nominal_features]),
    index=data_test_imputed.index,
    columns=onehot_encoder.get_feature_names_out(),
)
data_test_nominal_features

Unnamed: 0_level_0,workclass_self-emp-not-inc,workclass_private,workclass_self-emp-inc,workclass_local-gov,workclass_federal-gov,workclass_state-gov,workclass_never-worked,workclass_without-pay,marital-status_never-married,marital-status_married-civ-spouse,...,native-country_portugal,native-country_puerto-rico,native-country_outlying-us(guam-usvi-etc),native-country_thailand,native-country_laos,native-country_peru,native-country_holand-netherlands,native-country_haiti,native-country_scotland,native-country_honduras
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1460,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13594,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14400,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2952,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9470,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16559,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Rebuild the training frame: ordinal codes, then one-hot columns, then the target,
# all aligned on the `id` index. Note that this rebinds `data_train`.
data_train = (
    data_train_ordinal_features
    .join(other=data_train_nominal_features, how='left')
    .join(other=data_train['label'], how='left')
)
data_train

Unnamed: 0_level_0,education,age-group,education-num,capitalgain,capitalloss,hoursperweek,workclass_self-emp-not-inc,workclass_private,workclass_self-emp-inc,workclass_local-gov,...,native-country_puerto-rico,native-country_outlying-us(guam-usvi-etc),native-country_thailand,native-country_laos,native-country_peru,native-country_holand-netherlands,native-country_haiti,native-country_scotland,native-country_honduras,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2103,14.0,2.0,14.0,0.0,4.0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
14649,8.0,0.0,8.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7379,8.0,2.0,8.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24479,9.0,2.0,9.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19532,1.0,3.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8695,9.0,0.0,9.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2192,9.0,2.0,9.0,2.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8250,8.0,0.0,8.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18511,13.0,2.0,13.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [38]:
# Perceptron with two hidden layers (20 and 10 units) and logistic activations,
# trained by mini-batch SGD with a constant learning rate, no momentum and
# alpha set to zero. `random_state` is fixed for reproducible fits.
model = sklearn.neural_network.MLPClassifier(
    # architecture
    hidden_layer_sizes=(20, 10),
    activation='logistic',
    # optimiser
    solver='sgd',
    learning_rate='constant',
    learning_rate_init=0.1,
    momentum=0.0,
    nesterovs_momentum=False,
    batch_size=32,
    max_iter=1000,
    shuffle=True,
    # regularisation / validation
    alpha=0.0,
    validation_fraction=0.0,
    # reproducibility
    random_state=0,
)
model

In [39]:
# Forward sequential selection of 5 columns, scored with the MLP defined above.
# An earlier variant (direction='backward', n_features_to_select="auto") was
# tried and is no longer used.
feature_selector = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select=5,
    direction='forward',
)
feature_selector

In [None]:
# Run the selection on every encoded column except the target.
# `X` is uppercase because it is a matrix; `y` is the target vector.
feature_selector.fit(
    X=data_train.drop(columns=['label']),
    y=data_train['label'],
)

In [None]:
# Names of the columns kept by the selector, as a plain Python list.
# `transform` would return the selected *values* as an array, which can neither be
# concatenated with ['label'] nor used as column labels when the feature-selected
# frames are built from `data_train` / `data_test` below.
selected_features = feature_selector.get_feature_names_out().tolist()
selected_features

In [None]:
# Rebuild the test frame the same way as the training frame: ordinal codes,
# one-hot columns, then the target, aligned on the `id` index.
# Note that this rebinds `data_test`.
data_test = (
    data_test_ordinal_features
    .join(other=data_test_nominal_features, how='left')
    .join(other=data_test['label'], how='left')
)
data_test

In [None]:
# Training frame restricted to the selected columns plus the target.
# Requires `selected_features` to be a list of column names (list + list concatenation).
data_train_feature_selected = data_train[selected_features + ['label']]

In [None]:
# Test frame restricted to the selected columns plus the target.
# Requires `selected_features` to be a list of column names (list + list concatenation).
data_test_feature_selected = data_test[selected_features + ['label']]

# Instructions
- Write the train data set to `./data/features.train.csv`
- Write the test data set to `./data/features.test.csv`

In [None]:
# Persist the full encoded training frame (all encoded columns plus `label`).
# NOTE(review): `data_train_feature_selected` is built above but never written — confirm which frame should be saved.
data_train.to_csv('./data/imputated_features.train.csv')

In [None]:
# Persist the full encoded test frame (all encoded columns plus `label`).
# NOTE(review): `data_test_feature_selected` is built above but never written — confirm which frame should be saved.
data_test.to_csv('./data/imputated_features.test.csv')