In [79]:
## Library ##
import pandas as pd
import numpy as np
import openml as op
import seaborn as sns

In [80]:
## Fetch data
dataset = op.datasets.functions.get_dataset(31)
data = dataset.get_data()

In [81]:
# Dataframe
x = data[0].drop(columns='class')
y = data[0]['class']
df = data[0]

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         1000 non-null   category
 1   duration                1000 non-null   uint8   
 2   credit_history          1000 non-null   category
 3   purpose                 1000 non-null   category
 4   credit_amount           1000 non-null   float64 
 5   savings_status          1000 non-null   category
 6   employment              1000 non-null   category
 7   installment_commitment  1000 non-null   uint8   
 8   personal_status         1000 non-null   category
 9   other_parties           1000 non-null   category
 10  residence_since         1000 non-null   uint8   
 11  property_magnitude      1000 non-null   category
 12  age                     1000 non-null   uint8   
 13  other_payment_plans     1000 non-null   category
 14  housing                 1

In [83]:
## Cleaning data

categorical_columns = ['personal_status', ]

sex_status = x['personal_status'].str.extract(r'(?P<sex>[\w]+)\s(?P<status>[\w/]+)')
# Dropping "male" column, asumming that a person can't be both sex.
# female: 1 -- male : 0
x_new = x.copy(deep=False)
x_new['female'] = pd.get_dummies(sex_status['sex']).drop(columns='male')
x_new = pd.concat([x, pd.get_dummies(sex_status['status'])], axis=1)


# removing initial columns
x_new = x_new.drop(columns=categorical_columns)

x_new = pd.get_dummies(x_new)
x_new.head()

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,div/dep/mar,div/sep,mar/wid,...,housing_own,housing_for free,job_unemp/unskilled non res,job_unskilled resident,job_skilled,job_high qualif/self emp/mgmt,own_telephone_none,own_telephone_yes,foreign_worker_yes,foreign_worker_no
0,6,1169.0,4,4,67,2,1,0,0,0,...,1,0,0,0,1,0,0,1,1,0
1,48,5951.0,2,2,22,1,1,1,0,0,...,1,0,0,0,1,0,1,0,1,0
2,12,2096.0,2,3,49,1,2,0,0,0,...,1,0,0,1,0,0,1,0,1,0
3,42,7882.0,2,4,45,1,2,0,0,0,...,0,1,0,0,1,0,1,0,1,0
4,24,4870.0,3,4,53,2,2,0,0,0,...,0,1,0,0,1,0,1,0,1,0


In [84]:
y_new = y.cat.codes
y_new.name = 'class'
Y_DICT = {cat: code for cat, code in zip(y.cat.categories, range(len(y.cat.categories)))}

In [85]:
y_new

0      0
1      1
2      0
3      0
4      1
      ..
995    0
996    0
997    0
998    1
999    0
Name: class, Length: 1000, dtype: int8

In [86]:
from sklearn.feature_selection import SelectFpr, chi2

In [87]:
# Feature selection
alpha = 0.90
f_selector = SelectFpr(alpha=alpha, score_func=chi2)
f_selector.fit(x_new, y_new)
x_new_values = f_selector.transform(x_new)

In [88]:
x_new = pd.DataFrame(x_new_values, columns=f_selector.get_feature_names_out())
x_new.head()

Unnamed: 0,duration,credit_amount,installment_commitment,age,existing_credits,div/dep/mar,div/sep,mar/wid,single,checking_status_<0,...,housing_own,housing_for free,job_unemp/unskilled non res,job_unskilled resident,job_skilled,job_high qualif/self emp/mgmt,own_telephone_none,own_telephone_yes,foreign_worker_yes,foreign_worker_no
0,6.0,1169.0,4.0,67.0,2.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,48.0,5951.0,2.0,22.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,12.0,2096.0,2.0,49.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,42.0,7882.0,2.0,45.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,24.0,4870.0,3.0,53.0,2.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [89]:
## Save data
x_new.to_pickle('./data/X.pickle')
y_new.to_pickle('./data/Y.pickle')


In [90]:
# x_new = pd.get_dummies(x)

In [91]:
(pd.concat([x_new, y_new], axis=1)).to_pickle('./data/cleaned.pickle')