In [1]:
from typing import Optional

import numpy as np
import pandas as pd

In [2]:
# Read the bank marketing CSV from the local dataset folder and preview the frame
dataset = pd.read_csv(filepath_or_buffer="./dataset/bank-additional-full.csv")
display(dataset)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


### Removing unknown fields for
* marital column
* education column
* default column
* housing column
* loan column

<p>
    Several columns contain "unknown" values: job, marital, education, default, housing and loan. Not every client will provide all of this information, for example their job.
    So, the unknown values in the job column are not removed.
    However, unknown values in default, housing and loan might have a large effect on the output; therefore the rows with unknown values in those columns are removed.
</p>

In [3]:
# Generalized function to remove the rows containing a given value ("unknown" by default) in a specific column
def remove_row(column: str, value: str = "unknown", frame: Optional[pd.DataFrame] = None) -> None:
    """Drop, in place, every row whose ``column`` entry equals ``value``.

    Args:
        column: Name of the column to inspect.
        value: Cell content that marks a row for removal; defaults to "unknown".
        frame: DataFrame to modify. When omitted, the notebook-level
            ``dataset`` is used, so existing two-argument calls keep working.

    Raises:
        ValueError: If ``value`` does not occur in ``column``. This also
            happens when the same removal is run a second time, because the
            matching rows are already gone.
    """
    target = dataset if frame is None else frame

    # Build the match mask once and reuse it for both the check and the drop.
    matches = target[column] == value
    if not matches.any():
        raise ValueError(f"No such value {value} in {column} column")

    # Drop by index label and in place so the caller's frame itself shrinks.
    target.drop(index=target[matches].index, inplace=True)

In [9]:
def check_unique_values(column: str) -> None:
    """Display the distinct values of ``column`` in ``dataset``, then how often each one occurs."""
    values = dataset[column]
    display(values.unique())
    display(values.value_counts())

In [10]:
# Drop the rows whose marital status is "unknown" (only 80 such rows exist),
# then print the frame summary and the categories that remain.
remove_row("marital", "unknown")
dataset.info()
check_unique_values(column="marital")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41108 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41108 non-null  int64  
 1   job             41108 non-null  object 
 2   marital         41108 non-null  object 
 3   education       41108 non-null  object 
 4   default         41108 non-null  object 
 5   housing         41108 non-null  object 
 6   loan            41108 non-null  object 
 7   contact         41108 non-null  object 
 8   month           41108 non-null  object 
 9   day_of_week     41108 non-null  object 
 10  duration        41108 non-null  int64  
 11  campaign        41108 non-null  int64  
 12  pdays           41108 non-null  int64  
 13  previous        41108 non-null  int64  
 14  poutcome        41108 non-null  object 
 15  emp.var.rate    41108 non-null  float64
 16  cons.price.idx  41108 non-null  float64
 17  cons.conf.idx   41108 non-null 

array(['married', 'single', 'divorced'], dtype=object)

married     24928
single      11568
divorced     4612
Name: marital, dtype: int64

#### 80 rows are removed, so total rows are 41108.

In [11]:
# Drop the rows whose education level is "unknown" (1731 such values in the column),
# then print the frame summary and the categories that remain.
remove_row("education", "unknown")
dataset.info()
check_unique_values(column="education")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39386 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             39386 non-null  int64  
 1   job             39386 non-null  object 
 2   marital         39386 non-null  object 
 3   education       39386 non-null  object 
 4   default         39386 non-null  object 
 5   housing         39386 non-null  object 
 6   loan            39386 non-null  object 
 7   contact         39386 non-null  object 
 8   month           39386 non-null  object 
 9   day_of_week     39386 non-null  object 
 10  duration        39386 non-null  int64  
 11  campaign        39386 non-null  int64  
 12  pdays           39386 non-null  int64  
 13  previous        39386 non-null  int64  
 14  poutcome        39386 non-null  object 
 15  emp.var.rate    39386 non-null  float64
 16  cons.price.idx  39386 non-null  float64
 17  cons.conf.idx   39386 non-null 

array(['basic.4y', 'high.school', 'basic.6y', 'basic.9y',
       'professional.course', 'university.degree', 'illiterate'],
      dtype=object)

university.degree      12137
high.school             9501
basic.9y                6037
professional.course     5237
basic.4y                4170
basic.6y                2286
illiterate                18
Name: education, dtype: int64

There were 1731 unknown fields in total in the education column, but removing the unknown fields from the marital column
must already have removed some of them, namely the rows whose marital field was unknown as well (only 41108 - 39386 = 1722 rows were removed here).
So, after removing the unknown fields from education, there are 39386 rows altogether.

In [12]:
# Drop the rows whose default status is "unknown" (the helper's default value),
# then print the frame summary and the categories that remain.
remove_row("default")
dataset.info()
check_unique_values("default")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31344 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             31344 non-null  int64  
 1   job             31344 non-null  object 
 2   marital         31344 non-null  object 
 3   education       31344 non-null  object 
 4   default         31344 non-null  object 
 5   housing         31344 non-null  object 
 6   loan            31344 non-null  object 
 7   contact         31344 non-null  object 
 8   month           31344 non-null  object 
 9   day_of_week     31344 non-null  object 
 10  duration        31344 non-null  int64  
 11  campaign        31344 non-null  int64  
 12  pdays           31344 non-null  int64  
 13  previous        31344 non-null  int64  
 14  poutcome        31344 non-null  object 
 15  emp.var.rate    31344 non-null  float64
 16  cons.price.idx  31344 non-null  float64
 17  cons.conf.idx   31344 non-null 

array(['no', 'yes'], dtype=object)

no     31341
yes        3
Name: default, dtype: int64

The unknown fields in the default column might hamper the output a lot. So, it is better to remove the rows with unknown values rather than replace them with false data, even if this removes a large part of the data.

In [13]:
# Drop the rows whose housing status is "unknown" (the helper's default value),
# then print the frame summary and the categories that remain.
remove_row("housing")
dataset.info()
check_unique_values("housing")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30604 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             30604 non-null  int64  
 1   job             30604 non-null  object 
 2   marital         30604 non-null  object 
 3   education       30604 non-null  object 
 4   default         30604 non-null  object 
 5   housing         30604 non-null  object 
 6   loan            30604 non-null  object 
 7   contact         30604 non-null  object 
 8   month           30604 non-null  object 
 9   day_of_week     30604 non-null  object 
 10  duration        30604 non-null  int64  
 11  campaign        30604 non-null  int64  
 12  pdays           30604 non-null  int64  
 13  previous        30604 non-null  int64  
 14  poutcome        30604 non-null  object 
 15  emp.var.rate    30604 non-null  float64
 16  cons.price.idx  30604 non-null  float64
 17  cons.conf.idx   30604 non-null 

array(['no', 'yes'], dtype=object)

yes    16577
no     14027
Name: housing, dtype: int64

<p><span style="color: red;">Note:</span> Removing the unknown fields from the housing column also removed all the unknown fields from the loan column.</p>

In [16]:
# Removing unknown fields from loan column.
# remove_row raises when the value is absent, and on the current data the
# housing clean-up already removed every row with an unknown loan value.
# Guard the call instead of commenting it out, so the cell is a no-op here
# but still cleans the column if the upstream data ever changes.
if "unknown" in dataset["loan"].unique():
    remove_row(column="loan")
dataset.info()
check_unique_values(column="loan")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30604 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             30604 non-null  int64  
 1   job             30604 non-null  object 
 2   marital         30604 non-null  object 
 3   education       30604 non-null  object 
 4   default         30604 non-null  object 
 5   housing         30604 non-null  object 
 6   loan            30604 non-null  object 
 7   contact         30604 non-null  object 
 8   month           30604 non-null  object 
 9   day_of_week     30604 non-null  object 
 10  duration        30604 non-null  int64  
 11  campaign        30604 non-null  int64  
 12  pdays           30604 non-null  int64  
 13  previous        30604 non-null  int64  
 14  poutcome        30604 non-null  object 
 15  emp.var.rate    30604 non-null  float64
 16  cons.price.idx  30604 non-null  float64
 17  cons.conf.idx   30604 non-null 

array(['no', 'yes'], dtype=object)

no     25810
yes     4794
Name: loan, dtype: int64