In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib as mpl

In [272]:
# Read the ARFF file into a DataFrame.
file_path = 'phpkIxskf.arff'
data, meta = arff.loadarff(file_path)
df = pd.DataFrame(data)

# Object-dtype columns are decoded with .str.decode so they hold text
# (presumably loadarff returns nominal attributes as bytes; verify against the file).
object_columns = [name for name in df.columns if df[name].dtype == object]
for column in object_columns:
    decoded = df[column].str.decode('utf-8')
    df[column] = decoded

df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,Class
0,58.0,management,married,tertiary,no,2143.0,yes,no,unknown,5.0,may,261.0,1.0,-1.0,0.0,unknown,1
1,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5.0,may,151.0,1.0,-1.0,0.0,unknown,1
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5.0,may,76.0,1.0,-1.0,0.0,unknown,1
3,47.0,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5.0,may,92.0,1.0,-1.0,0.0,unknown,1
4,33.0,unknown,single,unknown,no,1.0,no,no,unknown,5.0,may,198.0,1.0,-1.0,0.0,unknown,1


In [273]:
# Rename the anonymous ARFF headers (V1..V16, Class) to descriptive names.
# `rename` returns a new frame, so no in-place mutation is needed.
df = df.rename(
    columns={
        'V1': 'age',
        'V2': 'job',
        'V3': 'marital',
        'V4': 'education',
        'V5': 'credit_default',
        'V6': 'balance',
        'V7': 'housing_loan',
        'V8': 'personal_loan',
        'V9': 'communication_type',
        'V10': 'last_contact_day',
        'V11': 'last_contact_month',
        'V12': 'last_contact_duration',
        'V13': 'number_of_contacts',
        'V14': 'pdays',
        'V15': 'previous_contacts',
        'V16': 'previous_outcome',
        'Class': 'term_deposit',
    }
)

# Combine last_contact_day and last_contact_month into a single
# "<day> <Month>" string such as "5 May".
df['last_contact_date'] = (
    df['last_contact_day'].astype(int).astype(str)
    + ' '
    + df['last_contact_month'].str.capitalize()
)

# Keep the final columns in display order, with last_contact_date placed
# before last_contact_duration. Selecting this list also drops
# last_contact_day and last_contact_month, so no separate drop is needed.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df[['age', 'job', 'marital', 'education', 'credit_default', 'balance',
         'housing_loan', 'personal_loan', 'communication_type', 'last_contact_date',
         'last_contact_duration', 'number_of_contacts', 'pdays',
         'previous_contacts', 'previous_outcome', 'term_deposit']].copy()

# Change the type of term_deposit from object to int.
df['term_deposit'] = df['term_deposit'].astype(int)

# Recode term_deposit so that 0 = not subscribed and 1 = subscribed.
# Order matters: the 1 -> 0 rewrite must run before 2 -> 1, otherwise the
# freshly written 1s would be turned into 0s as well.
df.loc[df['term_deposit'] == 1, 'term_deposit'] = 0
df.loc[df['term_deposit'] == 2, 'term_deposit'] = 1

# Convert every column whose distinct values are exactly {'yes', 'no'} to 1 / 0.
# Columns containing any other value (e.g. 'unknown') are left untouched.
def convert_yes_no_to_binary(frame=None):
    """Map pure yes/no columns to 1/0.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to convert. When omitted, the module-level ``df`` is used, which
        keeps the original zero-argument call working.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was converted; matching columns are
        reassigned on it, so the caller's frame is modified.
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook's global frame.
        frame = df
    for col in frame.columns:
        # Set equality requires both values to be present and nothing else,
        # so a column with only 'yes', or with 'yes'/'no'/'unknown', is skipped.
        if set(frame[col].unique()) == {'yes', 'no'}:
            frame[col] = frame[col].map({'yes': 1, 'no': 0})
    return frame
# Run the yes/no conversion on `df`. The function assigns the converted
# columns back onto the frame, so the returned value is not captured.
convert_yes_no_to_binary()

# Adjust dtypes: float64 columns -> int, object columns -> str
def dtype_adjusting(df):
    """Convert float64 columns to int and object columns to str.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to adjust. Columns are reassigned on this object, so the
        caller's frame is modified.

    Returns
    -------
    pandas.DataFrame
        The same frame, returned for convenient rebinding.

    Notes
    -----
    ``astype(int)`` truncates fractional values and raises if a float column
    contains NaN or infinity. ``astype(str)`` turns missing values into their
    string form, so handle missing data before calling this.
    """
    def convert_float_columns_to_int(df):
        """Cast every float64 column to int."""
        for col in df.columns:
            if df[col].dtype == 'float64':
                df[col] = df[col].astype(int)
        return df

    def convert_object_columns_to_str(df):
        """Cast every object-dtype column to str."""
        for col in df.columns:
            # This previously tested for 'float64' (copied from the helper
            # above), so object columns were never touched.
            if df[col].dtype == object:
                df[col] = df[col].astype(str)
        return df

    df = convert_float_columns_to_int(df)
    df = convert_object_columns_to_str(df)

    return df

# Rebind `df` to the frame returned by dtype_adjusting.
df = dtype_adjusting(df)

In [274]:
# Preview the first rows of the frame after renaming and dtype adjustment.
df.head()

Unnamed: 0,age,job,marital,education,credit_default,balance,housing_loan,personal_loan,communication_type,last_contact_date,last_contact_duration,number_of_contacts,pdays,previous_contacts,previous_outcome,term_deposit
0,58,management,married,tertiary,0,2143,1,0,unknown,5 May,261,1,-1,0,unknown,0
1,44,technician,single,secondary,0,29,1,0,unknown,5 May,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5 May,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5 May,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,0,1,0,0,unknown,5 May,198,1,-1,0,unknown,0


In [223]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns (job, marital, education, ...). Recent pandas versions no
# longer drop those silently in DataFrame.corr() and raise instead, so the
# numeric columns are selected explicitly.
df.select_dtypes(include='number').corr()

Unnamed: 0,age,credit_default,balance,housing_loan,personal_loan,last_contact_duration,number_of_contacts,pdays,previous_contacts,term_deposit
age,1.0,-0.017879,0.097783,-0.185513,-0.015655,-0.004648,0.00476,-0.023758,0.001288,0.025155
credit_default,-0.017879,1.0,-0.066745,-0.006025,0.077234,-0.010021,0.016822,-0.029979,-0.018329,-0.022419
balance,0.097783,-0.066745,1.0,-0.068768,-0.08435,0.02156,-0.014578,0.003435,0.016674,0.052838
housing_loan,-0.185513,-0.006025,-0.068768,1.0,0.041323,0.005075,-0.023599,0.124178,0.037076,-0.139173
personal_loan,-0.015655,0.077234,-0.08435,0.041323,1.0,-0.012412,0.00998,-0.022754,-0.011043,-0.068185
last_contact_duration,-0.004648,-0.010021,0.02156,0.005075,-0.012412,1.0,-0.08457,-0.001565,0.001203,0.394521
number_of_contacts,0.00476,0.016822,-0.014578,-0.023599,0.00998,-0.08457,1.0,-0.088628,-0.032855,-0.073172
pdays,-0.023758,-0.029979,0.003435,0.124178,-0.022754,-0.001565,-0.088628,1.0,0.45482,0.103621
previous_contacts,0.001288,-0.018329,0.016674,0.037076,-0.011043,0.001203,-0.032855,0.45482,1.0,0.093236
term_deposit,0.025155,-0.022419,0.052838,-0.139173,-0.068185,0.394521,-0.073172,0.103621,0.093236,1.0
