In [1]:
# Start coding...
import pandas as pd
import numpy as np

# Read in csv
marketing = pd.read_csv("bank_marketing.csv")

# Split into the three tables
client = marketing[["client_id", "age", "job", "marital", 
                    "education", "credit_default", "mortgage"]]
campaign = marketing[["client_id", "number_contacts", "month", "day", 
               "contact_duration", "previous_campaign_contacts", "previous_outcome", "campaign_outcome"]]
economics = marketing[["client_id", "cons_price_idx", "euribor_three_months"]]

In [2]:
## Editing the client dataset
# Clean education column
client.loc[:,"education"] = client["education"].str.replace(".", "_")
client.loc[:,"education"] = client["education"].replace("unknown", np.nan)

In [9]:
# Clean job column
client.loc[:, "job"] = client["job"].str.replace(".", "_")

# Clean and convert client columns to bool data type
for col in ["credit_default", "mortgage"]:
  client.loc[:,col] = client[col].map({"yes": 1,
                                 "no": 0,
                                 "unknown": 0})
  client.loc[:,col] = client[col].astype(bool)

  client.loc[:,col] = client[col].map({"yes": 1,
  client.loc[:,col] = client[col].astype(bool)
  client.loc[:,col] = client[col].map({"yes": 1,
  client.loc[:,col] = client[col].astype(bool)


### Campaign

In [10]:
# Editing the campaign dataset
# Change campaign_outcome to binary values
campaign.loc[:,"campaign_outcome"] = campaign["campaign_outcome"].map({"yes": 1, 
                                                                 "no": 0})

In [11]:
# Convert previous_outcome to binary values
campaign.loc[:,"previous_outcome"] = campaign["previous_outcome"].map({"success": 1, 
                                                                 "failure": 0,
                                                                 "nonexistent": 0})

In [14]:
# Add year column
campaign.loc[:,"year"] = "2022"

# Convert day to string
campaign.loc[:,"day"] = campaign["day"].astype(str)

# Add last_contact_date column
campaign.loc[:,"last_contact_date"] = campaign["year"] + "-" + campaign["month"] + "-" + campaign["day"]

In [15]:
campaign.head()

Unnamed: 0,client_id,number_contacts,month,day,contact_duration,previous_campaign_contacts,previous_outcome,campaign_outcome,year,last_contact_date
0,0,1,may,13,261,0,0,0,2022,2022-may-13
1,1,1,may,19,149,0,0,0,2022,2022-may-19
2,2,1,may,23,226,0,0,0,2022,2022-may-23
3,3,1,may,27,151,0,0,0,2022,2022-may-27
4,4,1,may,3,307,0,0,0,2022,2022-may-3


In [16]:
# Convert to datetime
campaign.loc[:,"last_contact_date"] = pd.to_datetime(campaign["last_contact_date"], 
                                               format="%Y-%b-%d")

In [17]:
# Clean and convert outcome columns to bool
for col in ["campaign_outcome", "previous_outcome"]:
  campaign.loc[:,col] = campaign[col].astype(bool)

In [18]:
# Drop unneccessary columns
campaign.drop(columns=["month", "day", "year"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  campaign.drop(columns=["month", "day", "year"], inplace=True)


In [19]:
campaign.head()

Unnamed: 0,client_id,number_contacts,contact_duration,previous_campaign_contacts,previous_outcome,campaign_outcome,last_contact_date
0,0,1,261,0,False,False,2022-05-13 00:00:00
1,1,1,149,0,False,False,2022-05-19 00:00:00
2,2,1,226,0,False,False,2022-05-23 00:00:00
3,3,1,151,0,False,False,2022-05-27 00:00:00
4,4,1,307,0,False,False,2022-05-03 00:00:00


In [21]:
client.head()

Unnamed: 0,client_id,age,job,marital,education,credit_default,mortgage
0,0,56,housemaid,married,basic_4y,True,True
1,1,57,services,married,high_school,True,True
2,2,37,services,married,high_school,True,True
3,3,40,admin_,married,basic_6y,True,True
4,4,56,services,married,high_school,True,True


In [22]:
economics.head()

Unnamed: 0,client_id,cons_price_idx,euribor_three_months
0,0,93.994,4.857
1,1,93.994,4.857
2,2,93.994,4.857
3,3,93.994,4.857
4,4,93.994,4.857


In [20]:
# Save tables to individual csv files
client.to_csv("client.csv", index=False)
campaign.to_csv("campaign.csv", index=False)
economics.to_csv("economics.csv", index=False)