# Imports and data loading

In [33]:
import pandas as pd
import numpy as np

# Read the semicolon-separated CSV into a DataFrame, then preview its first ten rows.
df = pd.read_csv("bank-additional-full.csv", sep=';')
preview = df.head(n=10)
print(preview)
# Extra blank lines to visually separate this output from what follows.
print("\n")


   age          job  marital            education  default housing loan  \
0   56    housemaid  married             basic.4y       no      no   no   
1   57     services  married          high.school  unknown      no   no   
2   37     services  married          high.school       no     yes   no   
3   40       admin.  married             basic.6y       no      no   no   
4   56     services  married          high.school       no      no  yes   
5   45     services  married             basic.9y  unknown      no   no   
6   59       admin.  married  professional.course       no      no   no   
7   41  blue-collar  married              unknown  unknown      no   no   
8   24   technician   single  professional.course       no     yes   no   
9   25     services   single          high.school       no     yes   no   

     contact month day_of_week  ...  campaign  pdays  previous     poutcome  \
0  telephone   may         mon  ...         1    999         0  nonexistent   
1  telephone   m

In [34]:
# Column names, non-null counts and dtypes of the raw frame.
# DataFrame.info() prints the report itself and returns None, so it must not
# be wrapped in print() -- that would only append a stray "None" line.
df.info()
print("\n")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [35]:
# Count missing values per column and list the columns with the most gaps first.
null_counts = df.isnull().sum()
print(null_counts.sort_values(ascending=False))

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


In [36]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = df.describe()
print(summary_stats)

               age      duration      campaign         pdays      previous  \
count  41188.00000  41188.000000  41188.000000  41188.000000  41188.000000   
mean      40.02406    258.285010      2.567593    962.475454      0.172963   
std       10.42125    259.279249      2.770014    186.910907      0.494901   
min       17.00000      0.000000      1.000000      0.000000      0.000000   
25%       32.00000    102.000000      1.000000    999.000000      0.000000   
50%       38.00000    180.000000      2.000000    999.000000      0.000000   
75%       47.00000    319.000000      3.000000    999.000000      0.000000   
max       98.00000   4918.000000     56.000000    999.000000      7.000000   

       emp.var.rate  cons.price.idx  cons.conf.idx     euribor3m   nr.employed  
count  41188.000000    41188.000000   41188.000000  41188.000000  41188.000000  
mean       0.081886       93.575664     -40.502600      3.621291   5167.035911  
std        1.570960        0.578840       4.628198    

# 1) What dataset did you choose and why? (Include the link to the Kaggle page).

#### I chose a bank dataset (`bank-additional-full.csv`) with many columns because I think working with it will be more interesting.

## 2) Based on your diagnosis, what are the top 2-3 problems you see with this dataset? 
##    (e.g., "The 'revenue' column has many missing values," or "The 'runtime' column 
##    is stored as an object/string instead of a number.")

#### Several columns are stored with the object (string) dtype even though they are really binary yes/no flags,
#### and the day of the week is stored as text instead of in a numeric form that is easy to work with.

# Data cleaning

In [37]:
# Clean and numerically encode the raw frame.
#
# Encoding produced by this cell (all encoded columns end up as int8):
#   * day_of_week               -> 1..5 for mon..fri
#   * default, housing, loan, y -> 1 = 'yes', 0 = 'no', -1 = missing (was 'unknown')
#   * remaining text columns    -> pandas category codes, -1 = missing (was 'unknown')
# NOTE: missing values are stored as the sentinel -1, not as NaN, so
# df.isna() reports 0 for every column after this cell has run.

# Replace the 'unknown' placeholder with a real missing-value marker so the
# non-null counts printed below show how much data is actually missing.
df = df.replace('unknown', pd.NA)

# Convert categorical (text) columns to the pandas 'category' dtype
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    df[col] = df[col].astype('category')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would only append a stray "None" line.
df.info()

# Convert 'day_of_week' to an explicit weekday number.
# The mapping is applied to plain values and the result is stored as int8, so
# the column is no longer categorical and the generic cat.codes loop further
# down cannot overwrite 1..5 with alphabetical codes (fri=0, mon=1, thu=2, ...).
day_mapping = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5}
day_numbers = df['day_of_week'].astype(object).map(day_mapping)
if day_numbers.isna().any():
    # Fail loudly instead of silently encoding an unexpected or missing day.
    raise ValueError("day_of_week contains values not covered by day_mapping")
df['day_of_week'] = day_numbers.astype('int8')
print(df['day_of_week'].head())

# Convert 'yes'/'no' to 1/0 for binary columns; anything else (the former
# 'unknown' entries, now missing) becomes the sentinel -1.
binary_cols = ['default', 'housing', 'loan', 'y']
yes_no_mapping = {'yes': 1, 'no': 0}
for col in binary_cols:
    flags = df[col].astype(object).map(yes_no_mapping)
    df[col] = flags.fillna(-1).astype('int8')

# Encode remaining categorical variables numerically.
# cat.codes assigns one integer per category and -1 to missing values.
for col in df.select_dtypes(include='category').columns:
    df[col] = df[col].cat.codes

# Drop contact column as it may not be useful for analysis
df = df.drop(['contact'], axis=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             41188 non-null  int64   
 1   job             40858 non-null  category
 2   marital         41108 non-null  category
 3   education       39457 non-null  category
 4   default         32591 non-null  category
 5   housing         40198 non-null  category
 6   loan            40198 non-null  category
 7   contact         41188 non-null  category
 8   month           41188 non-null  category
 9   day_of_week     41188 non-null  category
 10  duration        41188 non-null  int64   
 11  campaign        41188 non-null  int64   
 12  pdays           41188 non-null  int64   
 13  previous        41188 non-null  int64   
 14  poutcome        41188 non-null  category
 15  emp.var.rate    41188 non-null  float64 
 16  cons.price.idx  41188 non-null  float64 
 17  cons.conf.id

In [None]:
# isna() reports 0 for every column, but that does not mean nothing was missing:
# the former 'unknown' entries were integer-encoded as -1 by the cleaning cell
# above, so they are no longer stored as NaN.
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

# Step 6: Verify Your Work

#### ○ After performing your cleaning steps, run df.info() and df.isnull().sum() again.

#### ○ In a new Markdown cell, briefly explain how your cleaning actions changed the output of these commands. (e.g., "After filling the missing values in the 'revenue' column, it now shows 0 nulls. After cleaning the 'runtime' column, its Dtype is now float64 instead of object.")

In [42]:
# Re-check column dtypes and non-null counts after cleaning.
# DataFrame.info() prints the report itself and returns None, so it is called
# directly; print(df.info()) would add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  int8   
 2   marital         41188 non-null  int8   
 3   education       41188 non-null  int8   
 4   default         41188 non-null  int8   
 5   housing         41188 non-null  int8   
 6   loan            41188 non-null  int8   
 7   month           41188 non-null  int8   
 8   day_of_week     41188 non-null  int8   
 9   duration        41188 non-null  int64  
 10  campaign        41188 non-null  int64  
 11  pdays           41188 non-null  int64  
 12  previous        41188 non-null  int64  
 13  poutcome        41188 non-null  int8   
 14  emp.var.rate    41188 non-null  float64
 15  cons.price.idx  41188 non-null  float64
 16  cons.conf.idx   41188 non-null  float64
 17  euribor3m       41188 non-null 

In [43]:
# Re-count missing values per column after cleaning, largest count first.
null_counts_after = df.isnull().sum()
print(null_counts_after.sort_values(ascending=False))

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


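#### After cleaning, the data is in much better shape for future analysis. I converted all text (object) columns to the pandas category dtype, mapped all yes/no columns to 1/0, encoded the remaining categorical columns as integers, converted day_of_week into numeric values (only the five weekdays mon–fri are mapped, not seven), and dropped the contact column, which we can't use. As a result, df.info() now shows int8 instead of object for those columns and 20 columns instead of 21. df.isnull().sum() shows 0 nulls both before and after cleaning, because the 'unknown' placeholders end up integer-encoded as -1 rather than stored as NaN.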