In [1]:
%%time
!pip install pandas matplotlib seaborn numpy

CPU times: total: 31.2 ms
Wall time: 2.77 s



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Render matplotlib figures in the notebook's cell output.
%matplotlib inline
# Apply the 'seaborn-v0_8' matplotlib style sheet to figures created from here on.
plt.style.use('seaborn-v0_8')

# Create output directory for figures.
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs('notebook_output', exist_ok=True)

In [3]:

# Read the banking dataset from its CSV file into a DataFrame.
banking = pd.read_csv('temp_csv/banking.csv')

# Quick sanity preview: dimensions, the first rows, and the column names.
print("data shape:", banking.shape)
print("\nFirst few rows:", banking.head(), sep="\n")
print("Columns:", list(banking.columns))


data shape: (41188, 8)

First few rows:
   age          job  marital          education  default housing loan  \
0   44  blue-collar  married           basic.4y  unknown     yes   no   
1   53   technician  married            unknown       no      no   no   
2   28   management   single  university.degree       no     yes   no   
3   39     services  married        high.school       no      no   no   
4   55      retired  married           basic.4y       no     yes   no   

   duration  
0       210  
1       138  
2       339  
3       185  
4       137  
Columns: ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'duration']


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import os  # hoisted from mid-cell so this cell's imports sit together

# Set the style for seaborn
sns.set(style='whitegrid')

# Every figure produced by this cell is written into FIGURE_DIR.
# savefig does not create missing directories, so the directory that is
# created here must be exactly the one used in the save paths.
FIGURE_DIR = os.path.join('notebook_output', 'banking')
os.makedirs(FIGURE_DIR, exist_ok=True)


def save_and_close(fig, filename):
    """Save ``fig`` as ``FIGURE_DIR/filename`` and close it.

    Closing the figure after saving keeps the many figures created in this
    cell from accumulating in memory.
    """
    fig.savefig(os.path.join(FIGURE_DIR, filename))
    plt.close(fig)


def plot_distribution(data, column, title, xlabel, color, filename):
    """Save a 30-bin histogram with a KDE overlay for ``data[column]``.

    Parameters
    ----------
    data : DataFrame holding the column to plot.
    column : name of the column to plot.
    title, xlabel : figure title and x-axis label (y-axis is 'Frequency').
    color : colour passed to seaborn for the bars and KDE line.
    filename : file name (inside FIGURE_DIR) of the saved image.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data[column], bins=30, kde=True, color=color, ax=ax)
    ax.set(title=title, xlabel=xlabel, ylabel='Frequency')
    save_and_close(fig, filename)


def plot_box(data, x, y, title, xlabel, ylabel, filename, rotate_xticks=None):
    """Save a boxplot of column ``y`` grouped by the categories of column ``x``.

    Parameters
    ----------
    data : DataFrame holding both columns.
    x, y : names of the grouping column and the value column.
    title, xlabel, ylabel : figure title and axis labels.
    filename : file name (inside FIGURE_DIR) of the saved image.
    rotate_xticks : optional rotation, in degrees, for the x tick labels;
        ``None`` leaves them unrotated.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(x=x, y=y, data=data, ax=ax)
    if rotate_xticks is not None:
        ax.tick_params(axis='x', labelrotation=rotate_xticks)
    ax.set(title=title, xlabel=xlabel, ylabel=ylabel)
    save_and_close(fig, filename)


def plot_counts(data, column, title, label, filename, horizontal=False, order=None):
    """Save a bar chart of how often each category of ``data[column]`` occurs.

    Parameters
    ----------
    data : DataFrame holding the column to count.
    column : name of the categorical column.
    title : figure title.
    label : axis label for the category axis (the other axis is 'Count').
    filename : file name (inside FIGURE_DIR) of the saved image.
    horizontal : if True, categories go on the y-axis and counts on the x-axis.
    order : optional explicit category order, passed through to seaborn.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    if horizontal:
        sns.countplot(y=column, data=data, order=order, ax=ax)
        ax.set(title=title, xlabel='Count', ylabel=label)
    else:
        sns.countplot(x=column, data=data, order=order, ax=ax)
        ax.set(title=title, xlabel=label, ylabel='Count')
    save_and_close(fig, filename)


# Visualize the distributions of the numerical columns
plot_distribution(banking, 'age', 'Age Distribution', 'Age', 'blue',
                  'age_distribution.png')
plot_distribution(banking, 'duration', 'Duration Distribution', 'Duration', 'green',
                  'duration_distribution.png')

# Boxplot for age by job (job names are rotated 45 degrees on the x-axis)
plot_box(banking, 'job', 'age', 'Age by Job', 'Job', 'Age',
         'age_by_job.png', rotate_xticks=45)

# Boxplot for duration by marital status
plot_box(banking, 'marital', 'duration', 'Duration by Marital Status',
         'Marital Status', 'Duration', 'duration_by_marital.png')

# Correlation heatmap for numerical features
numerical_cols = ['age', 'duration']
correlation_matrix = banking[numerical_cols].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
save_and_close(fig, 'correlation_heatmap.png')

# Countplots for categorical features.
# Job is drawn horizontally, with bars ordered from most to least frequent.
plot_counts(banking, 'job', 'Job Distribution', 'Job', 'job_distribution.png',
            horizontal=True, order=banking['job'].value_counts().index)

# The remaining categorical columns share one layout: (column, title, label, file).
for column, title, label, filename in [
    ('marital', 'Marital Status Distribution', 'Marital Status',
     'marital_status_distribution.png'),
    ('education', 'Education Distribution', 'Education',
     'education_distribution.png'),
    ('default', 'Default Distribution', 'Default',
     'default_distribution.png'),
    ('housing', 'Housing Loan Distribution', 'Housing Loan',
     'housing_loan_distribution.png'),
    ('loan', 'Personal Loan Distribution', 'Personal Loan',
     'personal_loan_distribution.png'),
]:
    plot_counts(banking, column, title, label, filename)