# Bank Marketing

In [13]:
import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Locations of the dataset archive and of the directory it is unpacked into
extracted_folder = 'Bank_Marketing_Data'
zip_file_path = 'Bank Marketing Dataset.zip'

# Unpack every member of the archive into the target directory;
# the `with` block closes the archive once extraction is done
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extracted_folder)

# List what ended up in the target directory as a sanity check
extracted_files = os.listdir(extracted_folder)
print(f"Files in extracted folder: {extracted_files}")

Files in extracted folder: ['bank-full.csv', 'bank-names.txt', 'bank.csv', 'case study.ipynb']


In [4]:
# Path of the CSV to analyse (the archive also ships a 'bank.csv'; adjust if needed)
csv_file_name = os.path.join(extracted_folder, 'bank-full.csv')

# Read the CSV into a DataFrame
df = pd.read_csv(csv_file_name)

# Quick look at the data to verify it loaded as expected: first rows, then column names
preview_rows = df.head()
print(preview_rows)
column_names = df.columns
print(column_names)

# Data Exploration
# Basic descriptive statistics
summary_stats = df.describe()
print(summary_stats)

# Number of missing (NaN) entries per column
missing_per_column = df.isnull().sum()
print(missing_per_column)

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 

In [5]:
# Data Preprocessing
# Clean column names by removing any leading/trailing spaces
df.columns = df.columns.str.strip()

# Impute NaNs: median for numerical columns, most frequent value for categorical ones.
# NOTE(review): the head() output shows literal 'unknown' strings (e.g. in job,
# education, contact, poutcome); fillna() leaves those untouched - confirm
# whether they should be treated as missing values as well.
for num_col in ['age', 'balance']:
    df[num_col] = df[num_col].fillna(df[num_col].median())
for cat_col in ['job', 'marital', 'education']:
    df[cat_col] = df[cat_col].fillna(df[cat_col].mode()[0])

# Convert 'y' column (target) to binary labels (yes=1, no=0).
# The mapping also accepts 0/1 so that re-running this cell is a no-op: a plain
# {'yes': 1, 'no': 0} map would turn an already-encoded column into all-NaN.
target_map = {'yes': 1, 'no': 0, 1: 1, 0: 0}
encoded_target = df['y'].map(target_map)

# Fail loudly on labels outside the mapping instead of silently producing NaN targets
unmapped = encoded_target.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'y'].unique().tolist()
    raise ValueError(f"Unexpected values in target column 'y': {unexpected}")

df['y'] = encoded_target.astype('int64')


In [8]:
# Binary yes/no features ('loan', 'housing', 'default') are converted to integer codes
binary_columns = ['loan', 'housing', 'default']
label_encoder = LabelEncoder()

# The same encoder is re-fit on each column in turn ('no' -> 0, 'yes' -> 1),
# and the encoded columns replace the originals in df
df[binary_columns] = df[binary_columns].apply(label_encoder.fit_transform)

In [9]:
# Step 7: One-hot encoding for other categorical columns
categorical_cols = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']

# Encode only the columns that are still present. This cell rebinds `df`, so on a
# second execution the raw categorical columns are already gone and calling
# get_dummies with them would raise a KeyError.
pending_categoricals = [c for c in categorical_cols if c in df.columns]
if pending_categoricals:
    df = pd.get_dummies(df, columns=pending_categoricals, drop_first=True)

# Step 8: Normalize numerical features (ONLY numerical columns)
numerical_cols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']

# Apply StandardScaler to numerical columns only.
# NOTE(review): the scaler is fit on every row of df, i.e. before any train/test
# split, so its mean/variance include rows that are later held out for testing -
# confirm this is acceptable or re-fit on the training rows only.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [14]:
# Step 9: Prepare features (X) and target (y)
X = df.drop('y', axis=1)  # Drop the target column 'y' to get the features
y = df['y']  # The target column is 'y'

# Step 10: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 10b: Standardize the numerical features with statistics from the TRAINING rows only.
# The columns in `numerical_cols` reach this cell scaled with a mean/variance computed
# over all rows, test rows included, which leaks test-set information into training.
# Standardization is an affine transform, so re-fitting a scaler on the training rows
# yields exactly the values that train-only scaling of the raw data would give.
# Explicit copies keep the assignments from touching (or warning about) `X`/`df`.
X_train, X_test = X_train.copy(), X_test.copy()
train_scaler = StandardScaler().fit(X_train[numerical_cols])
X_train[numerical_cols] = train_scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = train_scaler.transform(X_test[numerical_cols])

# Step 11: Initialize the Logistic Regression model
# class_weight='balanced' re-weights the classes to compensate for the rare positive class
model = LogisticRegression(max_iter=1000, class_weight='balanced')

# Step 12: Train the model
model.fit(X_train, y_train)

# Step 13: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 14: Evaluate the model
# (metrics can differ marginally from runs that used the full-data scaling)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Accuracy: 0.8425301338051532
Confusion Matrix:
[[6713 1239]
 [ 185  906]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.84      0.90      7952
           1       0.42      0.83      0.56      1091

    accuracy                           0.84      9043
   macro avg       0.70      0.84      0.73      9043
weighted avg       0.91      0.84      0.86      9043

