In [1]:
# Import the pandas library for data handling
import pandas as pd

# Read the bank dataset into a DataFrame.
# The file is semicolon-delimited rather than comma-delimited, so the
# separator has to be passed explicitly.
df = pd.read_csv('bank.csv', sep=';')

# --- Initial Inspection ---

# 1. Preview the first 5 rows, followed by a divider line.
#    A single print call with sep="\n" emits each piece on its own line.
print("--- First 5 Rows ---", df.head(), "\n" + "="*30 + "\n", sep="\n")

# 2. Summarise the columns: names, dtypes and non-null counts
print("--- Dataset Info ---")
df.info()

--- First 5 Rows ---
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  


--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520

In [2]:
# --- Step 3: Data Preprocessing ---

# Encode the target column 'y' from yes/no to 1/0.
# This cell overwrites df['y'] in place, so it must be safe to re-run:
# mapping an already-encoded column a second time would turn every label
# into NaN (.map() returns NaN for any value missing from the dict).
# We therefore only apply the mapping while the column is still text.
if pd.api.types.is_numeric_dtype(df['y']):
    y_encoded = df['y']
else:
    y_encoded = df['y'].map({'yes': 1, 'no': 0})

# Fail loudly instead of passing NaN / unexpected labels on to the model.
# (NaN is never "in" [0, 1], so this also catches unmapped values.)
if not y_encoded.isin([0, 1]).all():
    raise ValueError(
        "Target column 'y' must contain only 'yes'/'no' (or already-encoded "
        "1/0) values; reload the dataset and re-run this cell."
    )
df['y'] = y_encoded

# Identify all the columns that are text (dtype 'object').
# 'y' is numeric by now, so it is not picked up here.
# These are the columns we need to one-hot encode.
categorical_cols = df.select_dtypes(include='object').columns

# Apply one-hot encoding using pandas get_dummies.
# drop_first=True drops the first level of each category to avoid redundancy.
df_processed = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


# Separate the data into features (X) and the target (y).
# X contains all the columns we use for prediction
X = df_processed.drop('y', axis=1)

# y contains only the column we want to predict
y = df_processed['y']


# Look at the first few rows of the processed features (X)
print("--- First 5 rows of our processed features (X) ---")
print(X.head())

print(f"\nOur data now has {X.shape[1]} columns after one-hot encoding.")

--- First 5 rows of our processed features (X) ---
   age  balance  day  duration  campaign  pdays  previous  job_blue-collar  \
0   30     1787   19        79         1     -1         0            False   
1   33     4789   11       220         1    339         4            False   
2   35     1350   16       185         1    330         1            False   
3   30     1476    3       199         4     -1         0            False   
4   59        0    5       226         1     -1         0             True   

   job_entrepreneur  job_housemaid  ...  month_jul  month_jun  month_mar  \
0             False          False  ...      False      False      False   
1             False          False  ...      False      False      False   
2             False          False  ...      False      False      False   
3             False          False  ...      False       True      False   
4             False          False  ...      False      False      False   

   month_may  month_nov

In [8]:
# Import the function we need from the scikit-learn library
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (test_size=0.2) for evaluation.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Report the dimensions of each feature matrix as a sanity check on the split
for caption, features in (("Training data shape:", X_train),
                          ("Testing data shape:", X_test)):
    print(caption, features.shape)

Training data shape: (3616, 42)
Testing data shape: (905, 42)


In [7]:
!pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/15/fa/c61a787e35f05f17fc10523f567677ec4eeee5f95aa4798dbbbcd9625617/scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata
  Downloading scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.8.0 from https://files.pythonhosted.org/packages/ea/b5/29fece1a74c6a94247f8a6fb93f5b28b533338e9c34fdcc9cfe7a939a767/scipy-1.16.0-cp312-cp312-win_amd64.whl.metadata
  Using cached scipy-1.16.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Obtaining dependency information for joblib>=1.2.0 from https://files.pythonhosted.org/packages/7d/4f/1195bbac8e0c2acc5f740661631d8d750dc38d4a32b23ee5df3cde6f4e0d/joblib-1.5.1-py3-none-any.whl.metadata
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# Import the Decision Tree model from the scikit-learn library
from sklearn.tree import DecisionTreeClassifier

# Build the Decision Tree and train it on the training split in one step.
# fit() returns the estimator itself, so `model` is the fitted classifier.
# random_state=42 fixes the tree's internal randomness for repeatable results.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

print("Decision Tree model has been successfully trained!")

Decision Tree model has been successfully trained!


In [10]:
# Import the function for calculating accuracy
from sklearn.metrics import accuracy_score

# Predict a label for every row of the held-out test features
y_predictions = model.predict(X_test)

# Fraction of test rows where the prediction matches the true label.
# NOTE(review): accuracy alone can be misleading if the classes are imbalanced;
# consider comparing against a majority-class baseline (confirm class balance).
accuracy = accuracy_score(y_true=y_test, y_pred=y_predictions)

# Report the score as a percentage with two decimal places
print(f"Model Accuracy on the Test Set: {accuracy * 100:.2f}%")

Model Accuracy on the Test Set: 88.07%
