In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load the full dataset
df_full = pd.read_csv('bank-full.csv', delimiter=';')
df_full.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Display basic information about the datasets
print("Full dataset info:")
print(df_full.info())

Full dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None


In [5]:
# Load the 10% dataset
df_small = pd.read_csv('bank.csv', delimiter=';')
df_small.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [6]:
print("Small dataset info:")
print(df_small.info())

Small dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB
None


In [81]:
# Preprocess the data (cleaning up column names and encoding categorical variables)
df_full.columns = df_full.columns.str.replace('"', '').str.title()
df_small.columns = df_small.columns.str.replace('"', '').str.title()

In [82]:
# Encode categorical variables
label_encoders = {}
for column in df_full.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df_full[column] = label_encoders[column].fit_transform(df_full[column])
    df_small[column] = label_encoders[column].transform(df_small[column])

In [83]:
# Define features (X) and target (y)
X_full = df_full.drop('Y', axis=1)
y_full = df_full['Y']

In [84]:
X_small = df_small.drop('Y', axis=1)
y_small = df_small['Y']

In [86]:
# Split the full dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.3, random_state=42)

In [87]:
# Train the decision tree classifier on the full dataset
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [88]:
# Predict on the small dataset
y_pred = clf.predict(X_small)

In [89]:
# Evaluate the model on the small dataset
print("Accuracy on small dataset:", accuracy_score(y_small, y_pred))
print("Classification Report on small dataset:\n", classification_report(y_small, y_pred))
print("Confusion Matrix on small dataset:\n", confusion_matrix(y_small, y_pred))

Accuracy on small dataset: 0.962397699623977
Classification Report on small dataset:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      4000
           1       0.83      0.84      0.84       521

    accuracy                           0.96      4521
   macro avg       0.91      0.91      0.91      4521
weighted avg       0.96      0.96      0.96      4521

Confusion Matrix on small dataset:
 [[3912   88]
 [  82  439]]
