In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Build a reproducible synthetic dataset
# Seed the legacy global NumPy RNG so every run draws exactly the same rows.
np.random.seed(42)

n_samples = 1000  # number of rows to simulate

# Vocabularies for the categorical columns
yes_no = ["yes", "no"]
job_levels = ["admin.", "technician", "blue-collar", "retired", "student"]
marital_levels = ["married", "single", "divorced"]
education_levels = ["primary", "secondary", "tertiary", "unknown"]
contact_levels = ["cellular", "telephone", "unknown"]
month_levels = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
poutcome_levels = ["success", "failure", "unknown"]

# Every column is drawn from the same global RNG, so the order of the draws
# below determines the generated values -- do not reorder them.
# Reminder: np.random.randint(low, high) excludes `high`.
columns = {}
columns["age"] = np.random.randint(18, 70, size=n_samples)  # ages 18..69
columns["job"] = np.random.choice(job_levels, size=n_samples)
columns["marital"] = np.random.choice(marital_levels, size=n_samples)
columns["education"] = np.random.choice(education_levels, size=n_samples)
columns["default"] = np.random.choice(yes_no, size=n_samples)
columns["balance"] = np.random.randint(-2000, 5000, size=n_samples)  # account balance, -2000..4999
columns["housing"] = np.random.choice(yes_no, size=n_samples)
columns["loan"] = np.random.choice(yes_no, size=n_samples)
columns["contact"] = np.random.choice(contact_levels, size=n_samples)
# Day of the month, 1..30: day 31 is never generated.
# NOTE(review): use 32 as the bound if a full month is intended; doing so
# changes the random stream and therefore every downstream result.
columns["day"] = np.random.randint(1, 31, size=n_samples)
columns["month"] = np.random.choice(month_levels, size=n_samples)
columns["duration"] = np.random.randint(0, 5000, size=n_samples)  # duration of the last call, 0..4999
columns["campaign"] = np.random.randint(1, 10, size=n_samples)  # number of contacts performed, 1..9
columns["previous"] = np.random.randint(0, 5, size=n_samples)  # number of earlier contacts, 0..4
columns["poutcome"] = np.random.choice(poutcome_levels, size=n_samples)
# Target: subscribed or not. Drawn independently of every feature column.
columns["y"] = np.random.choice(yes_no, size=n_samples)

data = pd.DataFrame(columns)

# Step 2: Preprocess the data
# Snapshot the object-dtype (text) columns before any of them is overwritten.
categorical_columns = data.select_dtypes(include=['object']).columns

# One dedicated encoder per text column, keyed by column name, so the fitted
# encoders stay available after the loop.
label_encoders = {name: LabelEncoder() for name in categorical_columns}

# Fit each encoder on its column and overwrite that column in `data` with the
# encoded values. This mutates `data` in place, target column "y" included.
for column, le in label_encoders.items():
    data[column] = le.fit_transform(data[column])

# Separate the predictors from the target column
target_column = "y"
X = data.drop(columns=target_column)  # Features: every column except the target
y = data[target_column]  # Target

# Step 3: Split the data
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 4: Train the Decision Tree Classifier
# All hyper-parameters are left at their defaults; only random_state is fixed.
# fit() returns the estimator itself, so construction and training are chained.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Step 5: Evaluate the model on the held-out test split
y_pred = clf.predict(X_test)

# (heading, value) pairs, printed in this order. Each heading is passed to
# print() as its own argument so the emitted text keeps print's default
# single-space separator between heading and value.
evaluation = [
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
]
for heading, value in evaluation:
    print(heading, value)


Accuracy: 0.47
Confusion Matrix:
 [[70 82]
 [77 71]]
Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.46      0.47       152
           1       0.46      0.48      0.47       148

    accuracy                           0.47       300
   macro avg       0.47      0.47      0.47       300
weighted avg       0.47      0.47      0.47       300

