In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import f1_score,classification_report,accuracy_score
import optuna
from optuna.samplers import TPESampler
%matplotlib inline

In [5]:
df=pd.read_csv("/content/data_sample.csv")
df.shape

(3079, 294)

In [6]:
df['target'] = np.where(df['ResponseRate'] == 0, 'B1',
                        np.where((df['ResponseRate'] > 0) & (df['ResponseRate'] < 0.15), 'B2',
                                 np.where(df['ResponseRate'] >= 0.15, 'B3', 'Not Specified')))

In [7]:
df['target'].value_counts()

B1    1635
B3     855
B2     589
Name: target, dtype: int64

In [8]:
# List the columns to remove
columns_to_remove = ['OfferHistoryID', 'ResponseRate']
# Drop the specified columns from the DataFrame
df.drop(columns=columns_to_remove, inplace=True)

In [9]:
df=df.fillna(0)

In [10]:
df.shape

(3079, 293)

In [11]:
# Splitting the dataset into X and y
X = df.drop('target', axis=1)  # Features: all columns except 'target'
y = df['target']

In [12]:
# Splitting dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**HYPERPARAMETER_TUNING**

In [15]:
def objective(trial):
    params = {
        #"iterations":500,
        #"learning_rate": 0.01,
        "max_depth": trial.suggest_int("max_depth", 3,10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1,10),
        "min_samples_split":trial.suggest_int("min_samples_split",2,10),
        "random_state": 42,
        # "subsample": trial.suggest_uniform('subsample', 0.5, 1.0),
        #"min_samples_leaf": 1,
    }

    model = DecisionTreeClassifier(**params)
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    f1 = f1_score(y_test, y_pred_test,average='weighted')
    return f1

sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=200)

print('Best hyperparameters:', study.best_params)
print('Best F1:', study.best_value)

[I 2024-04-02 12:45:47,468] A new study created in memory with name: no-name-0db46f1c-b045-45b2-a624-3c95074eb3a6
[I 2024-04-02 12:45:47,514] Trial 0 finished with value: 0.9439589171457409 and parameters: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 8}. Best is trial 0 with value: 0.9439589171457409.
[I 2024-04-02 12:45:47,560] Trial 1 finished with value: 0.9342350668955507 and parameters: {'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 3}. Best is trial 0 with value: 0.9439589171457409.
[I 2024-04-02 12:45:47,599] Trial 2 finished with value: 0.9356675495945435 and parameters: {'max_depth': 3, 'min_samples_leaf': 9, 'min_samples_split': 7}. Best is trial 0 with value: 0.9439589171457409.
[I 2024-04-02 12:45:47,639] Trial 3 finished with value: 0.9283687019859084 and parameters: {'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 10}. Best is trial 0 with value: 0.9439589171457409.
[I 2024-04-02 12:45:47,679] Trial 4 finished with value: 0.91141

Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 9}
Best F1: 0.9471465266704984
