In [1]:
import numpy as np
import pandas as pd
import catboost
import imblearn
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import matplotlib.pyplot as plt
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import matthews_corrcoef

In [2]:
df_org = pd.read_csv('Top_df.csv')
df_org.shape

(335113, 21)

In [3]:
df_org = df_org.drop_duplicates()

In [4]:
df_org.shape

(3263, 21)

In [5]:
df= df_org[['ContactMonth', 'ContactHour', 'os_112', 'os_110', 's_247_71', 's_239_8', 's_247_75', 's_239_7', 's_248_74', 's_247_77', 's_239_9', 's_241_68', 's_241_69', 's_247_78','s_239_10', 's_247_73','target']]

In [6]:
df.shape

(3263, 17)

In [7]:
df_0 = df[df['target'] == 0]
df_1 = df[df['target'] == 1]

In [8]:
# Find the size of the smaller group
min_size = min(len(df_0), len(df_1))

In [9]:
df_0_sampled = df_0.sample(n=min_size, random_state=42)
df_1_sampled = df_1.sample(n=min_size, random_state=42)


In [10]:
# Combine the two datasets back
df_balanced = pd.concat([df_0_sampled, df_1_sampled])

In [11]:
df_balanced.shape

(1964, 17)

In [12]:
# Assuming df is your original DataFrame
# Identify the indices of the downsampled df_0_sampled
sampled_indices = df_0_sampled.index

# Filter the original df_0 to get the rows that were not included in the downsampled subset
df_0_remaining = df_0.drop(sampled_indices)

# df_0_remaining now contains all the rows with target 0 that were not part of the balanced dataset


In [13]:
df_0_remaining.shape

(1299, 17)

In [14]:
# Assuming df is your DataFrame containing the remaining values of target 0
X_remaining = df_0_remaining.drop('target', axis=1)  # Features
y_remaining = df_0_remaining['target'] 

In [27]:
# for column in X_remaining.columns:
#     # Convert to string, including NaNs which become 'nan'
#     X_remaining[column] = X_remaining[column].astype(str)

#     # Optional: Replace 'nan' with a placeholder such as 'missing'
#     X_remaining[column] = X_remaining[column].replace('nan', 'missing')

In [12]:
# df_0_remaining.to_csv('remaining.csv', index=False)

In [15]:
df['target'].value_counts()

target
0    2281
1     982
Name: count, dtype: int64

In [16]:
df['target'].value_counts(normalize=True)

target
0    0.69905
1    0.30095
Name: proportion, dtype: float64

In [430]:
# # Splitting the dataset into X and y
# X = df.drop('target', axis=1)  # Features: all columns except 'target'
# y = df['target']

In [9]:
# # Separate majority and minority classes
# df_majority = df[df.target==0]
# df_minority = df[df.target==1]

In [10]:
# df_minority.shape

(17940, 17)

In [11]:
# df_majority.shape

(317173, 17)

In [12]:
# # Downsample majority class
# df_majority_downsampled = df_majority.sample(n=17940, replace=True, random_state=123)

In [13]:
# df_majority_downsampled.shape

(17940, 17)

In [14]:
# # Combine minority class with downsampled majority class
# df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [15]:
# df_downsampled.shape

(35880, 17)

In [17]:
# Splitting the dataset into X and y
X = df_balanced.drop('target', axis=1)  # Features: all columns except 'target'
y = df_balanced['target']

In [18]:
df_balanced['target'].value_counts()

target
0    982
1    982
Name: count, dtype: int64

In [19]:
df_balanced['target'].value_counts(normalize=True)

target
0    0.5
1    0.5
Name: proportion, dtype: float64

In [20]:
categorical_features = ['ContactMonth', 'ContactHour', 'os_112', 'os_110', 's_247_71',
       's_239_8', 's_247_75', 's_239_7', 's_248_74', 's_247_77', 's_239_9',
       's_241_68', 's_241_69', 's_247_78', 's_239_10', 's_247_73']

In [21]:
for column in categorical_features:
    # Convert to string, including NaNs which become 'nan'
    X[column] = X[column].astype(str)

    # Optional: Replace 'nan' with a placeholder such as 'missing'
    X[column] = X[column].replace('nan', 'missing')

In [22]:
# # Splitting dataset into training and testing set
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Split the dataset into 70% training and 30% temporary test set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temporary test set into 50% test and 50% validation, which makes it 15% of the original dataset each
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [24]:
catboost_model = CatBoostClassifier(
    iterations = 1000,
    learning_rate = 0.01,
    reg_lambda=0.5,
    eval_metric='Logloss',
    max_depth = 9,
    colsample_bylevel =  0.5872678,
    random_strength = 2.24931,
    random_state = 42,
    bootstrap_type = 'Bayesian',
    bagging_temperature= 1.867636,
    auto_class_weights = 'Balanced'
    )

catboost_model.fit(X_train,y_train,eval_set = [(X_test, y_test)],cat_features=categorical_features, use_best_model=True, early_stopping_rounds=50)

0:	learn: 0.6872087	test: 0.6869422	best: 0.6869422 (0)	total: 69.1ms	remaining: 1m 9s
1:	learn: 0.6805121	test: 0.6795092	best: 0.6795092 (1)	total: 72ms	remaining: 35.9s
2:	learn: 0.6732827	test: 0.6713210	best: 0.6713210 (2)	total: 73.4ms	remaining: 24.4s
3:	learn: 0.6676943	test: 0.6654482	best: 0.6654482 (3)	total: 84.7ms	remaining: 21.1s
4:	learn: 0.6599730	test: 0.6567030	best: 0.6567030 (4)	total: 86ms	remaining: 17.1s
5:	learn: 0.6537614	test: 0.6498134	best: 0.6498134 (5)	total: 87.7ms	remaining: 14.5s
6:	learn: 0.6490394	test: 0.6451737	best: 0.6451737 (6)	total: 92ms	remaining: 13.1s
7:	learn: 0.6425629	test: 0.6378265	best: 0.6378265 (7)	total: 93.4ms	remaining: 11.6s
8:	learn: 0.6374073	test: 0.6321471	best: 0.6321471 (8)	total: 96.6ms	remaining: 10.6s
9:	learn: 0.6311815	test: 0.6250638	best: 0.6250638 (9)	total: 97.9ms	remaining: 9.7s
10:	learn: 0.6261564	test: 0.6194797	best: 0.6194797 (10)	total: 110ms	remaining: 9.92s
11:	learn: 0.6217183	test: 0.6143721	best: 0.6143

<catboost.core.CatBoostClassifier at 0x7d11a06a19c0>

In [28]:
y_pred_test = catboost_model.predict(X_test)
y_pred_train = catboost_model.predict(X_train)
y_pred_val = catboost_model.predict(X_val)
y_pred_rem = catboost_model.predict(X_remaining)

### TRAIN

In [29]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[317 367]
 [ 10 680]]


In [30]:
# Evaluating the model
accuracy = accuracy_score(y_train, y_pred_train)
print(f"Accuracy on trn set: {accuracy}")

Accuracy on trn set: 0.7256186317321689


In [31]:
# Evaluating the model
f1 = f1_score(y_train, y_pred_train)
print(f"f1 on train set: {f1}")

f1 on train set: 0.7829591249280369


In [32]:
# Evaluating the model
f1 = f1_score(y_train, y_pred_train,average='weighted')
print(f"f1 on train set: {f1}")

f1 on train set: 0.7053708017906012


In [33]:
matthews_corrcoef(y_train, y_pred_train)

0.5271213994827005

### TEST

In [34]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[ 77  72]
 [  2 144]]


In [35]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.7491525423728813


In [36]:
# Evaluating the model
f1 = f1_score(y_test, y_pred_test)
print(f"f1 on test set: {f1}")

f1 on test set: 0.7955801104972375


In [37]:
# Evaluating the model
f1 = f1_score(y_test, y_pred_test,average='weighted')
print(f"f1 on test set: {f1}")

f1 on test set: 0.734898464439965


In [38]:
matthews_corrcoef(y_test, y_pred_test)

0.5680232352086584

### Validation

In [39]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_val, y_pred_val)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[ 78  71]
 [  2 144]]


In [40]:
# Evaluating the model
accuracy = accuracy_score(y_val, y_pred_val)
print(f"Accuracy on val set: {accuracy}")

Accuracy on val set: 0.752542372881356


In [41]:
# Evaluating the model
f1 = f1_score(y_val, y_pred_val)
print(f"f1 on val set: {f1}")

f1 on val set: 0.7977839335180056


In [42]:
# Evaluating the model
f1 = f1_score(y_val, y_pred_val,average='weighted')
print(f"f1 on val set: {f1}")

f1 on val set: 0.7389106362703131


In [43]:
matthews_corrcoef(y_val, y_pred_val)

0.5733208823653075

### Remaining train

In [44]:
confusion_matrix(y_remaining, y_pred_rem)

array([[613, 686],
       [  0,   0]])

In [45]:
accuracy_score(y_remaining, y_pred_rem)

0.4719014626635874

In [46]:
f1_score(y_remaining, y_pred_rem,average='weighted')

0.641213389121339

In [47]:
f1_score(y_remaining, y_pred_rem)

0.0

In [48]:
matthews_corrcoef(y_remaining, y_pred_rem)

0.0

In [38]:
# # Predict probabilities for the positive class
# y_prob_test = catboost_model.predict_proba(X_test)[:, 1]

In [39]:
# # Calculate F1 scores for different threshold values
# threshold_values = np.arange(0, 1.0, 0.001)
# f1_scores = []
# for threshold in threshold_values:
#     y_pred_threshold = (y_prob_test > threshold).astype(int)
#     f1_scores.append(f1_score(y_test, y_pred_threshold))

# # Find the threshold value with the highest F1 score
# best_threshold = threshold_values[np.argmax(f1_scores)]
# best_f1_score = np.max(f1_scores)

# print("Best Threshold:", best_threshold)
# print("Best F1 Score:", best_f1_score)

Best Threshold: 0.107
Best F1 Score: 0.7776890864658422


In [466]:
def objective(trial):
    params = {
        "iterations":500,
        #"logging_level":'silent',
        #"verbose":0,
        "learning_rate": 0.01,
        "max_depth": trial.suggest_int("max_depth", 3,10),
        # "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20,100),
        "random_state": 42,
        # "subsample": trial.suggest_uniform('subsample', 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float('colsample_bylevel', 0.3, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.1, 5.0),
        "bootstrap_type": 'Bayesian',
        "eval_metric":'Logloss',
        "auto_class_weights":'Balanced'
    }

    model = catboost.CatBoostClassifier(**params, silent=True)
    model.fit(X_train, y_train)
    # model.fit(pool)
    y_pred_test = model.predict(X_test)
    f1 = f1_score(y_test, y_pred_test)
    # y_pred_val = model.predict(X_val)
    # f1 = f1_score(y_val, y_pred_val)
    return f1

sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=200)
print('Best hyperparameters:', study.best_params)
print('Best F1:', study.best_value)

[I 2024-04-10 14:52:17,763] A new study created in memory with name: no-name-e0e9c395-4134-4344-b2c7-311f9b9c6c0c
[I 2024-04-10 14:52:19,095] Trial 0 finished with value: 0.773157441927158 and parameters: {'max_depth': 5, 'colsample_bylevel': 0.9655000144869412, 'random_strength': 7.587945476302646, 'bagging_temperature': 3.0334265725654794}. Best is trial 0 with value: 0.773157441927158.
[I 2024-04-10 14:52:20,267] Trial 1 finished with value: 0.773157441927158 and parameters: {'max_depth': 4, 'colsample_bylevel': 0.40919616423534183, 'random_strength': 1.5227525095137953, 'bagging_temperature': 4.344263114297182}. Best is trial 0 with value: 0.773157441927158.
[I 2024-04-10 14:52:21,985] Trial 2 finished with value: 0.7728256166161834 and parameters: {'max_depth': 7, 'colsample_bylevel': 0.7956508044572318, 'random_strength': 1.185260448662222, 'bagging_temperature': 4.852558275593772}. Best is trial 0 with value: 0.773157441927158.
[I 2024-04-10 14:52:23,630] Trial 3 finished with v

Best hyperparameters: {'max_depth': 9, 'colsample_bylevel': 0.5872678982438706, 'random_strength': 2.249315537973288, 'bagging_temperature': 1.8676359039328798}
Best F1: 0.7738043946574752
