In [1]:
import numpy as np
import pandas as pd
import catboost
import imblearn
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import matplotlib.pyplot as plt
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import matthews_corrcoef

In [2]:
df= pd.read_csv('Top_df.csv')
df.shape

(335113, 21)

In [3]:
# Selecting columns with data type 'object' (string) for df1
df1 = df.select_dtypes(include='object')

# Selecting columns with data type 'int64' for df2
df2 = df.select_dtypes(include=['int64','float64'])

In [4]:
# Convert all columns to categorical data type
df2 = df2.astype('category')

In [5]:
df2['target'] = df2['target'].astype('int')

### TF-IDF

In [6]:
# List of columns to transform with TF-IDF
columns_to_transform = ['os_105','os_108', 'os_109', 'os_111']

# Concatenate the text from these columns into a single series of strings
# Each row will contain the concatenated text from all specified columns
text_data = df1[columns_to_transform].apply(lambda x: ' '.join(x.dropna()), axis=1)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2),max_features=500,max_df=0.99, min_df=0.01)

# Fit and transform the concatenated text data to get the TF-IDF features
tfidf_matrix = vectorizer.fit_transform(text_data)

In [7]:
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [8]:
df2.shape,tfidf_df.shape

((335113, 17), (335113, 193))

In [9]:
# Concatenate the dataframes
Top_df = pd.concat([df2.reset_index(drop=True),tfidf_df.reset_index(drop=True)], axis=1)

In [10]:
Top_df.shape

(335113, 210)

In [11]:
# Splitting the dataset into X and y
X = Top_df.drop('target', axis=1)  # Features: all columns except 'target'
y = Top_df['target']

In [12]:
df_0 = Top_df[df['target'] == 0]
df_1 = Top_df[df['target'] == 1]

In [13]:
# Find the size of the smaller group
min_size = min(len(df_0), len(df_1))

In [14]:
df_0_sampled = df_0.sample(n=min_size, random_state=42)
df_1_sampled = df_1.sample(n=min_size, random_state=42)

In [15]:
# Combine the two datasets back
df_balanced = pd.concat([df_0_sampled, df_1_sampled])

In [16]:
df_balanced.shape

(35880, 210)

In [17]:
# Assuming df is your original DataFrame
# Identify the indices of the downsampled df_0_sampled
sampled_indices = df_0_sampled.index

# Filter the original df_0 to get the rows that were not included in the downsampled subset
df_0_remaining = df_0.drop(sampled_indices)

# df_0_remaining now contains all the rows with target 0 that were not part of the balanced dataset


In [18]:
df_0_remaining.shape

(299233, 210)

In [19]:
# Assuming df is your DataFrame containing the remaining values of target 0
X_remaining = df_0_remaining.drop('target', axis=1)  # Features
y_remaining = df_0_remaining['target'] 

In [65]:
# # Splitting the dataset into X and y
# X = df.drop('target', axis=1)  # Features: all columns except 'target'
# y = df['target']

In [66]:
# # Separate majority and minority classes
# df_majority = df[df.target==0]
# df_minority = df[df.target==1]

In [67]:
# df_minority.shape

In [68]:
# df_majority.shape

In [69]:
# # Downsample majority class
# df_majority_downsampled = df_majority.sample(n=17940, replace=True, random_state=123)

In [70]:
# df_majority_downsampled.shape

In [71]:
# # Combine minority class with downsampled majority class
# df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [72]:
# df_downsampled.shape

In [22]:
# Splitting the dataset into X and y
X = df_balanced.drop('target', axis=1)  # Features: all columns except 'target'
y = df_balanced['target']

In [23]:
df_balanced['target'].value_counts()

target
0    17940
1    17940
Name: count, dtype: int64

In [24]:
df_balanced['target'].value_counts(normalize=True)

target
0    0.5
1    0.5
Name: proportion, dtype: float64

In [27]:
# Split the dataset into 70% training and 30% temporary test set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temporary test set into 50% test and 50% validation, which makes it 15% of the original dataset each
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [59]:
X_train.shape,X_test.shape,X_val.shape

((25116, 209), (5382, 209), (5382, 209))

In [28]:
categorical_features = ['ContactMonth', 'ContactHour', 'os_112', 'os_110', 's_247_71',
       's_239_8', 's_247_75', 's_239_7', 's_248_74', 's_247_77', 's_239_9',
       's_241_68', 's_241_69', 's_247_78', 's_239_10', 's_247_73']

In [50]:
for feature in categorical_features:
    X_train[feature] = X_train[feature].astype(str)
    X_test[feature] = X_test[feature].astype(str)
    X_val[feature] = X_val[feature].astype(str)
    X_remaining[feature] = X_remaining[feature].astype(str)

In [30]:
catboost_model = CatBoostClassifier(
    iterations = 1000,
    learning_rate = 0.01,
    reg_lambda=0.5,
    eval_metric='Logloss',
    max_depth = 9,
    colsample_bylevel =  0.56662,
    random_strength = 5.5036,
    random_state = 42,
    bootstrap_type = 'Bayesian',
    bagging_temperature= 4.82883,
    auto_class_weights = 'Balanced'
    )

catboost_model.fit(X_train,y_train,eval_set = [(X_test, y_test)],cat_features=categorical_features, use_best_model=True, early_stopping_rounds=50)

0:	learn: 0.6839483	test: 0.6842364	best: 0.6842364 (0)	total: 80.5ms	remaining: 1m 20s
1:	learn: 0.6751177	test: 0.6756032	best: 0.6756032 (1)	total: 104ms	remaining: 52s
2:	learn: 0.6673512	test: 0.6680688	best: 0.6680688 (2)	total: 128ms	remaining: 42.5s
3:	learn: 0.6593070	test: 0.6602389	best: 0.6602389 (3)	total: 151ms	remaining: 37.6s
4:	learn: 0.6521301	test: 0.6532659	best: 0.6532659 (4)	total: 174ms	remaining: 34.6s
5:	learn: 0.6452184	test: 0.6465314	best: 0.6465314 (5)	total: 197ms	remaining: 32.6s
6:	learn: 0.6385963	test: 0.6400760	best: 0.6400760 (6)	total: 220ms	remaining: 31.3s
7:	learn: 0.6324013	test: 0.6340582	best: 0.6340582 (7)	total: 243ms	remaining: 30.2s
8:	learn: 0.6264905	test: 0.6283297	best: 0.6283297 (8)	total: 268ms	remaining: 29.5s
9:	learn: 0.6206612	test: 0.6226524	best: 0.6226524 (9)	total: 293ms	remaining: 29s
10:	learn: 0.6151982	test: 0.6173404	best: 0.6173404 (10)	total: 318ms	remaining: 28.6s
11:	learn: 0.6101541	test: 0.6124269	best: 0.6124269 (

<catboost.core.CatBoostClassifier at 0x7968988a8430>

In [51]:
y_pred_test = catboost_model.predict(X_test)
y_pred_train = catboost_model.predict(X_train)
y_pred_val = catboost_model.predict(X_val)
y_pred_rem = catboost_model.predict(X_remaining)

### TRAIN

In [32]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[ 5501  7055]
 [   86 12474]]


In [33]:
# Evaluating the model
accuracy = accuracy_score(y_train, y_pred_train)
print(f"Accuracy on trn set: {accuracy}")

Accuracy on trn set: 0.7156792482879439


In [34]:
# Evaluating the model
f1 = f1_score(y_train, y_pred_train)
print(f"f1 on train set: {f1}")

f1 on train set: 0.7774626819159214


In [35]:
# Evaluating the model
f1 = f1_score(y_train, y_pred_train,average='weighted')
print(f"f1 on train set: {f1}")

f1 on train set: 0.6919472993845996


In [36]:
matthews_corrcoef(y_train, y_pred_train)

0.518490120229725

### TEST

In [37]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[1137 1561]
 [  33 2651]]


In [38]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.7038275733927908


In [39]:
# Evaluating the model
f1 = f1_score(y_test, y_pred_test)
print(f"f1 on test set: {f1}")

f1 on test set: 0.7688515081206496


In [40]:
# Evaluating the model
f1 = f1_score(y_test, y_pred_test,average='weighted')
print(f"f1 on test set: {f1}")

f1 on test set: 0.6781407656719614


In [42]:
matthews_corrcoef(y_test, y_pred_test)

0.4959461527897328

### Validation

In [43]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_val, y_pred_val)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[1201 1485]
 [  34 2662]]


In [44]:
# Evaluating the model
accuracy = accuracy_score(y_val, y_pred_val)
print(f"Accuracy on val set: {accuracy}")

Accuracy on val set: 0.7177629134150874


In [45]:
# Evaluating the model
f1 = f1_score(y_val, y_pred_val)
print(f"f1 on val set: {f1}")

f1 on val set: 0.7780213356714891


In [52]:
# Evaluating the model
f1 = f1_score(y_val, y_pred_val,average='weighted')
print(f"f1 on val set: {f1}")

f1 on val set: 0.6954637625112263


In [53]:
matthews_corrcoef(y_val, y_pred_val)

0.5166830673634507

### Remaining train

In [54]:
confusion_matrix(y_remaining, y_pred_rem)

array([[131178, 168055],
       [     0,      0]])

In [55]:
accuracy_score(y_remaining, y_pred_rem)

0.4383807935622074

In [56]:
f1_score(y_remaining, y_pred_rem,average='weighted')

0.6095476184391199

In [57]:
f1_score(y_remaining, y_pred_rem)

0.0

In [58]:
matthews_corrcoef(y_remaining, y_pred_rem)

0.0

In [104]:
# # Predict probabilities for the positive class
# y_prob_test = catboost_model.predict_proba(X_test)[:, 1]

In [105]:
# # Calculate F1 scores for different threshold values
# threshold_values = np.arange(0, 1.0, 0.001)
# f1_scores = []
# for threshold in threshold_values:
#     y_pred_threshold = (y_prob_test > threshold).astype(int)
#     f1_scores.append(f1_score(y_test, y_pred_threshold))

# # Find the threshold value with the highest F1 score
# best_threshold = threshold_values[np.argmax(f1_scores)]
# best_f1_score = np.max(f1_scores)

# print("Best Threshold:", best_threshold)
# print("Best F1 Score:", best_f1_score)

Best Threshold: 0.51
Best F1 Score: 0.7727534562211982


In [81]:
def objective(trial):
    params = {
        "iterations":500,
        #"logging_level":'silent',
        #"verbose":0,
        "learning_rate": 0.01,
        "max_depth": trial.suggest_int("max_depth", 3,10),
        # "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20,100),
        "random_state": 42,
        # "subsample": trial.suggest_uniform('subsample', 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float('colsample_bylevel', 0.3, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.1, 5.0),
        "bootstrap_type": 'Bayesian',
        "eval_metric":'Logloss',
        "auto_class_weights":'Balanced'
    }

    model = catboost.CatBoostClassifier(**params, silent=True)
    model.fit(X_train, y_train)
    # model.fit(pool)
    y_pred_test = model.predict(X_test)
    f1 = f1_score(y_test, y_pred_test)
    # y_pred_val = model.predict(X_val)
    # f1 = f1_score(y_val, y_pred_val)
    return f1

sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=200)
print('Best hyperparameters:', study.best_params)
print('Best F1:', study.best_value)

[I 2024-04-15 12:29:56,326] A new study created in memory with name: no-name-ba8644ea-639e-4b54-84f5-c65b9e1a57a1
[I 2024-04-15 12:30:03,236] Trial 0 finished with value: 0.7724852496762125 and parameters: {'max_depth': 5, 'colsample_bylevel': 0.9655000144869412, 'random_strength': 7.587945476302646, 'bagging_temperature': 3.0334265725654794}. Best is trial 0 with value: 0.7724852496762125.
[I 2024-04-15 12:30:09,857] Trial 1 finished with value: 0.7724852496762125 and parameters: {'max_depth': 4, 'colsample_bylevel': 0.40919616423534183, 'random_strength': 1.5227525095137953, 'bagging_temperature': 4.344263114297182}. Best is trial 0 with value: 0.7724852496762125.
[I 2024-04-15 12:30:19,886] Trial 2 finished with value: 0.77264218862491 and parameters: {'max_depth': 7, 'colsample_bylevel': 0.7956508044572318, 'random_strength': 1.185260448662222, 'bagging_temperature': 4.852558275593772}. Best is trial 2 with value: 0.77264218862491.
[I 2024-04-15 12:30:28,618] Trial 3 finished with 

Best hyperparameters: {'max_depth': 9, 'colsample_bylevel': 0.5666214856186601, 'random_strength': 5.50365970829216, 'bagging_temperature': 4.8288339251486985}
Best F1: 0.7727534562211982
