In [12]:
# ! pip install pandas
# ! pip install numpy
# ! pip install catboost
# ! pip install seaborn
# ! pip install scikit-learn

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import catboost
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score

In [2]:
import os
os.getcwd()

'/azusers/work/UNICA/Catboost'

In [3]:
df= pd.read_csv("/azusers/work/UNICA/Preprocessed_df.csv")
df.shape

(335113, 538)

### We have to make our dataset for binary classification

In [4]:
df['Response'].value_counts()

Response
0.00    317173
0.05     17138
0.20       738
1.05        64
Name: count, dtype: int64

In [5]:
df['target'] = np.where(df['Response'] == 0,0,1)

In [6]:
df.shape

(335113, 539)

In [7]:
df['target'].value_counts(normalize=True)

target
0    0.946466
1    0.053534
Name: proportion, dtype: float64

### Removing the unwanted coloums

In [8]:
# List the columns to remove
columns_to_remove = ['indiv_id', 'Response']

# Drop the specified columns from the DataFrame
df.drop(columns=columns_to_remove, inplace=True)

In [9]:
df.shape

(335113, 537)

### Splitting the dataset into X and y

In [10]:
# Splitting the dataset into X and y
X = df.drop('target', axis=1)  # Features: all columns except 'target'
y = df['target']

In [11]:
# Splitting dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training

In [12]:
catboost_model = CatBoostClassifier(
    iterations = 1000,
    learning_rate = 0.01,
    max_depth= 5,
    colsample_bylevel= 0.9655000144869412,
    random_strength= 7.587945476302646, 
    bagging_temperature= 3.073560724145776,
    bootstrap_type ='Bayesian',
    eval_metric ='Logloss',
    random_state = 42
    )

catboost_model.fit(X_train, y_train, eval_set = [(X_test, y_test)],verbose = 10)

0:	learn: 0.6778849	test: 0.6779358	best: 0.6779358 (0)	total: 130ms	remaining: 2m 10s
10:	learn: 0.5268309	test: 0.5272485	best: 0.5272485 (10)	total: 997ms	remaining: 1m 29s
20:	learn: 0.4300052	test: 0.4305185	best: 0.4305185 (20)	total: 1.76s	remaining: 1m 22s
30:	learn: 0.3375692	test: 0.3380677	best: 0.3380677 (30)	total: 2.69s	remaining: 1m 24s
40:	learn: 0.2852832	test: 0.2857567	best: 0.2857567 (40)	total: 3.63s	remaining: 1m 24s
50:	learn: 0.2550473	test: 0.2554853	best: 0.2554853 (50)	total: 4.57s	remaining: 1m 25s
60:	learn: 0.2357676	test: 0.2361653	best: 0.2361653 (60)	total: 5.56s	remaining: 1m 25s
70:	learn: 0.2225763	test: 0.2229431	best: 0.2229431 (70)	total: 6.55s	remaining: 1m 25s
80:	learn: 0.2130701	test: 0.2134127	best: 0.2134127 (80)	total: 7.42s	remaining: 1m 24s
90:	learn: 0.2059906	test: 0.2063115	best: 0.2063115 (90)	total: 8.33s	remaining: 1m 23s
100:	learn: 0.2005559	test: 0.2008639	best: 0.2008639 (100)	total: 9.3s	remaining: 1m 22s
110:	learn: 0.1963376	

<catboost.core.CatBoostClassifier at 0x7a84ecbaeb00>

### changing the cutoff

In [21]:
y_pred_test = catboost_model.predict_proba(X_test)[:,1] > 0.085

In [22]:
y_pred_train = catboost_model.predict_proba(X_train)[:,1] > 0.085

In [13]:
# y_pred_test = catboost_model.predict(X_test)
# y_pred_train = catboost_model.predict(X_train)

### TRAIN

In [23]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[137872 115866]
 [  2121  12231]]


In [24]:
# Evaluating the model
accuracy = accuracy_score(y_train, y_pred_train)
print(f"Accuracy on trn set: {accuracy}")

Accuracy on trn set: 0.559897795516431


In [25]:
# Evaluating the model
f1 = f1_score(y_train, y_pred_train)
print(f"f1 on train set: {f1}")

f1 on train set: 0.1717246172314302


### TEST

In [26]:
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[34268 29167]
 [  537  3051]]


In [27]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Accuracy on trn set: {accuracy}")

Accuracy on trn set: 0.5568088566611462


In [28]:
# Evaluating the model
f1 = f1_score(y_test, y_pred_test)
print(f"f1 on train set: {f1}")

f1 on train set: 0.17041836563704407


### JSON FORMAT

In [22]:
catboost_model.save_model(
    "catboost_model.json",
    format="json",
    # pool=pool  # this parameter is required only for models with categorical features.
)

In [19]:
# Predict probabilities for the positive class
y_prob_test = catboost_model.predict_proba(X_test)[:, 1]

In [20]:
# Calculate F1 scores for different threshold values
threshold_values = np.arange(0, 1.0, 0.001)
f1_scores = []
for threshold in threshold_values:
    y_pred_threshold = (y_prob_test > threshold).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_threshold))

# Find the threshold value with the highest F1 score
best_threshold = threshold_values[np.argmax(f1_scores)]
best_f1_score = np.max(f1_scores)

print("Best Threshold:", best_threshold)
print("Best F1 Score:", best_f1_score)

Best Threshold: 0.085
Best F1 Score: 0.17041836563704407
