In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import catboost
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score,classification_report,accuracy_score

In [2]:
df=pd.read_csv("/content/data_sample.csv")
df.shape

(3079, 294)

In [3]:
df['target'] = np.where(df['ResponseRate'] == 0, 'B1',
                        np.where((df['ResponseRate'] > 0) & (df['ResponseRate'] < 0.15), 'B2',
                                 np.where(df['ResponseRate'] >= 0.15, 'B3', 'Not Specified')))

In [4]:
df['target'].nunique()

3

In [5]:
df['target'].value_counts()

B1    1635
B3     855
B2     589
Name: target, dtype: int64

In [6]:
df.head()

Unnamed: 0,OfferHistoryID,s_239_7,s_239_8,s_239_9,s_239_10,s_241_11,s_241_12,s_241_13,s_241_14,s_241_15,...,ot_108~RECENT,ot_108~Subject: Auto,ot_108~Subject: Banking,ot_108~Subject: Retail,ot_108~Subject: Travel,ot_108~Subject: Undetermined,ot_108~Tags,ot_108~VOUCHER,ot_108~race,target
0,76,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B3
1,76,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B3
2,76,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B3
3,76,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B3
4,76,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B3


In [9]:
# List the columns to remove
columns_to_remove = ['OfferHistoryID', 'ResponseRate']
# Drop the specified columns from the DataFrame
df.drop(columns=columns_to_remove, inplace=True)

In [10]:
df.shape

(3079, 293)

In [14]:
# Splitting the dataset into X and y
X = df.drop('target', axis=1)  # Features: all columns except 'target'
y = df['target']

In [16]:
# Splitting dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
catboost_model = CatBoostClassifier(
    iterations = 1000,
    learning_rate = 0.05,
    reg_lambda=1.5,
    eval_metric='MultiClass',
    max_depth = 4,
    colsample_bylevel = 0.90,
    random_strength = 1,
    bagging_temperature = 0.6,
    random_state = 42,
    )

catboost_model.fit(X_train, y_train, eval_set = [(X_test, y_test)],verbose = 10)

0:	learn: 1.0196277	test: 1.0194034	best: 1.0194034 (0)	total: 53.4ms	remaining: 53.3s
10:	learn: 0.5827347	test: 0.5810908	best: 0.5810908 (10)	total: 104ms	remaining: 9.39s
20:	learn: 0.3917844	test: 0.3911048	best: 0.3911048 (20)	total: 161ms	remaining: 7.49s
30:	learn: 0.2937509	test: 0.2935095	best: 0.2935095 (30)	total: 213ms	remaining: 6.66s
40:	learn: 0.2398510	test: 0.2408694	best: 0.2408694 (40)	total: 268ms	remaining: 6.26s
50:	learn: 0.2029677	test: 0.2065770	best: 0.2065770 (50)	total: 317ms	remaining: 5.9s
60:	learn: 0.1862732	test: 0.1920811	best: 0.1920811 (60)	total: 366ms	remaining: 5.63s
70:	learn: 0.1740904	test: 0.1804374	best: 0.1804374 (70)	total: 413ms	remaining: 5.4s
80:	learn: 0.1635071	test: 0.1715787	best: 0.1715787 (80)	total: 462ms	remaining: 5.25s
90:	learn: 0.1571147	test: 0.1652500	best: 0.1652500 (90)	total: 511ms	remaining: 5.11s
100:	learn: 0.1515305	test: 0.1609724	best: 0.1609724 (100)	total: 564ms	remaining: 5.02s
110:	learn: 0.1460424	test: 0.156

<catboost.core.CatBoostClassifier at 0x7fd0d08f9b70>

In [18]:
# Evaluating the model
accuracy = catboost_model.score(X_test, y_test)
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.9334415584415584


In [19]:
y_pred_test = catboost_model.predict(X_test)

In [20]:
y_pred_train = catboost_model.predict(X_train)

In [21]:
def evaluate_metrics(df):
    results = {'Label': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}

    for label in df['label'].unique():
        true_values = df['label'] == label
        predicted_values = df['pred'] == label

        accuracy = accuracy_score(true_values, predicted_values)
        report = classification_report(true_values, predicted_values, output_dict=True)['True']


        results['Label'].append(label)
        results['Accuracy'].append(accuracy)
        results['Precision'].append(report['precision'])
        results['Recall'].append(report['recall'])
        results['F1 Score'].append(report['f1-score'])
#         results = results.sort_values(['Label'].astype(int))

    # Overall metrics
    overall_accuracy = accuracy_score(df['label'], df['pred'])
    overall_report = classification_report(df['label'], df['pred'], output_dict=True)['weighted avg']

    results['Label'].append(999)
    results['Accuracy'].append(overall_accuracy)
    results['Precision'].append(overall_report['precision'])
    results['Recall'].append(overall_report['recall'])
    results['F1 Score'].append(overall_report['f1-score'])

    kk = pd.DataFrame(results)
#     kk['Label'] = kk['Label'].astype(int)
#     kk = kk.sort_values(['Label'])
    kk['Label'] = kk['Label'].replace({999:'Overall'})
    return kk

**TEST**

In [25]:
# Combine the two lists into a DataFrame
data = list(zip(list(y_test), y_pred_test.flatten().tolist()))
df = pd.DataFrame({
    'label': y_test,
    'pred': y_pred_test.flatten()  # This ensures y_pred_test is 1-dimensional
})

In [26]:
evaluate_metrics(df)

Unnamed: 0,Label,Accuracy,Precision,Recall,F1 Score
0,B3,0.933442,0.860104,0.922222,0.89008
1,B1,0.965909,0.974763,0.959627,0.967136
2,B2,0.967532,0.943396,0.877193,0.909091
3,Overall,0.933442,0.935454,0.933442,0.933878


**TRAIN**

In [29]:
# Combine the two lists into a DataFrame
data = list(zip(list(y_train), y_pred_train.flatten().tolist()))
df = pd.DataFrame({
    'label': y_train,
    'pred': y_pred_train.flatten()  # This ensures y_pred_test is 1-dimensional
})

In [30]:
evaluate_metrics(df)

Unnamed: 0,Label,Accuracy,Precision,Recall,F1 Score
0,B1,0.976045,0.98908,0.965727,0.977264
1,B3,0.956963,0.894591,0.955556,0.924069
2,B2,0.980918,0.965217,0.934737,0.949733
3,Overall,0.956963,0.958582,0.956963,0.957376
