# CatBoost - No Encoding 

In [2]:
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Reading Dataset and New Time Features

In [3]:
# Reading file (train_shuffled_with_columns.csv is result of shuffling the original train.csv file)
dataset = r'train_shuffled.csv' 
df = pd.read_csv(dataset,nrows=1000000)

# Rename column 'hour' to 'timestamp'
df.rename(columns={'hour':'timestamp'}, inplace=True)

# Convert 'timestamp' column to datetime format
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%y%m%d%H')

# Sorting rows per 'hour'
df = df.sort_values(by='timestamp')
df.head(2)

Unnamed: 0,id,click,timestamp,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
325876,15348754204581273148,0,2014-10-21,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15705,320,50,1722,0,35,-1,79
876704,8920252239684887372,0,2014-10-21,1005,0,85f751fd,c4e18dd6,50e219e0,e2fcccd2,5c5a694b,...,1,0,4687,320,50,423,2,39,100148,32


In [4]:
# Creating new features

# time related features
df['date'] = df['timestamp'].dt.strftime('%Y-%m-%d') # yyyy-mm-dd
df['day_of_week'] = df['timestamp'].dt.dayofweek # 0: monday... 6: sunday
df['hour'] = df['timestamp'].dt.hour # 0-23

# num_impressions_user_day: Number of impressions a user (device_id) has in a day
df['num_impressions_user_day'] = df.groupby(['date', 'device_id'])['id'].transform('count')

# time_interval_last_visit: Time interval from the last visit
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.sort_values(by=['device_id', 'timestamp'], inplace=True)
df['time_interval_last_visit'] = df.groupby('device_id')['timestamp'].diff().dt.total_seconds().fillna(0)

# num_days_user_appears: Number of days the user appeared
df['num_days_user_appears'] = df.groupby('device_id')['date'].transform('nunique')

# num_previous_clicks: number of clicks the user has done 
df['num_previous_clicks'] = df.groupby('device_id')['click'].cumsum() - df['click']

print('new features created')

new features created


In [5]:
# Dropping columns 'id' and 'timestamp'
if 'id' in df.columns:
    df.drop(columns=['id'], inplace=True)
    print('column id was deleted')

if 'timestamp' in df.columns:
    df.drop(columns=['timestamp'], inplace=True)
    print('column timestamp was deleted')

column id was deleted
column timestamp was deleted


# Adressing imbalance problem: Downsampling majority class = 0

In [11]:
# Separate majority and minority classes
df_majority = df[df['click'] == 0]
df_minority = df[df['click'] == 1]

# Downsample majority class
df_majority_downsampled = df_majority.sample(n=len(df_minority), random_state=42)

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Shuffle the DataFrame
df = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Display new class counts
print(df['click'].value_counts())

click
1    169554
0    169554
Name: count, dtype: int64


In [34]:
# Summary of Number of unique values and data type per column
summary = pd.DataFrame(df.nunique(), columns=['num_unique'])
summary['dtype'] = df.dtypes
summary

Unnamed: 0,num_unique,dtype
click,2,int64
C1,7,int64
banner_pos,7,int64
site_id,2065,object
site_domain,2060,object
site_category,20,object
app_id,2089,object
app_domain,146,object
app_category,22,object
device_id,52611,object


In [35]:
df.shape

(339108, 29)

In [36]:
# Convert specified columns to string
df[['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 
    'device_id', 'device_ip', 'device_model', 'date']] = df[['site_id', 'site_domain', 
    'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 
    'device_model', 'date']].astype(str)

In [37]:
# Summary of Number of unique values and data type per column
summary = pd.DataFrame(df.nunique(), columns=['num_unique'])
summary['dtype'] = df.dtypes
summary

Unnamed: 0,num_unique,dtype
click,2,int64
C1,7,int64
banner_pos,7,int64
site_id,2065,object
site_domain,2060,object
site_category,20,object
app_id,2089,object
app_domain,146,object
app_category,22,object
device_id,52611,object


In [38]:
# Separate features and target
X = df.drop(columns=['click'])  # Features
y = df['click']  # Target

In [39]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X , y , test_size=0.2, random_state=42)

# CatBoost

In [41]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, confusion_matrix, f1_score

# Specify categorical features
cat_features = ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
                'device_id', 'device_ip', 'device_model', 'date']

# Initialize CatBoost classifier
catboost_classifier = CatBoostClassifier(iterations=100, random_state=42)

# Train the model
catboost_classifier.fit(X_train, y_train, cat_features=cat_features)

# Predict probabilities
y_prob_catboost = catboost_classifier.predict_proba(X_test)

# Compute log loss
logloss_catboost = log_loss(y_test, y_prob_catboost)
print("Log Loss (CatBoost):", logloss_catboost)

# Confusion Matrix
y_pred_catboost = catboost_classifier.predict(X_test)
cm_catboost = confusion_matrix(y_test, y_pred_catboost)
print("Confusion Matrix (CatBoost):")
print(cm_catboost)

# Compute TPR and FPR
TN_catboost, FP_catboost, FN_catboost, TP_catboost = cm_catboost.ravel()
TPR_catboost = TP_catboost / (TP_catboost + FN_catboost)
FPR_catboost = FP_catboost / (FP_catboost + TN_catboost)
print("True Positive Rate (TPR) (CatBoost):", TPR_catboost)
print("False Positive Rate (FPR) (CatBoost):", FPR_catboost)

# Compute F1-Score
f1_catboost = f1_score(y_test, y_pred_catboost)
print("F1-Score (CatBoost):", f1_catboost)



Learning rate set to 0.5
0:	learn: 0.6334955	total: 845ms	remaining: 1m 23s
1:	learn: 0.6153420	total: 1.35s	remaining: 1m 6s
2:	learn: 0.6102628	total: 1.84s	remaining: 59.7s
3:	learn: 0.6072790	total: 2.3s	remaining: 55.3s
4:	learn: 0.6054784	total: 2.71s	remaining: 51.5s
5:	learn: 0.6044223	total: 3.09s	remaining: 48.4s
6:	learn: 0.6033677	total: 3.44s	remaining: 45.7s
7:	learn: 0.6025899	total: 3.75s	remaining: 43.1s
8:	learn: 0.6019713	total: 4.04s	remaining: 40.9s
9:	learn: 0.6011522	total: 4.34s	remaining: 39.1s
10:	learn: 0.6003079	total: 4.64s	remaining: 37.6s
11:	learn: 0.5999559	total: 4.95s	remaining: 36.3s
12:	learn: 0.5996101	total: 5.26s	remaining: 35.2s
13:	learn: 0.5993105	total: 5.57s	remaining: 34.2s
14:	learn: 0.5992414	total: 5.77s	remaining: 32.7s
15:	learn: 0.5986399	total: 6.1s	remaining: 32s
16:	learn: 0.5980242	total: 6.38s	remaining: 31.2s
17:	learn: 0.5977399	total: 6.7s	remaining: 30.5s
18:	learn: 0.5974903	total: 7s	remaining: 29.8s
19:	learn: 0.5974053	to

# Feature Importance

In [49]:
# Get feature importance
feature_importance = catboost_classifier.feature_importances_

# Create a dictionary to store feature names and their importance scores
feature_importance_dict = {feature_name: importance for feature_name, importance in zip(X_train.columns, feature_importance)}

# Sort the dictionary by importance score in descending order
sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

# Print feature importance in descending order
for feature_name, importance in sorted_feature_importance.items():
    print(f"Feature: {feature_name}, Importance: {importance}")

Feature: site_id, Importance: 29.387234471158703
Feature: app_id, Importance: 20.330855649178005
Feature: device_model, Importance: 7.00264826797345
Feature: C21, Importance: 5.304405046885723
Feature: site_domain, Importance: 5.054924817872717
Feature: C14, Importance: 4.1027102138210125
Feature: C19, Importance: 3.2570251581467216
Feature: num_previous_clicks, Importance: 2.963974711746182
Feature: hour, Importance: 2.3993676743566725
Feature: C17, Importance: 2.193279511836151
Feature: device_ip, Importance: 1.918437463809518
Feature: app_category, Importance: 1.8926985662201163
Feature: C16, Importance: 1.7528474958274973
Feature: C20, Importance: 1.708839462079513
Feature: app_domain, Importance: 1.6057134781517424
Feature: site_category, Importance: 1.460314733350169
Feature: device_id, Importance: 1.138778121079201
Feature: banner_pos, Importance: 1.050763804087056
Feature: time_interval_last_visit, Importance: 1.0446908682951306
Feature: C18, Importance: 1.0426742309407635
Feat