# CatBoost Algorithm

CatBoost is a state-of-the-art open-source gardient boosting on decision trees library. It's simple and easy to use. Adn is now regularly one of the top algorithms used in data science competitions as it produces very good results without extensive data clean-up or feature engineering

In [1]:
# !pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     ---------------------------------------- 0.0/47.0 kB ? eta -:--:--
     ---------------- --------------------- 20.5/47.0 kB 682.7 kB/s eta 0:00:01
     ---------------- --------------------- 20.5/47.0 kB 682.7 kB/s eta 0:00:01
     -------------------------------------- 47.0/47.0 kB 337.6 kB/s eta 0:00:00
Downloading catboost-1.2.2-cp311-cp311-win_amd64.whl (101.0 MB)
   ---------------------------------------- 0.0/101.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/101.0 MB 1.1 MB/s eta 0:01:33
   ---------------------------------------- 0.1/101.0 MB 871.5 kB/s eta 0:01:56
   ---------------------------------------- 0.2/101.0 MB 1.7 MB/s eta 0:00:58
   ---------------------------------------- 0.2/101.0 MB 1.7 MB/s eta 0:00:58
   ---------------------------------------- 0.3/101.0 M

In [3]:
# import libararies 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from catboost import CatBoostClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
# data import titanic 

df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## pre-processing

In [5]:
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

In [6]:
# imputer missing values usng KNN imputers in age 
from sklearn.impute import KNNImputer 
imputer = KNNImputer(n_neighbors=5)
df['age'] = imputer.fit_transform(df[['age']])

In [7]:
# impute embarked missiong values using pandas

df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])



In [9]:
# drop deck column
df.drop('deck', axis=1, inplace=True)

In [10]:
# df missing values 
df.isnull().sum().sort_values(ascending=False)

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  891 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB


In [12]:
# convert each category columns to category 

categorical_cols = df.select_dtypes(include=['object', 'category']).columns

In [14]:
# add this as a new column in the dataframe 
df[categorical_cols] =df[categorical_cols].astype('category')

In [16]:
# split data into X and y 
X = df.drop('survived', axis=1)
y = df['survived']



In [17]:
# split data ionto train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# run the catboost classifier
model = CatBoostClassifier(iterations=100,
                           learning_rate=0.1,
                           depth=3,
                           loss_function='Logloss',
                           eval_metric='Accuracy',
                           random_seed=42,
                           verbose=False)

# train the model
model.fit(X_train, y_train, cat_features=categorical_cols.tolist())

# predictions
y_pred = model.predict(X_test)

# evaluate the model
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}')
print(f'Classification Report: \n {classification_report(y_test, y_pred)}')

Accuracy Score: 1.0
Confusion Matrix: 
 [[105   0]
 [  0  74]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       105
           1       1.00      1.00      1.00        74

    accuracy                           1.00       179
   macro avg       1.00      1.00      1.00       179
weighted avg       1.00      1.00      1.00       179



__________________