# Install CatBoost Library

In [1]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


# Importing the libraries

In [49]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import  f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

# Importing the dataset

In [62]:
# Read the raw data from the CSV file located next to the notebook.
dataset = pd.read_csv(filepath_or_buffer="Data.csv")

# Taking a look at the dataset

In [63]:
# Preview the first five rows to sanity-check the columns and their values.
dataset.head(5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [64]:
# Show the dtype pandas inferred for each column.
dataset.dtypes

Sample code number             int64
Clump Thickness                int64
Uniformity of Cell Size        int64
Uniformity of Cell Shape       int64
Marginal Adhesion              int64
Single Epithelial Cell Size    int64
Bare Nuclei                    int64
Bland Chromatin                int64
Normal Nucleoli                int64
Mitoses                        int64
Class                          int64
dtype: object

In [65]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(683, 11)

# Take a quick look at the Target column

In [66]:
# Count how many rows fall into each target class to gauge class balance.
dataset.loc[:, 'Class'].value_counts()

2    444
4    239
Name: Class, dtype: int64

# Checking for missing data

In [67]:
# Per-column count of missing values, keeping only the columns that have any.
# An empty Series means no column contains missing values.
dataset.isna().sum().loc[lambda null_counts: null_counts > 0]

Series([], dtype: int64)

# Encoding categorical data

Label Encoding


In [68]:
# Replace the target column with integer codes produced by LabelEncoder;
# every other column is carried over unchanged and column order is preserved.
dataset = dataset.assign(Class=LabelEncoder().fit_transform(dataset['Class']))

In [69]:
# Confirm which distinct codes the target column holds after encoding.
pd.unique(dataset['Class'])

array([0, 1])

# Create X for features and y for target

In [70]:
# Features: every column except the first (the sample identifier) and the last.
X = dataset.iloc[:, 1:-1].to_numpy()
# Target: the last column.
y = dataset.iloc[:, -1].to_numpy()

# Splitting the dataset into the Training set and Test set

In [71]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Feature Scaling

In [72]:
# StandardScaler is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
sc = StandardScaler()
# Fit the scaler on the training split only, then reuse the fitted scaler on
# the test split so no test-set information leaks into preprocessing.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Building model

In [75]:
def _score_row(model_name, y_true, y_pred):
    """Build one report row holding accuracy and F1, both as percentages.

    scikit-learn metrics are declared as ``metric(y_true, y_pred)``, so the
    ground-truth labels are passed first. Keeping that order means the helper
    stays correct if an asymmetric metric is added later.
    """
    return {
        'Model Name': model_name,
        # Key kept verbatim (including its trailing space).
        'Accuracy ': accuracy_score(y_true, y_pred)*100,
        'F1_score': f1_score(y_true, y_pred)*100 }


# One row per model, scored on the held-out test split.
results = []

# XGBoost :
classifier_xgb = XGBClassifier()
classifier_xgb.fit(X_train, y_train)
y_pred_xgb = classifier_xgb.predict(X_test)
results.append(_score_row('XGBoost', y_test, y_pred_xgb))

# CatBoost :
# verbose=False keeps the training log out of the cell output.
classifier_cb = CatBoostClassifier(verbose=False)
classifier_cb.fit(X_train, y_train)
y_pred_cb = classifier_cb.predict(X_test)
results.append(_score_row('CatBoost', y_test, y_pred_cb))


In [76]:
# Turn the collected score rows into a table and shade each numeric column so
# the higher values stand out.
df_report = pd.DataFrame(data=results)
styled_df = df_report.style.background_gradient(cmap='Blues')
styled_df

Unnamed: 0,Model Name,Accuracy,F1_score
0,XGBoost,97.080292,96.078431
1,CatBoost,97.810219,97.087379


## Applying k-Fold Cross Validation

In [77]:
# 10-fold cross-validation of both classifiers on the training split only.
accuracies_catboost = cross_val_score(estimator = classifier_cb, X = X_train, y = y_train, cv = 10)
accuracies_xgboost = cross_val_score(estimator = classifier_xgb, X = X_train, y = y_train, cv = 10)

# Stored under its own name: `results` already holds the list of hold-out score
# rows from the model-building cell, and rebinding it to a dict here would make
# re-running the hold-out report cell produce a different table.
cv_results = {
    'Model': ['CatBoost', 'XgBoost'],
    'Accuracy': [accuracies_catboost.mean()*100, accuracies_xgboost.mean()*100],
    'Standard Deviation': [accuracies_catboost.std()*100, accuracies_xgboost.std()*100]
}

df_report2 = pd.DataFrame(cv_results)
styled_df = df_report2.style.background_gradient(cmap='Blues')
styled_df

Unnamed: 0,Model,Accuracy,Standard Deviation
0,CatBoost,96.525253,2.501246
1,XgBoost,96.888889,2.169991


# Applying Grid Search to XGBoost to find the best parameters

In [81]:
# Search space: 5 depths x 4 learning rates x 5 ensemble sizes = 100 candidates,
# each evaluated with 10-fold cross-validation (1000 fits in total).
parameters = [
    {
        'max_depth': [3, 5, 7, 9, 11],
        'learning_rate': [0.1, 0.01, 0.001, 0.0001],
        'n_estimators': [100, 500, 1000, 1500, 2000],
    }
]

grid_search = GridSearchCV(
    estimator=classifier_xgb,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameter combination achieving it.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best_Parameters: ", best_parameters)

Best Accuracy: 97.07 %
Best_Parameters:  {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 1500}


# Making a single prediction

In [105]:
# The models were fitted on features standardised by `sc`, so a raw sample must
# go through the same fitted scaler before prediction. These nine values match
# the second row shown by dataset.head(), whose recorded Class is 2 (encoded 0).
predict = grid_search.predict(sc.transform([[5,4,4,5,7,10,3,2,1]]))
# `predict` is a one-element array; compare its single entry rather than
# relying on the truth value of the whole array.
if predict[0] == 0 :
  print('The tumor is benign')
else:
  print("The tumor is malignant")

The tumor is malignant
