In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from catboost import CatBoostClassifier
import os

base_dir = "./inputs"
files = os.listdir(base_dir)
files

['bank-full.csv', 'sample_submission.csv', 'test.csv', 'train.csv']

In [24]:
train_df = pd.read_csv(os.path.join(base_dir, files[3])).set_index("id")
target = train_df["y"]
train_df.drop("y", axis=1, inplace=True)
orig_df = pd.read_csv(os.path.join(base_dir, files[0]))
actual = orig_df["y"]
orig_df.drop("y", axis=1, inplace=True)

In [25]:
categories = train_df.select_dtypes(include=["object"]).columns
non_categories = train_df.select_dtypes(exclude=["object"]).columns

In [31]:
# Encode Categorical data
le = LabelEncoder()
for column in categories:
    train_df[column] = le.fit_transform(train_df[column])
    orig_df[column] = le.fit_transform(orig_df[column])
actual = le.fit_transform(actual)

In [27]:
# Scale non_categorical columns
sdt = StandardScaler()
train_df[non_categories] = sdt.fit_transform(train_df[non_categories])
orig_df[non_categories] = sdt.fit_transform(orig_df[non_categories])

In [32]:
X_train = train_df
X_test = orig_df
y_train = target
y_test = actual

In [None]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

Learning rate set to 0.174014
0:	learn: 0.4481370	total: 117ms	remaining: 1m 57s
1:	learn: 0.3256750	total: 268ms	remaining: 2m 13s
2:	learn: 0.2704316	total: 446ms	remaining: 2m 28s
3:	learn: 0.2402841	total: 686ms	remaining: 2m 50s
4:	learn: 0.2246100	total: 949ms	remaining: 3m 8s
5:	learn: 0.2148102	total: 1.07s	remaining: 2m 58s
6:	learn: 0.2058376	total: 1.2s	remaining: 2m 49s
7:	learn: 0.1997005	total: 1.34s	remaining: 2m 46s
8:	learn: 0.1960804	total: 1.48s	remaining: 2m 43s
9:	learn: 0.1934924	total: 1.58s	remaining: 2m 36s
10:	learn: 0.1910200	total: 1.7s	remaining: 2m 32s
11:	learn: 0.1888381	total: 1.83s	remaining: 2m 30s
12:	learn: 0.1875792	total: 1.96s	remaining: 2m 29s
13:	learn: 0.1855971	total: 2.07s	remaining: 2m 25s
14:	learn: 0.1838431	total: 2.17s	remaining: 2m 22s
15:	learn: 0.1822943	total: 2.3s	remaining: 2m 21s
16:	learn: 0.1802928	total: 2.43s	remaining: 2m 20s
17:	learn: 0.1789282	total: 2.54s	remaining: 2m 18s
18:	learn: 0.1782040	total: 2.67s	remaining: 2m 

In [33]:
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(accuracy)
print(report)
print(matrix)

0.8978345977748778
              precision    recall  f1-score   support

           0       0.95      0.94      0.94     39922
           1       0.56      0.59      0.58      5289

    accuracy                           0.90     45211
   macro avg       0.75      0.76      0.76     45211
weighted avg       0.90      0.90      0.90     45211

[[37464  2458]
 [ 2161  3128]]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)

In [None]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(accuracy)
print(report)
print(matrix)