In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from catboost import CatBoostClassifier
import os

base_dir = "./inputs"
files = os.listdir(base_dir)
files

['bank-full.csv', 'sample_submission.csv', 'test.csv', 'train.csv']

In [30]:
train_df = pd.read_csv(os.path.join(base_dir, files[3])).set_index("id")
target = train_df["y"]
train_df.drop("y", axis=1, inplace=True)
orig_df = pd.read_csv(os.path.join(base_dir, files[0]))
target_orig = orig_df["y"]
orig_df.drop("y", axis=1, inplace=True)

In [31]:
test_df = pd.read_csv(os.path.join(base_dir, files[2])).set_index("id")

In [32]:
categories = train_df.select_dtypes(include=["object"]).columns
non_categories = train_df.select_dtypes(exclude=["object"]).columns

In [33]:
# Encode Categorical data
le = LabelEncoder()
for column in categories:
    train_df[column] = le.fit_transform(train_df[column])
    orig_df[column] = le.fit_transform(orig_df[column])
    test_df[column] = le.fit_transform(test_df[column])
target_orig = le.fit_transform(target_orig)

In [34]:
# Scale non_categorical columns
sdt = StandardScaler()
# train_df[non_categories] = sdt.fit_transform(train_df[non_categories])
# orig_df[non_categories] = sdt.fit_transform(orig_df[non_categories])
# test_df[non_categories] = sdt.fit_transform(test_df[non_categories])

train_df = sdt.fit_transform(train_df)
orig_df = sdt.fit_transform(orig_df)
test_df = sdt.fit_transform(test_df)

In [35]:
X_train = train_df
X_test = test_df
y_train = target

In [36]:
# from imblearn.over_sampling import SMOTE
#
# sm = SMOTE(random_state=42)
# X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

The SMOTE cell above is commented out due to model performing even worse after being fitted onto the SMOTE dataset.

In [37]:
model = CatBoostClassifier()
model.fit(X_train, y_train, init_model="./saved_models/model_actual.bin")
model.save_model("./saved_models/model_actual.bin")

Learning rate set to 0.174014
0:	learn: 0.4481370	total: 57.8ms	remaining: 57.8s
1:	learn: 0.3256750	total: 173ms	remaining: 1m 26s
2:	learn: 0.2704316	total: 306ms	remaining: 1m 41s
3:	learn: 0.2402841	total: 366ms	remaining: 1m 31s
4:	learn: 0.2246100	total: 439ms	remaining: 1m 27s
5:	learn: 0.2148102	total: 507ms	remaining: 1m 23s
6:	learn: 0.2058376	total: 590ms	remaining: 1m 23s
7:	learn: 0.1997005	total: 663ms	remaining: 1m 22s
8:	learn: 0.1960804	total: 742ms	remaining: 1m 21s
9:	learn: 0.1934924	total: 813ms	remaining: 1m 20s
10:	learn: 0.1910200	total: 894ms	remaining: 1m 20s
11:	learn: 0.1888381	total: 973ms	remaining: 1m 20s
12:	learn: 0.1875792	total: 1.06s	remaining: 1m 20s
13:	learn: 0.1855971	total: 1.14s	remaining: 1m 20s
14:	learn: 0.1838431	total: 1.24s	remaining: 1m 21s
15:	learn: 0.1822943	total: 1.32s	remaining: 1m 21s
16:	learn: 0.1802928	total: 1.39s	remaining: 1m 20s
17:	learn: 0.1789282	total: 1.47s	remaining: 1m 20s
18:	learn: 0.1782040	total: 1.54s	remaining:

In [38]:
predictions = model.predict(X_test)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [39]:
submission_test = pd.read_csv("./inputs/test.csv")
output = pd.DataFrame({'id': submission_test.id, 'y': predictions})
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [40]:
X_train = orig_df
y_train = target_orig

In [41]:
model.fit(X_train, y_train, init_model="./saved_models/model_actual.bin")
model.save_model("./saved_models/model_actual.bin")
predictions = model.predict(X_test)
predictions

Learning rate set to 0.052447
0:	learn: 0.2435365	total: 7.89ms	remaining: 7.89s
1:	learn: 0.2397775	total: 15.3ms	remaining: 7.62s
2:	learn: 0.2362767	total: 22.4ms	remaining: 7.44s
3:	learn: 0.2331021	total: 29.3ms	remaining: 7.3s
4:	learn: 0.2302198	total: 36.4ms	remaining: 7.24s
5:	learn: 0.2276650	total: 42.8ms	remaining: 7.08s
6:	learn: 0.2253065	total: 50.5ms	remaining: 7.17s
7:	learn: 0.2231552	total: 57.6ms	remaining: 7.14s
8:	learn: 0.2212873	total: 65.2ms	remaining: 7.18s
9:	learn: 0.2195604	total: 72.8ms	remaining: 7.21s
10:	learn: 0.2179359	total: 81ms	remaining: 7.29s
11:	learn: 0.2165390	total: 87.7ms	remaining: 7.22s
12:	learn: 0.2153502	total: 94.9ms	remaining: 7.21s
13:	learn: 0.2142447	total: 101ms	remaining: 7.13s
14:	learn: 0.2131322	total: 108ms	remaining: 7.12s
15:	learn: 0.2121716	total: 115ms	remaining: 7.1s
16:	learn: 0.2112239	total: 122ms	remaining: 7.07s
17:	learn: 0.2104042	total: 131ms	remaining: 7.14s
18:	learn: 0.2096762	total: 138ms	remaining: 7.1s
19:

array([0, 0, 0, ..., 0, 0, 0])

In [42]:
submission_test = pd.read_csv("./inputs/test.csv")
output = pd.DataFrame({'id': submission_test.id, 'y': predictions})
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
