In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from catboost import CatBoostClassifier
from sklearn.metrics import mean_squared_error, r2_score

In [27]:
df = pd.read_csv("features/train_foruth_nonull.csv")

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    8693 non-null   object 
 1   HomePlanet     8693 non-null   int64  
 2   CryoSleep      8693 non-null   int64  
 3   Destination    8693 non-null   int64  
 4   Age            8693 non-null   float64
 5   VIP            8693 non-null   int64  
 6   RoomService    8693 non-null   float64
 7   FoodCourt      8693 non-null   float64
 8   ShoppingMall   8693 non-null   float64
 9   Spa            8693 non-null   float64
 10  VRDeck         8693 non-null   float64
 11  Transported    8693 non-null   int64  
 12  Cabin_deck     8693 non-null   int64  
 13  Cabin_num      8693 non-null   int64  
 14  Cabin_side     8693 non-null   int64  
 15  Sex            8693 non-null   int64  
 16  TotalServices  8693 non-null   float64
 17  MinServices    8693 non-null   float64
 18  MeanServ

In [29]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(["Transported", "PassengerId"], axis=1), df['Transported'].astype(int))

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side
2073,0,1,1,29.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0
3033,0,0,2,43.0,0,0.0,16.0,34.0,753.0,1.0,0,1,0
4603,0,1,2,46.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0
192,0,0,2,21.0,0,367.0,281.0,0.0,0.0,146.0,0,1,0
5528,0,1,2,27.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,0,1,1,3.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0
2403,0,0,2,28.0,0,4.0,166.0,743.0,0.0,0.0,0,1,0
6315,1,2,0,15.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0
6377,2,0,0,16.0,0,1337.0,0.0,2614.0,0.0,0.0,0,1,0


In [30]:
def estimate_model(my_model):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.2f}".format(rmse))
    print("R2: {:.2f}".format(r2))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

In [108]:
model1 = CatBoostClassifier(iterations=800)

In [109]:
model1.fit(x_train, y_train)

Learning rate set to 0.028149
0:	learn: 0.6830721	total: 3.93ms	remaining: 3.14s
1:	learn: 0.6704169	total: 7.66ms	remaining: 3.06s
2:	learn: 0.6587241	total: 11.4ms	remaining: 3.04s
3:	learn: 0.6467988	total: 14.9ms	remaining: 2.96s
4:	learn: 0.6363777	total: 18.9ms	remaining: 3s
5:	learn: 0.6254872	total: 22.3ms	remaining: 2.95s
6:	learn: 0.6172637	total: 25.7ms	remaining: 2.91s
7:	learn: 0.6097639	total: 29.2ms	remaining: 2.89s
8:	learn: 0.6025317	total: 32.6ms	remaining: 2.87s
9:	learn: 0.5956772	total: 36.3ms	remaining: 2.87s
10:	learn: 0.5881937	total: 40.1ms	remaining: 2.87s
11:	learn: 0.5829793	total: 43.9ms	remaining: 2.88s
12:	learn: 0.5766323	total: 47.6ms	remaining: 2.88s
13:	learn: 0.5711315	total: 51ms	remaining: 2.86s
14:	learn: 0.5656004	total: 54.5ms	remaining: 2.85s
15:	learn: 0.5600745	total: 58.4ms	remaining: 2.86s
16:	learn: 0.5536350	total: 61.9ms	remaining: 2.85s
17:	learn: 0.5493212	total: 65.5ms	remaining: 2.85s
18:	learn: 0.5444957	total: 69.1ms	remaining: 2.8

<catboost.core.CatBoostClassifier at 0x24c030c4100>

In [110]:
estimate_model(model1)

Testing performance
RMSE: 0.43
R2: 0.25
Score: 0.8114
Local Score: 0.8859
Best params:  {'iterations': 800}


In [34]:
model2 = CatBoostClassifier()

In [61]:
param_grid = {
    'learning_rate': [0.1, 0.2],
    'depth': [5],
    'iterations': [600],
    'l2_leaf_reg': [5,  3],
}

In [62]:
grid_search = GridSearchCV(estimator=model2, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)
model2 = grid_search.best_estimator_

0:	learn: 0.6592336	total: 3.01ms	remaining: 1.8s
1:	learn: 0.6225544	total: 6.08ms	remaining: 1.82s
2:	learn: 0.6023959	total: 8.98ms	remaining: 1.79s
3:	learn: 0.5828712	total: 11.9ms	remaining: 1.77s
4:	learn: 0.5614452	total: 14.8ms	remaining: 1.76s
5:	learn: 0.5472389	total: 17.7ms	remaining: 1.75s
6:	learn: 0.5307495	total: 20.5ms	remaining: 1.74s
7:	learn: 0.5200519	total: 23.4ms	remaining: 1.73s
8:	learn: 0.5106823	total: 26.3ms	remaining: 1.72s
9:	learn: 0.5036023	total: 29.1ms	remaining: 1.72s
10:	learn: 0.4991271	total: 31.9ms	remaining: 1.71s
11:	learn: 0.4892202	total: 35.1ms	remaining: 1.72s
12:	learn: 0.4844268	total: 38.2ms	remaining: 1.72s
13:	learn: 0.4790489	total: 41.2ms	remaining: 1.72s
14:	learn: 0.4735779	total: 44.1ms	remaining: 1.72s
15:	learn: 0.4691887	total: 47.1ms	remaining: 1.72s
16:	learn: 0.4660572	total: 50.5ms	remaining: 1.73s
17:	learn: 0.4625910	total: 53.3ms	remaining: 1.72s
18:	learn: 0.4580330	total: 56ms	remaining: 1.71s
19:	learn: 0.4557506	tota

In [63]:
estimate_model(model2)

Testing performance
RMSE: 0.44
R2: 0.22
Score: 0.8059
Local Score: 0.9130
Best params:  {'iterations': 600, 'learning_rate': 0.1, 'depth': 5, 'l2_leaf_reg': 5}


In [104]:
test = pd.read_csv("features/test_fourth_dataset.csv")

In [105]:
pred = model1.predict(test.drop(["PassengerId"], axis=1))

In [45]:
pd.DataFrame(pred[:,1]).to_csv("results/proba-GPTcatboost_1.csv", index=False)

In [106]:
test["Transported"] = pred.astype(bool)

In [107]:
test[['PassengerId', "Transported"]].to_csv("results/train_4nonull_catboostmodel1_3.csv", index=False)

In [46]:
test = pd.read_csv("features/test_second.csv")
pred_proba = model2.predict_proba(test.drop(["PassengerId"], axis=1))[:,1]
test["Transported"] = pred_proba

In [25]:
test.to_csv("results/train_4_catboost")

In [82]:
import pickle
with open("models/model_train4_catboostmodel1", "wb") as file:
    pickle.dump(model1, file)