In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_squared_error, r2_score

In [74]:
df = pd.read_csv("features/data_first.csv")

In [67]:
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_deck,Cabin_num,Cabin_side,Transported
0,1,0,1,39.0,0,0.0,0.0,0.0,0.0,0.0,2,0,1,0
1,2,0,1,24.0,0,109.0,9.0,25.0,549.0,44.0,6,0,2,1
2,1,0,1,58.0,1,43.0,3576.0,0.0,6715.0,49.0,1,0,2,0
3,1,0,1,33.0,0,0.0,1283.0,371.0,3329.0,193.0,1,0,2,0
4,2,0,1,16.0,0,303.0,70.0,151.0,565.0,2.0,6,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,3,41.0,1,0.0,6819.0,0.0,1643.0,74.0,1,98,1,0
8689,2,1,2,18.0,0,0.0,0.0,0.0,0.0,0.0,7,1499,2,0
8690,2,0,1,26.0,0,0.0,0.0,1872.0,1.0,0.0,7,1500,2,1
8691,1,0,3,32.0,0,0.0,1049.0,0.0,353.0,3235.0,5,608,2,0


In [75]:
x_train, x_test, y_train, y_test = train_test_split(df.drop("Transported", axis=1), df['Transported'])

In [6]:
def estimate_model(my_model):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.2f}".format(rmse))
    print("R2: {:.2f}".format(r2))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

In [24]:
model1 = LGBMClassifier(n_estimators=600)

In [25]:
model1.fit(x_train, y_train)

In [26]:
estimate_model(model1)

Testing performance
RMSE: 0.45
R2: 0.17
Score: 0.7936
Local Score: 0.9927
Best params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 600, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}


#### Добавим GridSearchCV

In [69]:
model2 = LGBMClassifier()

In [70]:
param_grid = {
    'learning_rate': [0.04, 0.05],
    'n_estimators': [300],
    "max_depth": [16],
    'num_leaves': [33, 35],
    'objective' : ['binary'],
    'random_state' : [400],
    'colsample_bytree' : [0.5,0.4],
   'subsample' : [0.5, 0.3, ],
    'reg_alpha' : [1, 0.5],
   'reg_lambda' : [1, 2],
    #"device": ["gpu"]
}

In [76]:
grid_search = GridSearchCV(model2, param_grid, cv=5)
grid_search.fit(x_train, y_train)
model2 = grid_search.best_estimator_

In [77]:
estimate_model(model2)

Testing performance
RMSE: 0.42
R2: 0.28
Score: 0.8196
Local Score: 0.8962
Best params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.5, 'importance_type': 'split', 'learning_rate': 0.04, 'max_depth': 16, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 300, 'n_jobs': -1, 'num_leaves': 33, 'objective': 'binary', 'random_state': 400, 'reg_alpha': 1, 'reg_lambda': 2, 'silent': 'warn', 'subsample': 0.5, 'subsample_for_bin': 200000, 'subsample_freq': 0}


In [15]:
import pickle

In [61]:
with open("models/mymodel2", "wb") as file:
    pickle.dump(model2, file)

In [81]:
test = pd.read_csv("features/test_second.csv")

In [82]:
pred = model2.predict(test.drop(["PassengerId"], axis=1))

In [83]:
test["Transported"] = pred.astype(bool)

In [78]:
test = pd.read_csv("features/test_second.csv")
pred_proba = model2.predict_proba(test.drop(["PassengerId"], axis=1))[:,1]
test["Transported"] = pred_proba

In [84]:
test[['PassengerId', "Transported"]].to_csv("results/lightbb7.csv", index=False)

In [94]:
df1 = pd.read_csv("results/lightbmbset6.csv")
df2 = pd.read_csv("results/lightbb7.csv")