In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv("features/train_fourth_dataset.csv")

In [3]:
df = df.drop("PassengerId", axis=1)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(df.drop("Transported", axis=1), df['Transported'].astype(int))

In [5]:
def estimate_model(my_model):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.2f}".format(rmse))
    print("R2: {:.2f}".format(r2))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

In [6]:
model1 = LGBMClassifier(n_estimators=600)

In [7]:
model1.fit(x_train, y_train)

In [8]:
estimate_model(model1)

Testing performance
RMSE: 0.46
R2: 0.15
Score: 0.7879
Local Score: 0.9870
Best params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 600, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}


#### Добавим GridSearchCV

In [11]:
model2 = LGBMClassifier()

In [10]:
param_grid = {
    'learning_rate': [0.02, 0.01],
    'n_estimators': [300],
    "max_depth": [20, 19],
    'num_leaves': [33, 34, 32],
   'subsample' : [0.5, 0.4 ],
    #"device": ["gpu"]
}

In [None]:
grid_search = GridSearchCV(model2, param_grid, cv=5)
grid_search.fit(x_train, y_train)
model2 = grid_search.best_estimator_

In [26]:
estimate_model(model2)

Testing performance
RMSE: 0.43
R2: 0.25
Score: 0.8123
Local Score: 0.8711
Best params:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.02, 'max_depth': 20, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 300, 'n_jobs': -1, 'num_leaves': 33, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 0.5, 'subsample_for_bin': 200000, 'subsample_freq': 0}


In [15]:
import pickle

In [20]:
with open("models/train3_lightbv_123", "wb") as file:
    pickle.dump(model2, file)

In [28]:
test = pd.read_csv("features/test_third_uncomplite.csv")

In [29]:
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_deck,Cabin_num,Cabin_side,TotalServices,Sex
0,0013_01,2,1,1,27.00,0,0.0,0.0,0.0,0.0,0.0,7,3,2,0.0,0
1,0018_01,2,0,1,19.00,0,0.0,9.0,0.0,2823.0,0.0,6,4,2,2832.0,1
2,0019_01,1,1,3,31.00,0,0.0,0.0,0.0,0.0,0.0,3,0,2,0.0,1
3,0021_01,1,0,1,38.00,0,0.0,6652.0,0.0,181.0,585.0,3,1,2,6833.0,1
4,0023_01,2,0,1,20.00,0,10.0,0.0,635.0,0.0,0.0,6,5,2,645.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,2,1,1,34.00,0,0.0,0.0,0.0,0.0,0.0,7,1496,2,0.0,1
4273,9269_01,2,0,1,42.00,0,0.0,847.0,17.0,10.0,144.0,6,427,2,874.0,1
4274,9271_01,3,1,3,28.82,0,0.0,0.0,0.0,0.0,0.0,4,296,1,0.0,1
4275,9273_01,1,0,1,28.82,0,0.0,2680.0,0.0,0.0,523.0,4,297,1,2680.0,-1


In [39]:
test[['PassengerId', "Transported"]]

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [34]:
pred = model2.predict(test.drop(["PassengerId"], axis=1))

In [38]:
test['Transported'] = pred.astype(bool)

In [33]:
pd.DataFrame(pred[:,1]).to_csv("results/proba-train_3-lightgb.csv", index=False)

In [61]:
test["Transported"] = pred.astype(bool)

In [78]:
test = pd.read_csv("features/test_second.csv")
pred_proba = model2.predict_proba(test.drop(["PassengerId"], axis=1))[:,1]
test["Transported"] = pred_proba

In [40]:
test[['PassengerId', "Transported"]].to_csv("results/train_3_ligtg.csv", index=False)

In [94]:
df1 = pd.read_csv("results/lightbmbset6.csv")
df2 = pd.read_csv("results/lightbb7.csv")