In [279]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [280]:
# Load the training table, keep only the modelling columns, and split it into
# train / validation parts.

# Raw columns that hold categorical values.
cat_cols = [
    'channel_code', 'city', 'city_type',
    'index_city_code', 'ogrn_month', 'ogrn_year',
    'branch_code', 'okved', 'segment'
]

# Columns that are not used as features.
drop_cols = [
    "id", "target_1", "target_2",
    'channel_code', 'city', 'index_city_code',
    'ogrn_month', 'ogrn_year', 'branch_code',
    'ogrn_days_end_month', 'ogrn_days_end_quarter',
    'min_end_fact_fin_deals', 'min_end_plan_non_fin_deals',
    'min_start_fin_deals', 'min_start_non_fin_deals',
    'ft_registration_date',
]

# Only the categoricals that survive the drop need the category dtype;
# casting columns that are discarded right away is wasted work.
kept_cat_cols = [col for col in cat_cols if col not in drop_cols]

# Build the frame without in-place mutation so the cell is a plain
# read -> drop -> cast pipeline.
X_full = pd.read_parquet('data/train.parquet').drop(columns=drop_cols)
X_full = X_full.astype({col: "category" for col in kept_cat_cols})

# Features are everything except the label column.
X = X_full.drop(columns="total_target")
y = X_full["total_target"]

# 70/30 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified although positives are only ~7%
# of the training rows (see the fit log); consider `stratify=y`. Not changed
# here because it would alter the split and the recorded validation score.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=12
)

In [281]:
# LightGBM classifier with fixed hyperparameters; the worker count is passed
# to the constructor directly.
model = LGBMClassifier(
    n_estimators=276,
    colsample_bytree=0.45,
    reg_alpha=10,
    reg_lambda=10,
    subsample_for_bin=200000,
    random_state=12,
    n_jobs=5,
)
model.fit(x_train, y_train)

# Second column of predict_proba = probability of the second entry in
# model.classes_ (the positive class, assuming a 0/1 target).
y_pred = model.predict_proba(x_val)[:, 1]

# Validation ROC AUC; left as the last expression so the notebook displays it.
roc_auc_score(y_val, y_pred)

[LightGBM] [Info] Number of positive: 14818, number of negative: 195182
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055736 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19539
[LightGBM] [Info] Number of data points in the train set: 210000, number of used features: 86
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.070562 -> initscore=-2.578090
[LightGBM] [Info] Start training from score -2.578090


0.8844685030784825

In [274]:
# Load the test table and reduce it to the feature columns used for scoring.
# NOTE(review): the result must end up with the same feature columns as the
# training frame; this cell repeats the train-side column lists by hand.

# Raw columns that hold categorical values.
cat_cols = [
    'channel_code', 'city', 'city_type',
    'index_city_code', 'ogrn_month', 'ogrn_year',
    'branch_code', 'okved', 'segment'
]

# Columns that are not used as features.
test_drop_cols = [
    "id",
    'channel_code', 'city', 'index_city_code',
    'ogrn_month', 'ogrn_year', 'branch_code',
    'ogrn_days_end_month', 'ogrn_days_end_quarter',
    'min_end_fact_fin_deals', 'min_end_plan_non_fin_deals',
    'min_start_fin_deals', 'min_start_non_fin_deals',
    'ft_registration_date',
]

# Only the categoricals that survive the drop need the category dtype;
# casting columns that are discarded right away is wasted work.
test_cat_cols = [col for col in cat_cols if col not in test_drop_cols]

# Build the frame without in-place mutation.
X_test = pd.read_parquet('data/test.parquet').drop(columns=test_drop_cols)
X_test = X_test.astype({col: "category" for col in test_cat_cols})

In [275]:
# Score the test rows with the fitted model and keep the second probability
# column (index 1) as the submission score.
test_proba = model.predict_proba(X_test)
test_score = test_proba[:, 1]

In [276]:
# Read the submission template and preview its first rows.
submission_template_path = "data/sample_submission.csv"
sample_submission_df = pd.read_csv(submission_template_path)
sample_submission_df.head()

Unnamed: 0,id,score
0,300000,0.5
1,300001,0.5
2,300002,0.5
3,300003,0.5
4,300004,0.5


In [277]:
# Replace the placeholder scores with the model's predictions.
# NOTE(review): alignment is purely positional — this assumes the template
# rows are in the same order as the rows of the test table; confirm.
sample_submission_df = sample_submission_df.assign(score=test_score)
sample_submission_df.head()

Unnamed: 0,id,score
0,300000,0.285926
1,300001,0.580351
2,300002,0.009964
3,300003,0.009018
4,300004,0.004328


In [278]:
# Write the filled-in submission to disk without the DataFrame index.
submission_path = "my_submission.csv"
sample_submission_df.to_csv(submission_path, index=False)