Import libraries


In [31]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
from sklearn.impute import SimpleImputer

Remove warnings


In [32]:
warnings.filterwarnings('ignore')

load data and define categorical variables

In [33]:
X_full = pd.read_parquet('data/train.parquet')

cat_cols = [
    'channel_code', 'city', 'city_type',
    'index_city_code', 'ogrn_month', 'ogrn_year',
    'branch_code', 'okved', 'segment'
]

X_full[cat_cols] = X_full[cat_cols].astype("category")
X_full.drop(labels=['id', 'target_1', 'target_2'], axis=1, inplace=True)

y will be result column to train and validate model

In [34]:
y = X_full['total_target']

X_full.drop('total_target', axis=1, inplace=True)

Impute data and select k best features using SelectKBest from sklearn then print removed features

In [50]:
my_imputer = SimpleImputer()
X_new = pd.DataFrame(my_imputer.fit_transform(X_full.copy()))

fvalue_selector = SelectKBest(f_classif, k=96)
X_kbest = fvalue_selector.fit_transform(X_new, y)

bad_names = [x for x in X_full.columns if x not in X_full.columns[fvalue_selector.get_support()]]
bad_names

['ogrn_days_end_month', 'ogrn_days_end_quarter', 'cnt_cred_f_oper_3m']

Split data to test and train

In [51]:
x_train, x_val, y_train, y_val = train_test_split(X_kbest, y,
                                                  test_size=0.3,
                                                  random_state=191)

1. Define model with parameters that were selected using random search and cross validation;
2. Fit and predict;
3. Score model by roc_auc_score.

In [52]:
model = LGBMClassifier(n_estimators=229,reg_alpha=10,reg_lambda=10,subsample_for_bin=200000, random_state=12,
                       colsample_bytree=0.45,verbose=-1, n_jobs=5)
model.fit(x_train, y_train)

y_pred = model.predict_proba(x_val)[:, 1]

roc_auc_score(y_val, y_pred)

0.8873265511737458

Applying previous steps to clear test data and make prediction.

In [53]:
X_test = pd.read_parquet('data/test.parquet')
cat_cols = [
    'channel_code', 'city', 'city_type',
    'index_city_code', 'ogrn_month', 'ogrn_year',
    'branch_code', 'okved', 'segment'
]

X_test[cat_cols] = X_test[cat_cols].astype("category")
X_test.drop(labels=['id']+bad_names, axis=1, inplace=True)

my_imputer = SimpleImputer(strategy='most_frequent')
X_test_new = pd.DataFrame(my_imputer.fit_transform(X_test.copy()))
test_score = model.predict_proba(X_test)[:, 1]

Load sample submission file and fill with our predictions

In [55]:
sample_submission_df = pd.read_csv("data/sample_submission.csv")
sample_submission_df.head()
sample_submission_df["score"] = test_score
sample_submission_df.head()

Unnamed: 0,id,score
0,300000,0.074172
1,300001,0.151056
2,300002,0.00421
3,300003,0.005599
4,300004,0.005088


Save predictions to csv file

In [56]:
sample_submission_df.to_csv("my_submission.csv", index=False)