## 最低限のデータ確認

In [64]:
import os

# Walk the input directory tree and print the joined path of every file found.
for root, _subdirs, files in os.walk("../inputs"):
    for name in files:
        print(os.path.join(root, name))

../inputs\.gitkeep
../inputs\data.csv
../inputs\sample_submission.csv
../inputs\train_flag.csv


In [65]:
import pandas as pd

# Load the three input tables in one pass: the main data table, the training
# labels (train_flag), and the sample submission template.
data, train_flag, init_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../inputs/data.csv",
        "../inputs/train_flag.csv",
        "../inputs/sample_submission.csv",
    )
)

In [66]:
# Preview the first five rows of `data` to see which columns are available.
data.head()

Unnamed: 0,jan_cd,item_name,item_spec,item_category_cd_1,item_category_cd_2,item_category_cd_3,item_category_name,average_unit_price,amount,total_price,...,membership_start_ym,age_category,sex,user_stage,user_flag_1,user_flag_2,user_flag_3,user_flag_4,user_flag_5,user_flag_6
0,4904230041160,ブラックニッカディープブレンド,７００ｍｌ,25,12,1,国産洋酒,1375.0,4,5500,...,201610.0,80代～,女性,メンバー,0,1,0,1,0,0
1,4901777284364,ＳＵ　角瓶ジャンボ,１９２０ｍｌ,25,12,1,国産洋酒,4895.0,1,4895,...,201702.0,50代,女性,ゴールド,0,0,0,0,0,0
2,280743000000,寿司セット（景福）,１パック,18,34,3,セット,1078.0,4,4312,...,202402.0,60代,女性,メンバー,0,1,0,0,0,0
3,247987000000,恵方巻（海鮮）,石狩１本,21,10,3,巻寿司,862.0,5,4310,...,202304.0,40代,女性,メンバー,0,0,0,0,0,0
4,4902222001192,コープ北海道トドックブレンド（エージ）,５ｋｇ,24,1,1,白米道産米,1825.0,2,3650,...,200410.0,50代,女性,メンバー,0,0,0,0,0,0


In [67]:
# (rows, columns) of the main data table.
data.shape

(2757288, 24)

In [68]:
# (rows, columns) of the training label table.
train_flag.shape

(29965, 2)

In [None]:
# (rows, columns) of the sample submission; its rows are the users to predict.
init_sub.shape

(10000, 2)

In [70]:
# Number of rows in `data` per user_id, most frequent first.
data["user_id"].value_counts()

user_id
13bd0916    3220
30dcdd5b    2526
2bdf28ad    2306
179444c9    2165
18a0cc3$    2116
            ... 
1721b3$7       1
35f8b0a        1
1895$c78       1
1c0fd1d7       1
15c1324c       1
Name: count, Length: 40496, dtype: int64

In [71]:
# Class balance of the `churn` label in the training set.
train_flag["churn"].value_counts()


churn
1    22022
0     7943
Name: count, dtype: int64

## 簡易的なモデル作成

In [72]:
# Build one row per user with two aggregate features:
#   date_count             - number of non-null `date` entries for the user
#   average_unit_price_sum - sum of `average_unit_price` over the user's rows
# Named aggregation yields flat, correctly typed columns directly, so there is
# no need to round-trip through an object ndarray, rename the columns by hand,
# and re-coerce with pd.to_numeric(errors="coerce") (which would silently turn
# any unexpected value into NaN).
features = (
    data.groupby("user_id")
    .agg(
        date_count=("date", "count"),
        average_unit_price_sum=("average_unit_price", "sum"),
    )
    .reset_index()
)

In [73]:
# Preview the per-user aggregate features.
features.head()

Unnamed: 0,user_id,date_count,average_unit_price_sum
0,$1$c92,7,1075.0
1,$1cd7f,6,1748.0
2,$1d062,2,276.0
3,$1d3a9,11,1420.0
4,$1f87d,1,1098.0


In [74]:
# Attach the per-user features to the training labels. `features` has one row
# per user_id, so validate="many_to_one" makes the merge fail loudly if that
# ever stops being true instead of silently duplicating training rows.
train_merged = pd.merge(
    train_flag, features, on="user_id", how="left", validate="many_to_one"
)

# Take X and y from the same merged frame so they are row-aligned by construction.
X_train = train_merged.drop(["user_id", "churn"], axis=1)
y_train = train_merged["churn"].to_numpy()

In [75]:
# Build the test feature matrix: look up each submission user's features
# (left join keeps the submission row order), then drop the ID column, which
# is not a model input.
X_test = (
    init_sub[["user_id"]]
    .merge(features, on="user_id", how="left")
    .drop(["user_id"], axis=1)
)

In [76]:
# Sanity check: the label vector should have one entry per training row.
y_train.shape

(29965,)

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

# Keep cross-validation (out-of-fold) predictions separate from test predictions.
y_preds_cv = []    # validation predictions, one array per fold
y_test_preds = []  # test predictions, one array per fold (averaged for the submission)
models = []        # trained boosters, one per fold
oof_train = np.zeros(len(X_train))  # out-of-fold prediction for every training row
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)

# No categorical columns are used by this baseline.
categorical_features = []

params = {
    "objective" : "binary",
    "max_bin" : 300,
    "learning_rate" : 0.1,
    "num_leaves" : 40,
    "metric" : "auc",
    "verbose" : -1
}

# Each fold trains on four parts and validates on the remaining one.
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    # Split features and labels for this fold.
    X_tr = X_train.iloc[train_index, :]
    X_val = X_train.iloc[valid_index, :]
    y_tr = y_train[train_index]
    y_val = y_train[valid_index]

    # reference=lgb_train builds the validation Dataset against the training
    # Dataset's structure.
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=categorical_features)

    # Train with early stopping: stop once the validation score has not improved
    # for 100 rounds, logging the metric every 100 rounds.
    model = lgb.train(params, lgb_train,
                        valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=10000,
                        callbacks=[lgb.early_stopping(100),
                        lgb.log_evaluation(period=100)]
                        )

    # Predict the held-out fold once and reuse the result for both the OOF vector
    # and the per-fold list (previously the identical predict call ran twice).
    y_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_train[valid_index] = y_pred
    y_preds_cv.append(y_pred)

    # Keep the fold model so it can be inspected or reused later.
    models.append(model)

    # Predict the test set with this fold's model; the per-fold predictions are
    # ensembled afterwards.
    y_test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_test_preds.append(y_test_pred)
    
    

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.887442	valid_1's auc: 0.858916
Early stopping, best iteration is:
[15]	training's auc: 0.873381	valid_1's auc: 0.861709
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.885536	valid_1's auc: 0.86252
Early stopping, best iteration is:
[10]	training's auc: 0.870868	valid_1's auc: 0.864344
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.884245	valid_1's auc: 0.866523
Early stopping, best iteration is:
[15]	training's auc: 0.870557	valid_1's auc: 0.869833
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.886324	valid_1's auc: 0.857004
Early stopping, best iteration is:
[29]	training's auc: 0.876223	valid_1's auc: 0.860337
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.884842	valid_1's auc: 0.863131
Early stopping, best iteration is:
[21]	training's auc: 0.87

In [78]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the out-of-fold predictions against the training labels.
roc_auc_score(y_train, oof_train)

0.859690756749917

## 提出ファイルの作成

In [None]:
# y_test_preds holds one test prediction array per fold; average them to get
# the submitted prediction.
y_sub = sum(y_test_preds) / len(y_test_preds)
init_sub["pred"] = y_sub

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs("../output", exist_ok=True)
init_sub.to_csv("../output/sub_1.csv", index=False)

init_sub.head()


Unnamed: 0,user_id,pred
0,$1d062,0.908284
1,$5$ab$4,0.885568
2,$5$f5af,0.885311
3,$623182,0.887064
4,$65b$2,0.721478
