In [75]:
import os
import gc
import sys
import time
import datetime
import pandas as pd
import numpy as np
from contextlib import contextmanager
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold,train_test_split
from catboost import Pool
from catboost import CatBoostRegressor,CatBoostClassifier

In [31]:
import itertools
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# ===============
# Feature Engineering
# ===============

class SinCos():
    def __init__(self, feature_name, period):
        '''
        input
        ---
        feature_name(str): name of feature
        period(int): period of feature
        '''
        self.feature_name = feature_name
        self.period = period

    def create_features(self, df):
        df['{}_sin'.format(self.feature_name)] = np.sin(2 * np.pi * df[self.feature_name]/self.period)
        df['{}_cos'.format(self.feature_name)] = np.cos(2 * np.pi * df[self.feature_name] / self.period)
        new_cols = ["{}_{}".format(self.feature_name, key) for key in ["sin", "cos"]]

        return df, new_cols


class Frequency():
    def __init__(self, categorical_columns):
        '''
        input
        ---
        categorical_columns(list): categorical columns
        '''
        self.categorical_columns = categorical_columns

    def create_features(self, df):
        new_cols = []
        for index,col in enumerate(self.categorical_columns):
            print("======{}/{}=====".format(index,len(self.categorical_columns)))
            fname = '{}_Frequency'.format(col)
            df[fname] = df.groupby(col)[col].transform('count') / len(df)
            new_cols.append(fname)
        return df, new_cols


In [3]:
import logging
import sys

LOGGER = logging.getLogger()
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")


def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    LOGGER.handlers = []
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(FORMATTER)
        handler.setLevel(stderr_level)
        LOGGER.addHandler(handler)

    if out_file is not None:
        handler = logging.FileHandler(out_file)
        handler.setFormatter(FORMATTER)
        handler.setLevel(file_level)
        LOGGER.addHandler(handler)

    LOGGER.info("logger set up")
    return LOGGER

In [83]:
import lightgbm as lgb
from scipy.stats import rankdata

def train_lgbm(X_train, y_train, X_valid, y_valid, X_test, categorical_features,lgb_params,fit_params, model_name,
               loss_func, rank=False, calc_importances=True):
    train = lgb.Dataset(X_train, y_train,categorical_feature=categorical_features)
    if X_valid is not None:
        valid = lgb.Dataset(X_valid, y_valid,categorical_feature=categorical_features)
    evals_result = {}
    if X_valid is not None:
        model = lgb.train(
            lgb_params,
            train,
            valid_sets=[valid],
            valid_names=['valid'],
            evals_result=evals_result,
            **fit_params
        )
    else:
        model = lgb.train(
            lgb_params,
            train,
            evals_result=evals_result,
            **fit_params
        )
    LOGGER.info(f'Best Iteration: {model.best_iteration}')

    # train score
    if X_valid is None:
        y_pred_train = model.predict(X_train, num_iteration=fit_params["num_boost_round"])
        y_pred_train[y_pred_train<0] = 0
        train_loss = loss_func(y_train, y_pred_train)
    else:
        y_pred_train = model.predict(X_train, num_iteration=model.best_iteration)
        y_pred_train[y_pred_train < 0] = 0
        train_loss = loss_func(y_train, y_pred_train)

    if X_valid is not None:
        # validation score
        y_pred_valid = model.predict(X_valid)
        y_pred_valid[y_pred_valid < 0] = 0
        valid_loss = loss_func(y_valid, y_pred_valid)
        # save prediction
        np.save(f'{model_name}_train.npy', y_pred_valid)
    else:
        y_pred_valid = None
        valid_loss = None

    # save model
    model.save_model(f'{model_name}.txt')

    if X_test is not None:
        # predict test
        y_pred_test = model.predict(X_test)
        y_pred_test[y_pred_test < 0] = 0
        # save prediction
        np.save(f'{model_name}.npy', y_pred_test)
    else:
        y_pred_test = None

    if calc_importances:
        importances = pd.DataFrame()
        importances['feature'] = feature_name
        importances['gain'] = model.feature_importance(importance_type='gain')
        importances['split'] = model.feature_importance(importance_type='split')
    else:
        importances = None

    return y_pred_valid, y_pred_test, train_loss, valid_loss, importances, model.best_iteration

def train_cat(X_train, y_train, X_valid, y_valid, X_test, categorical_features,model_name,loss_func,rank=False):

    train = Pool(X_train, y_train, cat_features=categorical_features)
    valid = Pool(X_valid, y_valid, cat_features=categorical_features)
    evals_result = {}
    model = CatBoostRegressor(random_seed=0, learning_rate=0.1,num_boost_round = 5000,loss_function='RMSE')
    model.fit(train,
              eval_set=valid,  # 検証用データ
              early_stopping_rounds=1000,  # 10回以上精度が改善しなければ中止
              verbose= 100,
              use_best_model=True,  # 最も精度が高かったモデルを使用するかの設定
              plot=False)  # 誤差の推移を描画するか否かの設定

    y_pred_train = model.predict(X_train)
    y_pred_train[y_pred_train<0] = 0
    train_loss = loss_func(y_train, y_pred_train)
    if X_valid is not None:
        # validation score
        y_pred_valid = model.predict(X_valid)
        y_pred_valid[y_pred_valid < 0] = 0
        valid_loss = loss_func(y_valid, y_pred_valid)
        # save prediction
        np.save(f'{model_name}.npy', y_pred_valid)
    else:
        y_pred_valid = None
        valid_loss = None

    # save model
    model.save_model(f'{model_name}.txt')

    if X_test is not None:
        # predict test
        y_pred_test = model.predict(X_test)
        y_pred_test[y_pred_test < 0] = 0
        # save prediction
        np.save(f'{model_name}.npy', y_pred_test)
    else:
        y_pred_test = None

    return y_pred_valid, y_pred_test, train_loss, valid_loss

ここから始まり

train,test準備

In [7]:
df = pd.read_csv("./master.csv")

In [8]:
train = df[df["target_flag"]==0]
test = df[df["target_flag"]==1]
train.to_csv("./train.csv")
test.to_csv("./test.csv")

In [45]:
sampling_train = pd.read_csv('./train.csv')[-8000000:]#ここを6月のみ取るようにする

In [46]:
len(sampling_train)

8000000

In [47]:
test = pd.read_csv("./test.csv")
y = sampling_train['imp'].copy()
n_train = len(sampling_train)
concat_train_test = sampling_train.append(test).reset_index(drop=True)
gc.collect()

85

In [48]:
import copy
train = copy.deepcopy(concat_train_test)

特徴量作成

cluster_cf_idを分離してカテゴリー特徴に

In [49]:
print('[feature]: make age feature...')
d = {
    1 : 'u20', 2 : 'u20', 3 : 'u20', 4 : 'u20', 5 : 'u20'
    , 6 : '20~34', 7 : '20~34', 8 : '20~34', 9 : '20~34', 10 : '20~34'
    , 11 : '35u', 12 : '35u', 13 : '35u', 14 : '35u', 15 : '35u'
    , 16 : '20~34', 17 : '20~34', 18 : '20~34', 19 : '20~34', 20 : '20~34'
    , 21 : '35u', 22 : '35u', 23 : '35u', 24 : '35u', 25 : '35u'
    , 26 : 'null', 27 : 'null', 28 : 'null', 29 : 'null', 30 : 'null'
}
age_df = pd.DataFrame(d.values(), index=d.keys()).reset_index()
age_df.columns = ['cluster_cf_id', 'age']

print('[feature]: make gender feature...')
d = {
1 : 'null', 2 : 'null', 3 : 'null', 4 : 'null', 5 : 'null'
, 6 : 'm', 7 : 'm', 8 : 'm', 9 : 'm', 10 : 'm'
, 11 : 'm', 12 : 'm', 13 : 'm', 14 : 'm', 15 : 'm'
, 16 : 'f', 17 : 'f', 18 : 'f', 19 : 'f', 20 : 'f'
, 21 : 'f', 22 : 'f', 23 : 'f', 24 : 'f', 25 : 'f'
, 26 : 'null', 27 : 'null', 28 : 'null', 29 : 'null', 30 : 'null'
}
gender_df = pd.DataFrame(d.values(), index=d.keys()).reset_index()
gender_df.columns = ['cluster_cf_id', 'gender']

print('[feature]: make cf feature...')
cf_dic = {
1 : 'very little', 2 : 'little', 3 : 'normal', 4 : 'much', 5 : 'very much'
, 6 : 'very little', 7 : 'little', 8 : 'normal', 9 : 'much', 10 : 'very much'
, 11 : 'very little', 12 : 'little', 13 : 'normal', 14 : 'much', 15 : 'very much'
, 16 : 'very little', 17 : 'little', 18 : 'normal', 19 : 'much', 20 : 'very much'
, 21 : 'very little', 22 : 'little', 23 : 'normal', 24 : 'much', 25 : 'very much'
, 26 : 'very little', 27 : 'little', 28 : 'normal', 29 : 'much', 30 : 'very much'
}
cf_df = pd.DataFrame(d.values(), index=d.keys()).reset_index()
cf_df.columns = ['cluster_cf_id', 'cf']

print('[feature]: join features...')
train = pd.merge(train, age_df, on='cluster_cf_id', how='left')
train = pd.merge(train, gender_df, on='cluster_cf_id', how='left')
train = pd.merge(train, cf_df, on='cluster_cf_id', how='left')

[feature]: make age feature...
[feature]: make gender feature...
[feature]: make cf feature...
[feature]: join features...


日付データに対して日にち等を分離、かつsin,cosで連続値に

In [50]:
train["cm_start_at"] = pd.to_datetime(train["cm_start_at"])
train["date_day"] = train["cm_start_at"].dt.day
train["date_week"] = train["cm_start_at"].dt.week
train["date_dayofweek"] = train["cm_start_at"].dt.dayofweek
train["date_hour"] = train["cm_start_at"].dt.hour
sincos = SinCos(feature_name="date_dayofweek", period=6)
train, _ = sincos.create_features(train)
sincos = SinCos(feature_name="date_day", period=30)
train, _ = sincos.create_features(train)
sincos = SinCos(feature_name="date_week", period=6)
train, _ = sincos.create_features(train)
sincos = SinCos(feature_name="date_hour", period=23)
train, _ = sincos.create_features(train)

後の特徴量作成のために準備

In [51]:
categorical_features = ["date_hour","date_dayofweek","date_week","date_day","cf","gender","age","cue_point_sequence","cue_point_role","channel_id","genre_id","series_id","series_title","specified","lived"]

target_encoding

In [52]:
target = Target(categorical_features,"imp")
train, new_cols = target.create_features(train)



Frequency_encoding

In [53]:
frequency = Frequency(categorical_features)
train, new_cols = frequency.create_features(train)



In [54]:
gc.collect()

392

パラメータ設定

In [71]:
SEED = 0
LGBM_PARAMS = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': "rmse",
    'learning_rate': 0.1,
    'verbose': -1,
    'nthread': -1,
    'seed': SEED,
}
LGBM_FIT_PARAMS = {
    'num_boost_round': 5000,
    'early_stopping_rounds': 1000,
    'verbose_eval': 500,
}

損失関数設定

In [56]:
def calc_loss(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

train準備

In [57]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

カテゴリー変数を数字にする

In [58]:
for index,c in enumerate(categorical_features):
    print("======{}/{}=====".format(index,len(categorical_features)))
    lbl = LabelEncoder()
    lbl.fit(list(train[c].astype("str").values))
    train[c] = lbl.transform(list(train[c].astype("str").values))



いらないカラムを落とす

In [59]:
train_drop_cols = ["campaign_id","cue_point_id","cm_start_at","imp","Unnamed: 0","target_flag","cluster_cf_id"]
train = train.drop(train_drop_cols, axis=1)

train,test分離

In [60]:
train_test = train[n_train:]
train_train = train[:n_train]

In [61]:
x_train, x_valid, y_train, y_valid = train_test_split(train_train,y,test_size=0.3,random_state=0)

train:rightgbm

In [72]:
y_pred_valid, y_pred_test, train_loss, valid_loss, importances, best_iter = train_lgbm(
                x_train, y_train, x_valid, y_valid, train_test,
                categorical_features=categorical_features,
                lgb_params=LGBM_PARAMS,
                fit_params=LGBM_FIT_PARAMS,
                loss_func=calc_loss,
                model_name = "first_lightgbm",
                rank=False,
                calc_importances=False)

Training until validation scores don't improve for 1000 rounds
[500]	valid's rmse: 212.483
[1000]	valid's rmse: 213.629
Early stopping, best iteration is:
[355]	valid's rmse: 212.148


train:catboost

In [None]:
y_pred_valid, y_pred_test, train_loss, valid_loss=train_cat(x_train, y_train, x_valid, y_valid,test,categorical_features,
                                                            model_name="first_catboost",loss_func=calc_loss,rank=False)

0:	learn: 255.7499711	test: 259.7253080	best: 259.7253080 (0)	total: 2.64s	remaining: 3h 40m 6s
100:	learn: 212.7577879	test: 218.3891794	best: 218.3108245 (98)	total: 3m 50s	remaining: 3h 5m 57s
200:	learn: 209.0461561	test: 216.1953789	best: 216.1953789 (200)	total: 7m 49s	remaining: 3h 6m 53s
300:	learn: 206.5847620	test: 215.4191065	best: 215.3752401 (297)	total: 11m 57s	remaining: 3h 6m 45s
400:	learn: 204.7432490	test: 215.1324990	best: 214.9976273 (373)	total: 16m 1s	remaining: 3h 3m 44s
500:	learn: 202.8132268	test: 214.8505865	best: 214.8241692 (495)	total: 20m 23s	remaining: 3h 3m 5s


SUBMIT

In [73]:
sub = pd.read_csv("./test.csv")
sub['imp'] = y_pred_test
sub[["cue_point_id","cue_point_sequence","cluster_cf_id","imp"]].to_csv('submission_cat.csv', index=False)