In [1]:
import numpy as np
import pandas as pd
import time
import warnings
import gc
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
import lightgbm as lgb
from sklearn import preprocessing
warnings.filterwarnings('ignore')
plt.style.use('seaborn')

### 将数据进行格式化处理

In [2]:
# Reduce the memory usage - Inspired by Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits their range.

    Columns are replaced on ``df`` itself (the frame is modified in place) and the
    same frame is returned for convenience. Only columns whose dtype is one of
    int16/int32/int64/float16/float32/float64 are considered; everything else
    (objects, datetimes, booleans, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default True
        When true, print the resulting memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after downcasting.

    Notes
    -----
    The range check only looks at min/max. Float columns that fit into float16
    keep their range but lose precision (float16 carries roughly three
    significant decimal digits).
    """
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            # An empty column has NaN min/max, fails every comparison and is left as is.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: use the widest float.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero-byte frame reports 0% instead of nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

### 简单处理train和test数据

In [3]:
# Load a previously saved CSV; later cells read its `card_id` and `target` columns.
# NOTE(review): the file name suggests blended model predictions for the test set — confirm.
test_label = pd.read_csv("../../data/6911+6912+0103163343_2.csv")

In [4]:
# label = 1 where the stored target reaches the 1.499360 cut-off, otherwise 0.
test_label["label"] = test_label["target"].ge(1.499360).astype("int64")
# The original target values are then replaced by the constant -9.
test_label["target"] = -9

In [5]:
# --- Train: derive the binary label, then overwrite the raw target with a constant ---
train = pd.read_csv('../../data/train.csv', parse_dates=["first_active_month"])
train["label"] = 0
train.loc[(train['target'] < -30), 'label'] = 1
# NOTE(review): target is replaced by the constant 9 (test rows carry -9 from test_label);
# presumably a marker for the row's origin — confirm.
train["target"] = 9

# --- Test: pick up `label` / `target` from test_label via card_id ---
test = pd.read_csv('../../data/test.csv', parse_dates=["first_active_month"])
test = test.merge(test_label, on="card_id", how="left")

# --- Stack both sets and add calendar features of first_active_month ---
data = pd.concat([train, test], axis=0)
data["month"] = data["first_active_month"].dt.month
data["day"] = data["first_active_month"].dt.day
data["dayofyear"] = data["first_active_month"].dt.dayofyear
# Series.dt.weekofyear was removed in pandas 2.0; the per-timestamp attribute is
# available in old and new releases alike and yields NaN for missing dates.
data['week'] = data["first_active_month"].apply(lambda ts: ts.weekofyear)
data['dayofweek'] = data['first_active_month'].dt.dayofweek
# Days between activation and 2018-02-01. Subtracting datetime64 values directly
# avoids arithmetic on an object column of datetime.date, which newer pandas does
# not convert back to timedeltas.
data['days'] = (pd.Timestamp(2018, 2, 1) - data['first_active_month']).dt.days
data["quarter"] = data["first_active_month"].dt.quarter
data["days_feature1"] = data["days"] * data["feature_1"]
data["days_feature2"] = data["days"] * data["feature_2"]
data["days_feature3"] = data["days"] * data["feature_3"]
print(data.shape)
print(data['label'].value_counts())

(325540, 17)
0    321967
1      3573
Name: label, dtype: int64


In [6]:
# Preview the first rows of the combined train/test frame.
data.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,label,month,day,dayofyear,week,dayofweek,days,quarter,days_feature1,days_feature2,days_feature3
0,2017-06-01,C_ID_92a2005557,5,2,1,9,0,6.0,1.0,152.0,22.0,3.0,245.0,2.0,1225.0,490.0,245.0
1,2017-01-01,C_ID_3d0044924f,4,1,0,9,0,1.0,1.0,1.0,52.0,6.0,396.0,1.0,1584.0,396.0,0.0
2,2016-08-01,C_ID_d639edf6cd,2,2,0,9,0,8.0,1.0,214.0,31.0,0.0,549.0,3.0,1098.0,1098.0,0.0
3,2017-09-01,C_ID_186d6a6901,4,3,0,9,0,9.0,1.0,244.0,35.0,4.0,153.0,3.0,612.0,459.0,0.0
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,9,0,11.0,1.0,305.0,44.0,2.0,92.0,4.0,92.0,276.0,0.0


### 特征工程

In [7]:
def _iso_week(dates):
    """Return the ISO week number of each timestamp in ``dates`` on old and new pandas."""
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        week = accessor.isocalendar()["week"]
        # isocalendar() yields a nullable integer column; convert back to a plain
        # numpy dtype (float with NaN when dates are missing, int64 otherwise).
        return week.astype("float64") if week.isna().any() else week.astype("int64")
    return accessor.weekofyear


def pro_trans(trans, reference_date=None):
    """Clean a transactions frame and add row-level date and category features.

    Works for both historical_transactions and new_merchant_transactions.
    ``trans`` is modified in place and returned. The function is not idempotent:
    a second call would map the already-encoded flag columns to NaN.

    Columns read: authorized_flag, category_1, category_2, category_3,
    merchant_id, purchase_date, month_lag, purchase_amount, installments.

    Parameters
    ----------
    trans : pandas.DataFrame
        Raw transactions.
    reference_date : datetime-like, optional
        The "now" used for ``month_diff``. Defaults to the current date/time,
        which makes the feature depend on when the notebook is run; pass a fixed
        date for reproducible results.

    Returns
    -------
    pandas.DataFrame
        The same ``trans`` object with the added/encoded columns.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    reference_date = pd.Timestamp(reference_date)

    # Encode Y/N flags as 1/0.
    trans["authorized_flag"] = trans["authorized_flag"].map({"Y": 1, "N": 0})
    trans["category_1"] = trans["category_1"].map({"Y": 1, "N": 0})

    # Calendar parts of the purchase timestamp (vectorised .dt access instead of
    # a Python-level lambda per row).
    trans["purchase_date"] = pd.to_datetime(trans["purchase_date"])
    purchase = trans["purchase_date"].dt
    trans["month"] = purchase.month
    trans["weekofyear"] = _iso_week(trans["purchase_date"])
    trans["dayofweek"] = purchase.dayofweek
    trans["weekend"] = (trans["dayofweek"] >= 5).astype(int)
    trans["hour"] = purchase.hour
    trans["quarter"] = purchase.quarter
    trans["minute"] = purchase.minute

    # Age of the purchase in 30-day steps relative to reference_date, shifted by month_lag.
    elapsed_days = (reference_date - trans["purchase_date"]).dt.days
    trans["month_diff"] = elapsed_days // 30 + trans["month_lag"]
    trans["month_diff2"] = trans["month"] - trans["month_lag"]

    # Fill missing categorical values with fixed defaults.
    trans["category_2"] = trans["category_2"].fillna(value=2.0)
    trans["category_3"] = trans["category_3"].fillna(value="A")
    trans["merchant_id"] = trans["merchant_id"].fillna(value="M_ID_00a6ca8a8a")

    # Integer-encode category_3 by sorted distinct value (0, 1, 2, ...), i.e. the
    # codes a label encoder produces, without building a Python list of all rows.
    trans["category_3"] = pd.factorize(trans["category_3"], sort=True)[0]

    # Per-category statistics broadcast back to every row. transform() keeps the
    # row index; a plain agg() result is indexed by category value and would be
    # aligned against row numbers, leaving nearly all rows NaN.
    for col in ["category_2", "category_3"]:
        for source, alias in (("purchase_amount", "pa"), ("installments", "im")):
            for stat in ("mean", "max", "min", "var"):
                trans[col + "_" + alias + "_" + stat] = (
                    trans[source].groupby(trans[col]).transform(stat)
                )

    return trans

In [8]:
# Taking Reference from Other Kernels
def trans_agg(trans, nunique_col, prefix, reference_date=None):
    """Aggregate a processed transactions frame into one feature row per group.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions carrying every column named in ``agg_func`` below.
    nunique_col : str
        Column to group by (the notebook passes ``"card_id"``).
    prefix : str
        Text prepended to every generated column name.
    reference_date : datetime-like, optional
        The "now" used for ``<prefix>purchase_date_uptonow``. Defaults to the
        current date/time; pass a fixed date for reproducible results.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``nunique_col``, ``<prefix>transactions_count``, one
        ``<prefix><column>_<stat>`` per entry of ``agg_func``, then
        ``<prefix>purchase_date_diff`` / ``_average`` / ``_uptonow``.
        ``<prefix>purchase_date_max`` / ``_min`` are returned as float seconds
        since 1970-01-01.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    reference_date = pd.Timestamp(reference_date)

    # "month_diff" used to appear twice in this literal; Python keeps the first
    # position with the last value, so the earlier list was silently discarded.
    # The single entry below reproduces the effective spec and column order.
    agg_func = {"purchase_date": ["max", "min"],
                "month_diff": ["mean", "max", "min", "nunique", "var"],
                "weekend": ["max", "min", "mean", "sum"],
                "authorized_flag": ["max", "min", "mean", "sum"],
                "category_1": ["max", "min", "mean", "sum"],
                "category_2": ["max", "min", "mean", "sum"],
                "category_3": ["max", "min", "mean", "sum"],
                "installments": ["max", "min", "mean", "std", "sum"],
                "purchase_amount": ["max", "min", "mean", "std", "sum"],
                "month_lag": ["mean", "max", "min", "nunique", "var"],
                "card_id": ["size", "nunique"],
                "month": ["max", "min", "nunique"],
                "hour": ["max", "min", "nunique"],
                "weekofyear": ["max", "min", "nunique"],
                "dayofweek": ["max", "min", "nunique"],
                "merchant_id": ["nunique"],
                "city_id": ["nunique"],
                "state_id": ["nunique"],
                "subsector_id": ["max", "min", "nunique"],
                "merchant_category_id": ["max", "min", "nunique"]}

    grouped = trans.groupby([nunique_col])
    agg_trans = grouped.agg(agg_func)
    # Flatten the (column, stat) MultiIndex into "<prefix><column>_<stat>".
    agg_trans.columns = [prefix + '_'.join(col).strip() for col in agg_trans.columns.values]
    # Row count per group, placed directly after the key column. Both objects are
    # indexed by the group key, so the insert aligns without a second groupby + merge.
    agg_trans.insert(0, '{}transactions_count'.format(prefix), trans.groupby(nunique_col).size())
    agg_trans = agg_trans.reset_index()

    first_col = prefix + "purchase_date_min"
    last_col = prefix + "purchase_date_max"
    agg_trans[last_col] = pd.to_datetime(agg_trans[last_col])
    agg_trans[first_col] = pd.to_datetime(agg_trans[first_col])
    # Days between first and last purchase, the same per transaction, and days
    # from the last purchase to reference_date.
    agg_trans[prefix + "purchase_date_diff"] = (agg_trans[last_col] - agg_trans[first_col]).dt.days
    agg_trans[prefix + "purchase_date_average"] = agg_trans[prefix + "purchase_date_diff"] / agg_trans[prefix + "card_id_size"]
    agg_trans[prefix + "purchase_date_uptonow"] = (reference_date - agg_trans[last_col]).dt.days

    # Convert the two timestamps to epoch seconds. Going through a timedelta is
    # independent of the datetime storage unit, unlike astype(int64) * 1e-9,
    # which is only correct for nanosecond resolution.
    epoch = pd.Timestamp("1970-01-01")
    for feature in [last_col, first_col]:
        agg_trans[feature] = (agg_trans[feature] - epoch).dt.total_seconds()

    return agg_trans

### 从new_merchant_transactions中提取特征

In [9]:
# Load the new-merchant transactions and downcast their numeric columns to save memory.
new_trans = reduce_mem_usage(pd.read_csv('../../data/new_merchant_transactions.csv'))

Mem. usage decreased to 114.20 Mb (45.5% reduction)


In [10]:
# Add the row-level date/category features. pro_trans writes its columns onto the frame
# it receives and returns that same frame, so `new_trans_pro` and `new_trans` are one object.
new_trans_pro = pro_trans(new_trans)

In [11]:
# Collapse the transaction rows into one row of "new_"-prefixed aggregate features per card_id.
new_card_trans = trans_agg(new_trans_pro, nunique_col="card_id", prefix='new_')
print(new_card_trans.shape)

(290001, 70)


#### 通过点击预估率开发特征

In [14]:
# Attach the card-level columns of `data` (including `label`) to every transaction row.
# Both frames carry `month` and `quarter` columns, so the merge suffixes them with _x / _y.
# NOTE(review): if `label` is what the model predicts, the per-key label rates built from
# this frame in the following cells can leak it — confirm the validation scheme.
new_train_data = new_trans.merge(data, on="card_id", how="left")

In [16]:
def _label_rate_by(frame, key, name):
    """Summarise ``label`` per value of ``key`` and add a smoothed rate.

    Returns a frame with the columns ``key``, ``new_<name>_sum`` (sum of label),
    ``new_<name>_cnt`` (number of non-null labels) and ``<name>_cvr``, where
    ``<name>_cvr = sum / (cnt + 3)``; the +3 in the denominator pulls the rate
    of sparsely populated groups towards zero.

    The aggregation avoids the ``agg({"new_name": "func"})`` renaming form on a
    selected column, which raises SpecificationError on pandas >= 1.0.
    """
    sum_col = "new_{}_sum".format(name)
    cnt_col = "new_{}_cnt".format(name)
    stats = frame.groupby(key)["label"].agg(["sum", "count"])
    stats.columns = [sum_col, cnt_col]
    stats = stats.reset_index()
    stats["{}_cvr".format(name)] = stats[sum_col] / (stats[cnt_col] + 3)
    return stats


# One rate table per grouping key; variable and column names match the downstream cells.
new_card = _label_rate_by(new_train_data, "card_id", "card")
new_merchant = _label_rate_by(new_train_data, "merchant_id", "merchant")
new_city = _label_rate_by(new_train_data, "city_id", "city")
new_mertcate = _label_rate_by(new_train_data, "merchant_category_id", "mertcate")
new_state = _label_rate_by(new_train_data, "state_id", "state")
new_subsector = _label_rate_by(new_train_data, "subsector_id", "subsector")


#### 将点击通过率特征进行整合

In [17]:
# Join each per-key rate table back onto the transaction-level frame, in a fixed order.
new_data = new_train_data
for _rate_table, _join_key in (
    (new_card, "card_id"),
    (new_merchant, "merchant_id"),
    (new_city, "city_id"),
    (new_mertcate, "merchant_category_id"),
    (new_state, "state_id"),
    (new_subsector, "subsector_id"),
):
    new_data = new_data.merge(_rate_table, on=_join_key, how="left")

#### 对点击通过率开发排名特征

In [19]:
# Dense descending rank of every rate column within every grouping key:
# 6 rate columns x 6 keys = 36 columns named "<rate>_cvr_<key alias>_rank".
_rank_keys = (
    ("card_id", "card"),
    ("merchant_id", "merchant"),
    ("city_id", "city"),
    ("merchant_category_id", "mertcate"),
    ("state_id", "state"),
    ("subsector_id", "subsector"),
)
for _rate_name in ("card", "merchant", "city", "mertcate", "state", "subsector"):
    _rate_col = _rate_name + "_cvr"
    for _group_key, _key_alias in _rank_keys:
        # A rate ranked within its own key (e.g. card_cvr within card_id) is constant
        # inside each group, so those columns are 1 wherever the rate is not missing.
        new_data[_rate_col + "_" + _key_alias + "_rank"] = (
            new_data.groupby(_group_key, as_index=False)[_rate_col]
            .rank(ascending=False, method='dense')
        )



### 从historical_transactions中提取特征

In [None]:
# Load the historical transactions and downcast their numeric columns to save memory.
hist_trans = reduce_mem_usage(pd.read_csv('../../data/historical_transactions.csv'))

In [None]:
# Add the row-level date/category features; pro_trans modifies `hist_trans` itself
# and returns it, so both names refer to the same frame.
hist_trans_pro = pro_trans(hist_trans)