In [None]:
import numpy as np
import pandas as pd
# Raise the pandas display limits used when frames are rendered in cell output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

from itertools import product
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import plot_importance

from sklearn.metrics import mean_squared_error, mean_absolute_error, average_precision_score, f1_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, confusion_matrix,classification_report


def plot_features(booster, figsize):
    """Draw the feature-importance chart of ``booster`` on a new figure.

    Parameters:
        booster: forwarded unchanged to ``plot_importance``.
        figsize: forwarded to ``plt.subplots`` as the figure size.

    Returns:
        Whatever ``plot_importance`` returns for the new axes.
    """
    # Only the axes are needed; the figure handle is not used afterwards.
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axes)

from datetime import datetime
import time
import sys
import gc
import pickle
sys.version_info

import joblib
import pickle

## Import Dataset

In [None]:
# Read the five input CSV files, in this order, into one DataFrame each.
login, purchase_detail, user_info, user_label_train, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/iamthebestcoderopen2020/login.csv',
        '../input/iamthebestcoderopen2020/purchase_detail.csv',
        '../input/iamthebestcoderopen2020/user_info.csv',
        '../input/iamthebestcoderopen2020/user_label_train.csv',
        '../input/iamthebestcoderopen2020/submission.csv',
    )
)

## Preprocess personal info

In [None]:
# Preview the first rows only instead of rendering the whole frame.
purchase_detail.head()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
user_info.head()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
login.head()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
user_label_train.head()

In [None]:
# Left join: every user_info row is kept; rows without a match in
# user_label_train get a missing label.
df = user_info.merge(user_label_train, how='left', on='userid')

### Birth date data cleaning

In [None]:
# Summary statistics of the raw birth_year column.
user_info['birth_year'].describe()

In [None]:
## People younger than 5 or older than 70 are not likely to use Shopee
problem = (df['birth_year'] > 2015) | (df['birth_year'] < 1950)
## Impute with the median of the remaining (plausible) birth years only, so the
## implausible values being replaced cannot skew the replacement value.
df.loc[problem, 'birth_year'] = df.loc[~problem, 'birth_year'].median()

In [None]:
# Summary of enroll_time once parsed as datetimes.
pd.to_datetime(df['enroll_time']).describe()

### Convert enrollment date to weeks since registration

In [None]:
df['enroll_time'] = pd.to_datetime(df.enroll_time)
# Reference date that enrollment time is measured from.
# NOTE(review): presumably the earliest enroll_time in the data - confirm.
dataset_time = '2015-05-27'
# Elapsed time in weeks = elapsed days / 7. The previous divisor of 52 is the
# number of weeks in a year, so the result was neither weeks nor years.
elapsed_days = (df['enroll_time'] - np.datetime64(dataset_time)) / np.timedelta64(1, 'D')
df['enroll_weeks'] = elapsed_days / 7

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

### Mark rows as training or testing data

In [None]:
# data_type is 1 where label is 0 or 1, and 2 for every other row
# (including rows whose label is missing).
df['data_type'] = np.where(df['label'].isin([0, 1]), 1, 2)

# Feature Engineering

## Sales Data Engineering

### Convert datetime data into month

In [None]:
purchase_detail = pd.read_csv('../input/iamthebestcoderopen2020/purchase_detail.csv')

import datetime as dt  # kept as-is: other cells may rely on the `dt` alias
# Parse the whole column in one vectorized call instead of calling
# np.datetime64 once per row through apply().
purchase_detail['date'] = pd.to_datetime(purchase_detail['grass_date'])
# Calendar month number (1-12) of each purchase row.
purchase_detail['month'] = purchase_detail['date'].dt.month

### Get monthly sales for each user

In [None]:
# Mean total_amount per (userid, month), reshaped to one row per user with
# one column per month.
purchase = (
    purchase_detail
    .groupby(['userid', 'month'], as_index=False)
    .agg({'order_count': 'mean', 'total_amount': 'mean'})
    .pivot(index='userid', columns='month', values='total_amount')
    .reset_index()
)
# Positional relabel: this assumes the pivot produced exactly six month
# columns, in the order 2..7.
purchase.columns = ['userid', 'total_2', 'total_3', 'total_4', 'total_5', 'total_6', 'total_7']
# A month without purchases counts as an amount of 0.
purchase = purchase.fillna(0)

In [None]:
# Attach the monthly mean-amount columns; users absent from `purchase` get NaN.
df = df.merge(purchase, how='left', on='userid')

In [None]:
## Total amount for each month
d1 = pd.read_csv('../input/iamthebestcoderopen2020/purchase_detail.csv')
# Vectorized date parsing instead of a per-row np.datetime64 call via apply().
d1['date'] = pd.to_datetime(d1['grass_date'])
d1['month'] = d1['date'].dt.month
# Sum of total_amount per (userid, month), one row per user after the pivot.
d1 = (
    d1.groupby(['userid', 'month'], as_index=False)
    .agg({'order_count': 'mean', 'total_amount': 'sum'})
    .pivot(index='userid', columns='month', values='total_amount')
    .reset_index()
)
# Positional relabel: assumes exactly six month columns, in the order 2..7.
d1.columns = ['userid', 'total_2a', 'total_3a', 'total_4a', 'total_5a', 'total_6a', 'total_7a']
# A month without purchases counts as an amount of 0.
d1 = d1.fillna(0)

In [None]:
# Attach the monthly summed-amount columns; users absent from d1 get NaN.
df = df.merge(d1, how='left', on='userid')

In [None]:
## Order count
d2 = pd.read_csv('../input/iamthebestcoderopen2020/purchase_detail.csv')
# Vectorized date parsing instead of a per-row np.datetime64 call via apply().
d2['date'] = pd.to_datetime(d2['grass_date'])
d2['month'] = d2['date'].dt.month
# Sum of order_count per (userid, month), one row per user after the pivot.
d2 = (
    d2.groupby(['userid', 'month'], as_index=False)
    .agg({'order_count': 'sum', 'total_amount': 'sum'})
    .pivot(index='userid', columns='month', values='order_count')
    .reset_index()
)
# Positional relabel: assumes exactly six month columns, in the order 2..7.
d2.columns = ['userid', 'order_2a', 'order_3a', 'order_4a', 'order_5a', 'order_6a', 'order_7a']
# A month without orders counts as 0 orders.
d2 = d2.fillna(0)

In [None]:
# Attach the monthly order-count columns; users absent from d2 get NaN.
df = df.merge(d2, how='left', on='userid')

### Get sales of each category for each user

In [None]:
# Number of purchase rows per (userid, category_encoded) pair.
# The following cell rebuilds `data` from the CSV and does not depend on this one.
data = (
    pd.read_csv('../input/iamthebestcoderopen2020/purchase_detail.csv')
    .groupby(['userid', 'category_encoded'], as_index=False)
    .size()
)

In [None]:
# Purchase-row counts per user and category, reshaped to one row per user with
# one column per category_encoded value; absent combinations become 0.
data = (
    pd.read_csv('../input/iamthebestcoderopen2020/purchase_detail.csv')
    .groupby(['userid', 'category_encoded'], as_index=False)
    .size()
    .pivot(index='userid', columns='category_encoded', values='size')
    .reset_index()
    .fillna(0)
)

In [None]:
# Attach the per-category count columns; users absent from `data` get NaN.
df = df.merge(data, how='left', on='userid')

## Login Data Engineering

In [None]:
login = (
    pd.read_csv('../input/iamthebestcoderopen2020/login.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(login_mm=lambda frame: frame['date'].dt.month)
    # Preliminary monthly aggregation: mean login_times per user and month.
    .groupby(['userid', 'login_mm'], as_index=False)
    .agg({'login_times': 'mean'})
    .pivot(index='userid', columns='login_mm', values=['login_times'])
    .reset_index()
)
# Positional relabel: assumes exactly six month columns, in the order 2..7.
login.columns = ['userid', 'avg_login_2', 'avg_login_3', 'avg_login_4', 'avg_login_5', 'avg_login_6', 'avg_login_7']

login.head()

In [None]:
# Attach the monthly average-login columns; users absent from `login` get NaN.
df = df.merge(login, how='left', on='userid')

In [None]:
# After this statement `login` holds the per-user, per-month summed login_times.
login = (
    pd.read_csv('../input/iamthebestcoderopen2020/login.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(login_mm=lambda frame: frame['date'].dt.month)
    # Preliminary monthly aggregation: total login_times per user and month.
    .groupby(['userid', 'login_mm'], as_index=False)
    .agg({'login_times': 'sum'})
)

# One row per user with one column per month.
login2 = login.pivot(index='userid', columns='login_mm', values=['login_times']).reset_index()
# Positional relabel: assumes exactly six month columns, in the order 2..7.
login2.columns = ['userid', 'tot_login_2', 'tot_login_3', 'tot_login_4', 'tot_login_5', 'tot_login_6', 'tot_login_7']

login2.head()

In [None]:
# Attach the monthly total-login columns; users absent from login2 get NaN.
df = df.merge(login2, how='left', on='userid')

In [None]:
# First five rows of the merged feature frame.
df.head(n=5)

In [None]:
# Ratios between the average login counts of two months.
# The epsilon goes in the denominator, where it prevents division by zero.
# The previous form `a / b + eps` divided first, so a zero denominator still
# produced inf/NaN and the epsilon only shifted every ratio slightly.
eps = 0.00000000001
df['rate_login_56'] = df['avg_login_6'] / (df['avg_login_5'] + eps)
df['rate_login_67'] = df['avg_login_7'] / (df['avg_login_6'] + eps)
df['rate_login_45'] = df['avg_login_5'] / (df['avg_login_4'] + eps)
df['rate_login_27'] = df['avg_login_7'] / (df['avg_login_2'] + eps)

df.head()

In [None]:
# Transaction (TXN) rates: ratios between the mean purchase amounts of two months.
# The total_* columns come from a zero-filled table, so zero denominators can
# occur. The epsilon therefore belongs in the denominator; the previous form
# `a / b + eps` divided first and still produced inf/NaN.
# The mixed 'rate_txn_' / 'rate_total_' column names are kept unchanged.
eps = 0.00000000001
df['rate_txn_56'] = df['total_6'] / (df['total_5'] + eps)
df['rate_total_67'] = df['total_7'] / (df['total_6'] + eps)
df['rate_total_45'] = df['total_5'] / (df['total_4'] + eps)
df['rate_total_27'] = df['total_7'] / (df['total_2'] + eps)

df.head()

In [None]:
# Ratios between the summed purchase amounts of two months, stored on d1.
# d1 is zero-filled, so the epsilon must sit in the denominator to prevent
# division by zero; `a / b + eps` divided first and still produced inf/NaN.
# NOTE(review): d1 was merged into df in an earlier cell and no later visible
# cell merges it again, so these columns do not appear to reach the model.
# Their names also match rate columns already on df. Confirm the intent.
eps = 0.00000000001
d1['rate_txn_56'] = d1['total_6a'] / (d1['total_5a'] + eps)
d1['rate_total_67'] = d1['total_7a'] / (d1['total_6a'] + eps)
d1['rate_total_45'] = d1['total_5a'] / (d1['total_4a'] + eps)
d1['rate_total_27'] = d1['total_7a'] / (d1['total_2a'] + eps)

In [None]:
# NOTE(review): this rebuilds d2 with the same steps as the earlier
# "Order count" cell; it looks redundant - confirm before removing.
d2 = pd.read_csv('../input/iamthebestcoderopen2020/purchase_detail.csv')
# Vectorized date parsing instead of a per-row np.datetime64 call via apply().
d2['date'] = pd.to_datetime(d2['grass_date'])
d2['month'] = d2['date'].dt.month
# Sum of order_count per (userid, month), one row per user after the pivot.
d2 = (
    d2.groupby(['userid', 'month'], as_index=False)
    .agg({'order_count': 'sum', 'total_amount': 'sum'})
    .pivot(index='userid', columns='month', values='order_count')
    .reset_index()
)
# Positional relabel: assumes exactly six month columns, in the order 2..7.
d2.columns = ['userid', 'order_2a', 'order_3a', 'order_4a', 'order_5a', 'order_6a', 'order_7a']
# A month without orders counts as 0 orders.
d2 = d2.fillna(0)

In [None]:
# d2's order_* columns were already merged into df by an earlier cell.
# Merging the same frame again makes pandas emit order_*_x / order_*_y
# duplicates, which then enter the model as redundant features.
# Merge only the d2 columns that df does not have yet; this also makes the
# cell safe to re-run.
missing_cols = [col for col in d2.columns if col != 'userid' and col not in df.columns]
if missing_cols:
    df = pd.merge(df, d2[['userid'] + missing_cols], how='left', on='userid')

In [None]:
# First five rows of the feature frame after all merges.
df.head(n=5)

In [None]:
# Independent copy used for modeling, so later changes do not touch df.
df_all = df.copy(deep=True)

# Modeling

In [None]:
# Target column, held as a list.
y_col = ['label']

# Columns that are never used as model features.
exclude_col = ['userid', 'data_type', 'enroll_time']


In [None]:
# Feature columns = every df_all column that is neither excluded nor the target.
# The frame's own column order is kept. The previous set arithmetic returned
# the names in arbitrary order, which can differ between kernel sessions
# because Python randomizes string hashing, so the feature order passed to
# xgboost was not reproducible.
non_feature_cols = set(exclude_col) | set(y_col)
x_col = [col for col in df_all.columns if col not in non_feature_cols]

In [None]:
# x_train = df_all[df_all['data_type']==1][x_col]
# y_train = df_all[df_all['data_type']==1][y_col]

# x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size = 0.3, random_state = 1)

In [None]:
# train_dm = xgb.DMatrix(data=x_train, label=y_train)
# valid_dm = xgb.DMatrix(data=x_valid, label=y_valid)

In [None]:
# Training parameters passed to xgb.train.
params = {
    'booster': 'gbtree',

    # Binary classification; predictions are probabilities.
    # The key must be 'objective' with the value 'binary:logistic'. The former
    # entry 'binary': 'logistic' is not an xgboost parameter, so the objective
    # was never set and training used xgboost's default objective instead.
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    # 'eval_metric': 'rmse',

    'max_depth': 7,
    'min_child_weight': 100,
    'gamma': 0,
    'scale_pos_weight': 7,

    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eta': 0.1,

    'random_state': 7,
    # 'verbosity' replaces the deprecated 'silent' flag; 1 prints warnings only.
    'verbosity': 1,
}


# evallist = [(train_dm,'train'), (valid_dm,'eval')]
# evals_result = {}

In [None]:
# Training rows are those flagged data_type == 1.
x_train = df_all.loc[df_all['data_type'] == 1, x_col]
y_train = df_all.loc[df_all['data_type'] == 1, y_col]
# Features and labels wrapped in the matrix type that xgb.train consumes.
train_dm = xgb.DMatrix(x_train, label=y_train)

In [None]:
# Train for a fixed 300 boosting rounds on the full training matrix.
model = xgb.train(params=params, dtrain=train_dm, num_boost_round=300)

In [None]:
# save model
# The context manager closes the file handle even if pickling fails; the
# previous bare open() call left the handle open.
with open('model_v4.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

## Feature Importance

In [None]:
# Feature-importance chart on a 10 x 14 figure.
plot_features(booster=model, figsize=(10, 14))

## Prediction

In [None]:
# Test rows are those flagged data_type == 2.
x_test = df_all.loc[df_all['data_type'] == 2, x_col]
test_dm = xgb.DMatrix(x_test)

# Model output for every test row, shown as the cell result.
y_pred = model.predict(test_dm)
y_pred

### Submission

In [None]:
# Submission frame: the userid of each test row next to its prediction.
# The frame takes its index from the userid Series; y_pred is assigned by position.
df_y = pd.DataFrame({
    'userid': df_all.loc[df_all['data_type'] == 2, 'userid'],
    'label': y_pred,
})

In [None]:
# Summary statistics of the submission frame before the min-max rescaling below.
df_y.describe()

In [None]:
# Make sure the final output lies within 0~1 (min-max scaling).
# Series.min()/.max() replace the Python builtins, which iterate element by element.
label_min = df_y['label'].min()
label_range = df_y['label'].max() - label_min
if label_range > 0:
    df_y['label'] = (df_y['label'] - label_min) / label_range
else:
    # All predictions are equal: scaling would divide by zero, so only clip
    # the values into [0, 1].
    df_y['label'] = df_y['label'].clip(0, 1)

In [None]:
# Summary statistics of the submission frame after the rescaling cell above.
df_y.describe()

In [None]:
# df_y['label'] = np.where(df_y['label']>1, 1, np.where(df_y['label']<0, 0, df_y['label']))

In [None]:
# Write the submission file without the index column.
df_y.to_csv(path_or_buf='submission4.csv', index=False)