In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

def top_merge(df, n=10, column="predict", merge_column="COUPON_ID_hash"):
    '''
    get top n row

    :param pandas.DataFrame df:
    :param int n:
    :param str column:
    :rtype: pandas.DataFrame
    '''

    return " ".join(df.sort_index(by=column)[-n:][merge_column])

In [2]:
category = ['GENRE_NAME','PREF_MATCH']
continuous = ['PRICE_RATE','VALIDPERIOD','DISPPERIOD']
depth = 5
n_estimators = 10

In [3]:
# import csv
user_df = pd.read_csv("../data/user_list.csv")
train_coupon_df = pd.read_csv("../data/coupon_list_train.csv")
train_visit_df = pd.read_csv("../data/coupon_visit_train.csv")
test_coupon_df = pd.read_csv("../data/coupon_list_test.csv")

In [4]:
# create train_df
train_df = pd.merge(train_visit_df, train_coupon_df,
                    left_on="VIEW_COUPON_ID_hash", right_on="COUPON_ID_hash")
train_df = pd.merge(train_df, user_df,
                    left_on="USER_ID_hash", right_on="USER_ID_hash")

In [5]:
del train_visit_df
del train_coupon_df

In [6]:
# create test_df
test_coupon_df["cross"] = 1
user_df["cross"] = 1
test_df = pd.merge(test_coupon_df, user_df, on="cross")

In [7]:
del test_coupon_df
del user_df

# GENRE_NAME, DISCOUNT_PRICE, KEN_NAME, AGE, SEX_ID

In [8]:
train_df.columns

Index(['PURCHASE_FLG', 'I_DATE', 'PAGE_SERIAL', 'REFERRER_hash',
       'VIEW_COUPON_ID_hash', 'USER_ID_hash', 'SESSION_ID_hash',
       'PURCHASEID_hash', 'Unnamed: 0_x', 'CAPSULE_TEXT', 'GENRE_NAME',
       'PRICE_RATE', 'CATALOG_PRICE', 'DISCOUNT_PRICE', 'DISPFROM', 'DISPEND',
       'DISPPERIOD', 'VALIDFROM', 'VALIDEND', 'VALIDPERIOD', 'USABLE_DATE_MON',
       'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU',
       'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN',
       'USABLE_DATE_HOLIDAY', 'USABLE_DATE_BEFORE_HOLIDAY', 'large_area_name',
       'ken_name', 'small_area_name', 'COUPON_ID_hash', 'Unnamed: 0_y',
       'REG_DATE', 'SEX_ID', 'AGE', 'WITHDRAW_DATE', 'PREF_NAME'],
      dtype='object')

In [9]:
test_df.columns

Index(['Unnamed: 0_x', 'CAPSULE_TEXT', 'GENRE_NAME', 'PRICE_RATE',
       'CATALOG_PRICE', 'DISCOUNT_PRICE', 'DISPFROM', 'DISPEND', 'DISPPERIOD',
       'VALIDFROM', 'VALIDEND', 'VALIDPERIOD', 'USABLE_DATE_MON',
       'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU',
       'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN',
       'USABLE_DATE_HOLIDAY', 'USABLE_DATE_BEFORE_HOLIDAY', 'large_area_name',
       'ken_name', 'small_area_name', 'COUPON_ID_hash', 'cross',
       'Unnamed: 0_y', 'REG_DATE', 'SEX_ID', 'AGE', 'WITHDRAW_DATE',
       'PREF_NAME', 'USER_ID_hash'],
      dtype='object')

In [10]:
# length = len(train_df)
# a = train_df['GENRE_NAME']
# b = train_df['DISCOUNT_PRICE'].values.reshape(length,1)
# c = train_df['ken_name']
# d = train_df['SEX_ID']

In [11]:
# length = len(test_df)
# A = test_df['GENRE_NAME']
# B = test_df['DISCOUNT_PRICE'].values.reshape(length, 1)
# C = test_df['ken_name']
# D = test_df['SEX_ID']

In [12]:
def category_to_ohe(train_col, test_col):
    le = LabelEncoder()
    le.fit(train_col)
    
    labeled_train_col = le.transform(train_col)
    labeled_train_col = labeled_train_col.reshape(len(labeled_train_col),1)
    
    labeled_test_col = le.transform(test_col)
    labeled_test_col = labeled_test_col.reshape(len(labeled_test_col),1)
    
    ohe = OneHotEncoder()
    ohe.fit(labeled_train_col)
    
    ohe_train_col = ohe.transform(labeled_train_col).toarray()
    ohe_test_col = ohe.transform(labeled_test_col).toarray()
    
    return ohe_train_col, ohe_test_col

In [13]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
train_df['VALIDPERIOD'] = imp.fit_transform(train_df['VALIDPERIOD'].values.reshape(len(train_df),1))
test_df['VALIDPERIOD'] = imp.fit_transform(test_df['VALIDPERIOD'].values.reshape(len(test_df),1))

In [14]:
# usable_date = ['USABLE_DATE_MON', 'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU',
#                'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN', 'USABLE_DATE_HOLIDAY']

# for each in usable_date:
#     tmp = train_df[each]
#     tmp[tmp==2] = 1
#     tmp[tmp.isnull()] = 1

#     tmp = test_df[each]
#     tmp[tmp==2] = 1
#     tmp[tmp.isnull()] = 1

In [15]:
index = train_df['PREF_NAME'] == train_df['ken_name']
train_df['PREF_MATCH'] = index
train_df['PREF_MATCH'] = train_df['PREF_MATCH'].apply(lambda x: 1 if x == True else 0)

In [16]:
index = test_df['PREF_NAME'] == test_df['ken_name']
test_df['PREF_MATCH'] = index
test_df['PREF_MATCH'] = test_df['PREF_MATCH'].apply(lambda x: 1 if x == True else 0)

In [17]:
train_cols, test_cols = [], []

In [18]:
# train_df['twenty'] = train_df['AGE'].apply(lambda age: 1 if 20<=age<30 else 0)
# train_df['thirty'] = train_df['AGE'].apply(lambda age: 1 if 30<=age<40 else 0)
# train_df['forty'] = train_df['AGE'].apply(lambda age: 1 if 40<=age<50 else 0)
# train_df['fifty'] = train_df['AGE'].apply(lambda age: 1 if 50<=age<60 else 0)

# test_df['twenty'] = test_df['AGE'].apply(lambda age: 1 if 20<=age<30 else 0)
# test_df['thirty'] = test_df['AGE'].apply(lambda age: 1 if 30<=age<40 else 0)
# test_df['forty'] = test_df['AGE'].apply(lambda age: 1 if 40<=age<50 else 0)
# test_df['fifty'] = test_df['AGE'].apply(lambda age: 1 if 50<=age<60 else 0)

In [19]:
for cat in category:
    train_tok, test_tok = category_to_ohe(train_df[cat],test_df[cat])
    train_cols.append(train_tok)
    test_cols.append(test_tok)
    print(cat,' finished !')

GENRE_NAME  finished !
PREF_MATCH  finished !


In [20]:
for con in continuous:
    train_cols.append(train_df[con].values.reshape(len(train_df),1))
    test_cols.append(test_df[con].values.reshape(len(test_df),1))

In [21]:
X_train = np.hstack(tuple(each for each in train_cols))
X_test = np.hstack(tuple(each for each in test_cols))
y_train = train_df['PURCHASE_FLG']

In [22]:
del train_cols
del test_cols

In [23]:
del train_df

In [24]:
clf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimators)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
from datetime import datetime
# predict test data
predict_proba = clf.predict_proba(X_test)

# 클래스가 1이면 산다라고 분류a
# 그러니까 아래 코드는 사는 놈들의 스코어를 위한 인덱스
pos_idx = np.where(clf.classes_ == True)[0][0]

now = datetime.now()
cur_time = now.strftime('%Y-%m-%d %H-%M-%S')

test_df["predict"] = predict_proba[:, pos_idx]
top10_coupon = test_df.groupby("USER_ID_hash").apply(top_merge)
top10_coupon.name = "PURCHASED_COUPONS"
top10_coupon.to_csv("submission/random_forest"+ cur_time +".csv", header=True)



In [26]:
predict_proba = clf.predict_proba(X_test)
test_df['predict'] = predict_proba[:, pos_idx]

In [27]:
# predict_proba[:,pos_idx]

In [28]:
# test_df.sort_index(by='predict')[-10:]

In [29]:
# test_df.sort_index(by='predict')

In [30]:
clf.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
y_train.values

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [32]:
len(y_train.values == clf.predict(X_train))/len(y_train)

1.0

In [33]:
y_train.values

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [34]:
clf.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [35]:
a = y_train.values == clf.predict(X_train)

In [36]:
len(a[a==True])

2394817

In [37]:
len(a[a==True]) / len(y_train.values)

0.9513790289710099

In [38]:
from sklearn.metrics import *

confusion_matrix(y_train.values, clf.predict(X_train))

array([[2394817,       0],
       [ 122389,       0]], dtype=int64)

In [39]:
print(classification_report(y_train.values, clf.predict(X_train)))

             precision    recall  f1-score   support

          0       0.95      1.00      0.98   2394817
          1       0.00      0.00      0.00    122389

avg / total       0.91      0.95      0.93   2517206



  'precision', 'predicted', average, warn_for)
