In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

class Get_Price_Rate(BaseEstimator, TransformerMixin):
    '''
    get price rate
    '''

    def get_feature_names(self):

        return [self.__class__.__name__]

    def fit(self, date_frame, y=None):
        '''
        fit

        :param pandas.DataFrame: all data
        :rtype: Get_Price_Rate
        '''

        return self

    def transform(self, date_frame):
        '''
        transform

        :param pandas.DataFrame: all data
        :rtype: array
        '''

        return date_frame["PRICE_RATE"].as_matrix()[None].T.astype(np.float)


class Get_Match_Pref(BaseEstimator, TransformerMixin):
    '''
    get user pref is match coupon area
    '''

    def get_feature_names(self):

        return [self.__class__.__name__]

    def fit(self, date_frame, y=None):
        '''
        fit

        :param pandas.DataFrame: all data
        :rtype: Get_Price_Rate
        '''

        return self

    def transform(self, date_frame):
        '''
        transform

        :param pandas.DataFrame: all data
        :rtype: array
        '''
        res_sr = date_frame["PREF_NAME"] == date_frame["ken_name"]

        return res_sr.as_matrix()[None].T.astype(np.float)


def top_merge(df, n=10, column="predict", merge_column="COUPON_ID_hash"):
    '''
    get top n row

    :param pandas.DataFrame df:
    :param int n:
    :param str column:
    :rtype: pandas.DataFrame
    '''

    return " ".join(df.sort_index(by=column)[-n:][merge_column])

feature_list = [
    ('PRICE_RATE', Get_Price_Rate()),
    ('MATCH_PREF', Get_Match_Pref()),
]

In [2]:
# import csv
user_df = pd.read_csv("../data/user_list.csv")
train_coupon_df = pd.read_csv("../data/coupon_list_train.csv")
train_visit_df = pd.read_csv("../data/coupon_visit_train.csv")
test_coupon_df = pd.read_csv("../data/coupon_list_test.csv")

In [3]:
# create train_df
train_df = pd.merge(train_visit_df, train_coupon_df,
                    left_on="VIEW_COUPON_ID_hash", right_on="COUPON_ID_hash")
train_df = pd.merge(train_df, user_df,
                    left_on="USER_ID_hash", right_on="USER_ID_hash")

In [4]:
%whos

Variable             Type         Data/Info
-------------------------------------------
BaseEstimator        type         <class 'sklearn.base.BaseEstimator'>
FeatureUnion         ABCMeta      <class 'sklearn.pipeline.FeatureUnion'>
Get_Match_Pref       type         <class '__main__.Get_Match_Pref'>
Get_Price_Rate       type         <class '__main__.Get_Price_Rate'>
LabelEncoder         type         <class 'sklearn.preproces<...>sing.label.LabelEncoder'>
LogisticRegression   type         <class 'sklearn.linear_mo<...>stic.LogisticRegression'>
OneHotEncoder        type         <class 'sklearn.preproces<...>sing.data.OneHotEncoder'>
TransformerMixin     type         <class 'sklearn.base.TransformerMixin'>
autopep8             module       <module 'autopep8' from '<...>e-packages\\autopep8.py'>
feature_list         list         n=2
np                   module       <module 'numpy' from 'D:\<...>ges\\numpy\\__init__.py'>
os                   module       <module 'os' from 'D:\\anaconda\\li

In [5]:
del train_visit_df
del train_coupon_df

# GENRE_NAME, DISCOUNT_PRICE, KEN_NAME, AGE, SEX_ID

In [35]:
length = len(train_df)

In [7]:
a = train_df['GENRE_NAME'].values.reshape(length, 1)
B = train_df['DISCOUNT_PRICE'].values.reshape(length, 1)
c = train_df['ken_name'].values.reshape(length, 1)
d = train_df['SEX_ID'].values.reshape(length, 1)
y_train = train_df['PURCHASE_FLG'].values.reshape(length, 1)

In [50]:
a = train_df['GENRE_NAME']
b = train_df['DISCOUNT_PRICE'].values.reshape(length,1)
c = train_df['ken_name']
d = train_df['SEX_ID']
y_train = train_df['PURCHASE_FLG']

In [9]:
le = [LabelEncoder() for _ in range(0,4)]

In [10]:
ohe = [OneHotEncoder() for _ in range(0,4)]

In [72]:
tmp = LabelEncoder()

In [73]:
tmp.fit(a)

LabelEncoder()

In [81]:
wow = tmp.transform(a)
wow = wow.reshape(len(wow),1)

In [77]:
aa = test_df['GENRE_NAME']

In [90]:
wow1 = tmp.transform(aa)
wow1 = wow1.reshape(len(wow1),1)

In [82]:
tmp1 = OneHotEncoder(n_values=13)

In [84]:
tmp1.fit(wow)

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values=13, sparse=True)

In [87]:
tmp1.transform(wow).toarray()[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [92]:
tmp1.transform(wow1).toarray()[0]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [11]:
def category_to_ohe(column,le,ohe):
    '''
    Converting category column into one-hot-encoded for X_train
    '''
    le.fit(column)
    labled = le.transform(column).reshape(length,1)
    
    ohe.fit(labled)
    
    return ohe.transform(labled).toarray()

In [17]:
converted = [category_to_ohe(a,le[0],ohe[0]), b , category_to_ohe(c,le[2],ohe[2]), category_to_ohe(d,le[3],ohe[3])]

In [18]:
len(converted[0][0])

13

In [19]:
len(converted[2][0])

47

In [20]:
len(converted[3][0])

2

In [21]:
X_train = np.hstack((converted[0], converted[1], converted[2], converted[3]))

In [22]:
del converted

In [23]:
# fit model
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
# create test_df
test_coupon_df["cross"] = 1
user_df["cross"] = 1
test_df = pd.merge(test_coupon_df, user_df, on="cross")

In [25]:
del test_coupon_df
del user_df

In [26]:
length = len(test_df)
a = test_df['GENRE_NAME']
b = test_df['DISCOUNT_PRICE'].values.reshape(length,1)
c = test_df['ken_name']
d = test_df['SEX_ID']

In [27]:
converted = [category_to_ohe(a,le[0],ohe[0]), b , category_to_ohe(c,le[2],ohe[2]), category_to_ohe(d,le[3],ohe[3])]

In [28]:
len(converted[0][0])

12

In [29]:
len(converted[2][0])

34

In [30]:
len(converted[3][0])

2

In [31]:
X_test = np.hstack((converted[0], converted[1], converted[2], converted[3]))

In [34]:
# predict test data
predict_proba = clf.predict_proba(X_test)

# 클래스가 1이면 산다라고 분류
# 그러니까 아래 코드는 사는 놈들의 스코어를 위한 인덱스
pos_idx = np.where(clf.classes_ == True)[0][0]

test_df["predict"] = predict_proba[:, pos_idx]
top10_coupon = test_df.groupby("USER_ID_hash").apply(top_merge)
top10_coupon.name = "PURCHASED_COUPONS"
top10_coupon.to_csv("submission.csv", header=True)

ValueError: X has 49 features per sample; expecting 63

In [36]:
predict_proba

array([[0.96064285, 0.03935715],
       [0.96064285, 0.03935715],
       [0.96064285, 0.03935715],
       ...,
       [0.9624751 , 0.0375249 ],
       [0.9624751 , 0.0375249 ],
       [0.9624751 , 0.0375249 ]])

In [34]:
predict_proba[:,pos_idx]

array([0.03935715, 0.03935715, 0.03935715, ..., 0.0375249 , 0.0375249 ,
       0.0375249 ])

In [32]:
test_df.sort_index(by='predict')

  """Entry point for launching an IPython kernel.


Unnamed: 0,Unnamed: 0_x,CAPSULE_TEXT,GENRE_NAME,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPEND,DISPPERIOD,VALIDFROM,...,COUPON_ID_hash,cross,Unnamed: 0_y,REG_DATE,SEX_ID,AGE,WITHDRAW_DATE,PREF_NAME,USER_ID_hash,predict
7090629,309,Hotel,Hotel and Japanese hotel,50,28000,14000,2012-06-29 12:00:00,2012-07-03 12:00:00,4,2012-07-09,...,f9c657ce7ca80b3766ced3a9a3c709bb,1,22872,2011-02-24 15:43:18,f,38,,Saitama Prefecture,280f0cedda5c4b171ee6245889659571,0.037525
4014015,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11240,2011-05-11 06:38:34,m,71,,Fukuoka Prefecture,d472b35172ea017a27727877eed56e77,0.037525
4014016,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11241,2011-05-17 23:38:30,f,25,,Aichi Prefecture,8202edc429ee5e2289cc089e7208e150,0.037525
4014017,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11242,2010-11-10 12:50:26,m,56,,Fukuoka Prefecture,9090c7fb6426630c92daa06b6b7b2386,0.037525
4014018,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11243,2011-06-28 21:11:56,f,34,,Fukuoka Prefecture,623ea4c917f7513f29b75e93f4141cb6,0.037525
4014020,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11245,2012-03-31 23:06:39,f,56,,Aichi Prefecture,8f11819752d3200442f5a73fe830a107,0.037525
4014021,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11246,2012-04-07 09:23:35,f,52,,Kanagawa Prefecture,287699c5ff9db69ecc268d0386dc5092,0.037525
4014022,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11247,2012-02-22 13:48:28,m,50,,,b4ac740acbcba0f52ed6af7a97bdda2a,0.037525
4014023,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11248,2011-08-11 17:01:02,f,57,,Kanagawa Prefecture,b4e670e2c832fb50f1883e857844e22f,0.037525
4014024,175,Japanese hotel,Hotel and Japanese hotel,50,14700,7350,2012-06-27 12:00:00,2012-07-01 12:00:00,4,2012-07-02,...,0fb2dab415369349676dc3d3e3fba003,1,11249,2011-06-17 01:47:10,m,28,,,479803a1b36cd8eb80e586b5b0bceba3,0.037525


In [35]:
test_df.sort_index(by='predict')[-10:]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Unnamed: 0_x,CAPSULE_TEXT,GENRE_NAME,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPEND,DISPPERIOD,VALIDFROM,...,COUPON_ID_hash,cross,Unnamed: 0_y,REG_DATE,SEX_ID,AGE,WITHDRAW_DATE,PREF_NAME,USER_ID_hash,predict
6971633,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,18241,2011-05-23 13:15:30,f,47,,Tokyo,a3442aab65892f1818a5afe309aea9ac,0.114026
6974208,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,20816,2010-11-11 12:57:20,m,38,,Tokyo,549b6bcbd5f28374a1e9a156caa3ce8c,0.114026
6965112,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,11720,2011-06-12 11:28:12,m,65,,Tokyo,d42f23905177ff3cc4f91b6dabdd209b,0.114026
6965111,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,11719,2011-12-03 18:40:40,m,55,,Tokyo,e805ac9cc11c9c6130c8c7228ed84d73,0.114026
6974210,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,20818,2012-05-17 13:09:13,m,54,,Tokyo,585d09629720fb89bec20b3a33d264b2,0.114026
6974211,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,20819,2011-08-30 22:31:55,m,25,,Tokyo,c83701e5a75d26380a70d677602fabff,0.114026
6955177,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,1785,2011-04-18 17:01:47,m,42,,Tokyo,a40489eccddb8d41440939d58339e472,0.114026
6959531,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,6139,2011-12-04 21:25:45,f,30,,Tokyo,e9ac30d5da540db64d199146dc6d3dd3,0.114026
6959528,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,6136,2010-10-23 19:08:40,m,42,,Tokyo,413246b8fdbcda8873650c0416ba80c8,0.114026
6964304,304,Other,Other coupon,95,2000,100,2012-06-28 12:00:00,2012-07-03 12:00:00,5,2012-07-05,...,c988d799bc7db9254fe865ee6cf2d4ff,1,10912,2010-12-13 09:48:34,m,25,,Tokyo,90db60b8fd8d51642a56d0d8d2848acd,0.114026
