In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
import datetime as dt

import warnings

warnings.filterwarnings('ignore')

In [2]:
# 테이블 현황 파악을 위한 함수 
def summary_table(table):
    df = pd.DataFrame()
    for i in table.columns:
        name = i
        dtype = table[i].dtype.name
        null = table[i].isnull().sum()
        act = table.shape[0] - null
        unique = len(table[i].unique())
        data = {'name': name, 'dtype': dtype, 'null': null, 'act': act, 'unique': unique}
        df = df.append(data, ignore_index=True)
    return df

### 데이터 불러오기: 일본어는 영어로 번역, 지역에 Prefecture가 붙은 지명은 Prefecture 제외

In [3]:
# train 
detail_train = pd.read_csv('coupon_data_project2/coupon_detail_train_translated_en.csv', 
                           parse_dates=['I_DATE'])  # 쿠폰 구매 내역

visit_train = pd.read_csv('coupon_data_project2/coupon_visit_train.csv',
                          parse_dates=['I_DATE']) # 쿠폰 조회 내역

area_train = pd.read_csv('coupon_data_project2/coupon_area_train_translated_en.csv') # 쿠폰 사용 가능 지역
coupon_list_train = pd.read_csv('coupon_data_project2/coupon_list_train_translated_en.csv', 
                                parse_dates=['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND']) # 쿠폰 리스트(train)

# base data
location = pd.read_csv('coupon_data_project2/train_location.csv') # 지역 위치 정보(위도/경도)
user_list = pd.read_csv('coupon_data_project2/user_list_translated_en.csv', 
                        parse_dates=['WITHDRAW_DATE', 'REG_DATE']) # user 정보

# test data
area_test = pd.read_csv('coupon_data_project2/test_location.csv') # 쿠폰 사용 가능 지역
coupon_list_test = pd.read_csv('coupon_data_project2/coupon_list_test_translated_en.csv',
                               parse_dates=['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND']) # 쿠폰 리스트

# submisiion
submission = pd.read_csv('coupon_data_project2/sample_submission.csv') # 제출 자료

--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------
## A. Preprocessing
--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------


### A-1. detail_train
--------------------------------------------------------------------------------------------------------

In [4]:
summary_table(detail_train).pivot_table(index = ['dtype', 'name'])

Unnamed: 0_level_0,Unnamed: 1_level_0,act,null,unique
dtype,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
datetime64[ns],I_DATE,168996.0,0.0,130309.0
int64,ITEM_COUNT,168996.0,0.0,32.0
object,COUPON_ID_hash,168996.0,0.0,19368.0
object,PURCHASEID_hash,168996.0,0.0,168996.0
object,SMALL_AREA_NAME,168996.0,0.0,55.0
object,USER_ID_hash,168996.0,0.0,22782.0


#### 1) 신규 columns 생성

1-1) merge 후 구매 구분을 위한 PURCHASE_FLG

In [None]:
detail_train['PURCHASE_FLG'] = 1

#### 2) column명 변경

2-1) I_DATE -> purchase_date: merge 후 구매일자 구분을 위함

2-2) SMALL_AREA_NAME: coupon list의 지역(판매 spot)과 구분하기 위함

In [None]:
detail_train.rename(columns = {'I_DATE': 'purchase_date'}, inplace=True)
detail_train.rename(columns = {'SMALL_AREA_NAME': 'resid_small'}, inplace=True)

#### 3) drop: ITEM_COUNT는 활용여부 판단후 처리

In [None]:
detail_train.drop(labels = ['ITEM_COUNT'], axis=1, inplace=True)
detail_train.drop(labels = ['PURCHASEID_hash'], axis=1, inplace=True)
detail_train.drop(labels = ['resid_small'], axis=1, inplace=True)
# detail_train.drop(labels = ['purchase_date'], axis=1, inplace=True)

### A-2. visit_train
--------------------------------------------------------------------------------------------------------

In [None]:
summary_table(visit_train).pivot_table(index = ['dtype', 'name'])

#### 1) 신규 column 생성

In [None]:
visit_train['VIEW'] = 1

#### 2) column명 변경

In [None]:
visit_train.rename(columns = {'I_DATE': 'VIEW_DATE'}, inplace=True)
visit_train.rename(columns = {'VIEW_COUPON_ID_hash': 'COUPON_ID_hash'}, inplace=True)

#### 3) drop

In [None]:
visit_train.drop(labels = ['PAGE_SERIAL'], axis=1, inplace=True) # PAGE_SERIAL에 따라 의미가 있으나 test set에 반영 불가
visit_train.drop(labels = ['REFERRER_hash'], axis=1, inplace=True) 
visit_train.drop(labels = ['SESSION_ID_hash'], axis=1, inplace=True) 
visit_train.drop(labels = ['PURCHASEID_hash'], axis=1, inplace=True)

In [None]:
visit_train[:2]

### A-3. Coupon_list
--------------------------------------------------------------------------------------------------------

In [None]:
summary_table(coupon_list_train).pivot_table(index = ['dtype', 'name'])

#### 1) 전처리 일관성을 유지를 위한 coupon_list merge(311~ train임)

In [None]:
coupon_list = pd.merge(coupon_list_test, coupon_list_train, how='outer')

In [None]:
coupon_list_test.shape, coupon_list_train.shape, coupon_list.shape

#### 2) 신규 columns 생성

In [None]:
# 캡슐과 장르 통합 및 명칭 변경
coupon_list['Case'] = coupon_list['CAPSULE_TEXT'] + coupon_list['GENRE_NAME']
coupon_list['Case'] = coupon_list['Case'].apply(lambda x: "HOTEL" if x == 'Guest houseHotel and Japanese hotel' 
                          or x == 'HotelHotel and Japanese hotel'
                          or x == 'Japanese hotelHotel and Japanese hotel'
                          or x == 'Japanse guest houseHotel and Japanese hotel'
                          or x == 'LodgeHotel and Japanese hotel'
                          or x == 'Public hotelHotel and Japanese hotel'
                          or x == 'Resort innHotel and Japanese hotel'
                          or x == 'Vacation rentalHotel and Japanese hotel'
                          else "NAIL" if x == 'Nail and eye salonNail and eye salon'
                          else "HAIR" if x == 'Hair salonHair salon'
                          else "FOOD" if x == 'FoodFood'  # FOOD
                          else "SPA" if x == 'SpaSpa'  # SPA
                          else "BEAUTY" if x == 'BeautyBeauty'
                          else "CLASS" if x == 'ClassLesson'
                          else "CORRESPONDENCE" if x == 'Correspondence courseLessonClassLesson'
                          else "DELIVERY" if x == 'Delivery serviceDelivery service'
                          else "EVENT" if x == 'EventOther coupon'
                          else "GIFT" if x == 'Gift cardGift card'
                          else "HEALTH" if x == 'Health and medicalHealth and medical'
                          else "LEISURE" if x == 'LeisureLeisure'  # LEISURE
                          else "LESSON" if x == 'LessonLesson'
                          else "OTHER" if x == 'OtherOther coupon'
                          else "RELAXATION" if x == 'RelaxationRelaxation'
                          else "WEB" if x == 'Web serviceOther coupon'
                          else 'OTHER'
                          )

In [None]:
# 실판매가 게산
coupon_list['Price'] = coupon_list['CATALOG_PRICE'] + coupon_list['DISCOUNT_PRICE']

In [None]:
# 실판매가 정규화
coupon_list["lnDPRICE"] = np.log1p(coupon_list["Price"])
coupon_list["mDPRICE"] = coupon_list.groupby("Case")["lnDPRICE"].transform(np.mean)
coupon_list["sDPRICE"] = coupon_list.groupby("Case")["lnDPRICE"].transform(np.std)
coupon_list["zprice"] = (coupon_list["lnDPRICE"] - coupon_list["mDPRICE"]) / coupon_list["sDPRICE"]

#### 3) column명 변경

In [None]:
# 지역명 변경
coupon_list.rename(columns = {"LARGE_AREA_NAME": "spot_large", 
                              "ken_name": "spot_pref", 
                              "SMALL_AREA_NAME": "spot_small"}, inplace=True)

#### 4) Null 값 및 오류 처리

In [None]:
# usable: nan -> 1, 2 -> 0
coupon_list['USABLE_DATE_MON'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_TUE'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_WED'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_THU'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_FRI'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_SAT'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_SUN'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_HOLIDAY'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_BEFORE_HOLIDAY'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)

#### 5) drop

In [None]:
# coupon_list.drop(labels = ['CAPSULE_TEXT'], axis=1, inplace=True )
# coupon_list.drop(labels = ['GENRE_NAME'], axis=1, inplace=True )
coupon_list.drop(labels = ['CATALOG_PRICE'], axis=1, inplace=True )
# coupon_list.drop(labels = ['DISCOUNT_PRICE'], axis=1, inplace=True )
# coupon_list.drop(labels = ['DISPFROM'], axis=1, inplace=True )
coupon_list.drop(labels = ['DISPEND'], axis=1, inplace=True )
coupon_list.drop(labels = ['VALIDFROM'], axis=1, inplace=True )
coupon_list.drop(labels = ['VALIDEND'], axis=1, inplace=True )
coupon_list.drop(labels = ['lnDPRICE'], axis=1, inplace=True )
coupon_list.drop(labels = ['mDPRICE'], axis=1, inplace=True )
coupon_list.drop(labels = ['sDPRICE'], axis=1, inplace=True )
# coupon_list.drop(labels = ['Price'], axis=1, inplace=True )
# coupon_list.drop(labels = ['spot_pref'], axis=1, inplace=True )  # 판단이슈 
coupon_list.drop(labels = ['spot_small'], axis=1, inplace=True ) # 판단이슈 
coupon_list.drop(labels = ['spot_large'], axis=1, inplace=True ) # 판단이슈 

#### 6) train & test set 분리

In [None]:
# train set과 test set을 다시 분리
coupon_list_train = coupon_list[311:]
coupon_list_test = coupon_list[:310]

### A4. User_list
--------------------------------------------------------------------------------------------------------

In [None]:
summary_table(user_list).pivot_table(index = ['dtype', 'name'])

#### 1) column명 변경

In [None]:
user_list.rename(columns = {'PREF_NAME': 'user_pref'}, inplace=True)

#### 2) SEX_ID 0,1 로 변경(f: 0, m: 1)

In [None]:
# SEX_ID: f는 0으로 M은 1로
user_list['SEX_ID'] = user_list['SEX_ID'].apply(lambda x: 0 if x == 'f' else 1) 

#### 3) drop

In [None]:
# REG_DATE , WITHDRAW_DATE 삭제
user_list.drop(labels = ['REG_DATE'], axis=1, inplace=True)
user_list.drop(labels = ['WITHDRAW_DATE'], axis=1, inplace=True)
# user_list.drop(labels = ['user_pref'], axis=1, inplace=True)  # 판단 이슈


In [None]:
user_list[:2]

### A5. train set 구성
--------------------------------------------------------------------------------------------------------

#### 1) visit_train & detail_train -> train

In [None]:
train= pd.merge(visit_train, detail_train, how='outer')

In [None]:
train.shape

#### 2) train & coupon_list

In [None]:
train = pd.merge(train, coupon_list, how='left', on='COUPON_ID_hash')

In [None]:
train.shape 

#### 3) train & user_list

In [None]:
train = pd.merge(train, user_list, how='left', on='USER_ID_hash')

In [None]:
train.shape

#### 4) train & location

In [None]:
train['key'] = train['COUPON_ID_hash']+ train['USER_ID_hash']

In [None]:
location['key'] = location['COUPON_ID_hash'] + location['USER_ID_hash'] 
location.drop_duplicates(['key'], inplace=True)

In [None]:
train.insert(2, 'distance', train['key'].map(location.set_index('key')['distance']))
train.insert(2, 'PREF_in', train['key'].map(location.set_index('key')['PREF_in']))

In [None]:
train.drop(labels=['key'], axis=1, inplace=True)

In [None]:
train[:2]

In [None]:
aa = train[['PREF_in','distance','user_pref','spot_pref']]

In [None]:
aa[~aa['distance'].isnull()][aa['spot_pref'].isnull()]

#### 4) train 현황 점검 및 NaN값 처리

4-1) null 이 315,301개인 것들은 기초정보(coupon_list(test 포함)에 없는 것들이므로 제외 -> zprice 기준으로 처리

4-2) VALIDPERIOD(null: 773,492)은 무제한이라는 의미에서 10,000으로 처리

4-3) user_pref(null: 488,972) 을 NN 으로 처리

In [None]:
# null 이 315,301개인 것들은 기초정보(coupon_list(test 포함)에 없는 것들이므로 제외 -> zprice 기준으로 처리
train = train[train['zprice'] >= -100000]

In [None]:
train['VALIDPERIOD'] = train['VALIDPERIOD'].fillna(180)
train['user_pref'] = train['user_pref'].fillna('NN')
train['VIEW'] = train['VIEW'].fillna(0)

In [None]:
train['distance'] = train['distance'].fillna(train['distance'].mean())

In [None]:
summary_table(train).sort_values(by='unique', ascending = True)

## EDA

In [None]:
# test 데이터 생성 -> EDA 과정에서 추가되는 편수를 같이 적용해주기 위해 test 를 먼저 불러왔음
coupon_list_test['A'] = 1
user_list['A'] = 1
test = pd.merge(coupon_list_test, user_list, how='outer')

##### user_pref & spot_pref가 같은 경우 view 대비 buy 확률이 높다

In [None]:
df = train.copy()

In [None]:
df = df.pivot_table(index = ['Case', 'user_pref', 'spot_pref'], values = ['VIEW', 'PURCHASE_FLG'], aggfunc='sum').reset_index()

In [None]:
df['same_area'] = np.where(df['user_pref'] == df['spot_pref'], 1, 0)

In [None]:
df = df.pivot_table(index = 'same_area', values = ['VIEW', 'PURCHASE_FLG'], aggfunc=np.sum)

In [None]:
df['%buy'] = df['PURCHASE_FLG'] / df['VIEW'] * 100
df

In [None]:
def same_area(user, spot):
    if user == spot : return 1
    else: return 0

In [None]:
# train set에 same_area 추가
train['same_area'] = list(map(same_area, train.user_pref, train.spot_pref))


In [None]:
# test set에 same_arae 추가
test['same_area'] = list(map(same_area, test.user_pref, test.spot_pref))

In [None]:
list(df.columns)[:50]

In [None]:
df[['PREF_in','distance','user_pref','spot_pref']][[df['distance']==237.740409].index]

In [None]:
table_pref_in = pd.pivot_table(df, values='USER_ID_hash', index=['PREF_in','PURCHASE_FLG'],aggfunc=np.size)
table_pref_in.rename(columns={'PURCHASE_FLG', 'p'})

In [None]:
table_pref_in.rename()

#### case별 판매현황

In [None]:
df= train.copy()

In [None]:
df_Case = df.pivot_table(index = ['USER_ID_hash'], values = ['PURCHASE_FLG'],
                         columns = ['Case'], aggfunc = np.mean)

In [None]:
df_Case
df_Case.columns = [c[1] if c[1] else c[0] for c in df_Case.columns.tolist()]

In [None]:
df_Case = df_Case.reset_index()
df_Case.fillna(0, inplace=True)
df_Case[:2]  

In [None]:
df_pref = df.pivot_table(index = ['USER_ID_hash'], values = ['PURCHASE_FLG'],
                         columns = ['user_pref'], aggfunc = np.mean)

In [None]:
df_pref.columns = [c[1] if c[1] else c[0] for c in df_pref.columns.tolist()]

In [None]:
df_pref = df_pref.reset_index()
df_pref.fillna(0, inplace=True)
df_pref[:2]

In [None]:
train.shape

In [None]:
train = pd.merge(train, df_Case, how='left', on='USER_ID_hash')
train = pd.merge(train, df_pref, how='left', on ='USER_ID_hash')

test = pd.merge(test, df_Case, how='left', on='USER_ID_hash')
test = pd.merge(test, df_pref, how='left', on='USER_ID_hash')

# 여기 이후로는 하지 않기

In [None]:
df = train.copy()

In [None]:
df['Test'] = df['VIEW_DATE'] - df['DISPFROM']

In [None]:
df['Test'] = df['Test'].astype('timedelta64[s]')/360

In [None]:
df['Test'] = df['Test'].apply(pd.to_numeric, errors='coerce')


In [None]:
df_pref_disp = df.pivot_table(index = ['USER_ID_hash'], values = ['Test'],
                         columns = ['user_pref'], aggfunc = np.mean)
df_pref_disp.columns = [c[1] if c[1] else c[0] for c in df_pref_disp.columns.tolist()]
df_pref_disp = df_pref_disp.reset_index()
df_pref_disp.fillna(0, inplace=True)

In [None]:
df_Case_disp = df.pivot_table(index = ['USER_ID_hash'], values = ['Test'],
                         columns = ['Case'], aggfunc = np.mean)
df_Case_disp.columns = [c[1] if c[1] else c[0] for c in df_Case_disp.columns.tolist()]
df_Case_disp = df_Case_disp.reset_index()
df_Case_disp.fillna(0, inplace=True)

In [None]:
train = pd.merge(train, df_Case_disp, how='left', on='USER_ID_hash')
train = pd.merge(train, df_pref_disp, how='left', on ='USER_ID_hash')

test = pd.merge(test, df_Case_disp, how='left', on='USER_ID_hash')
test = pd.merge(test, df_pref_disp, how='left', on='USER_ID_hash')

In [None]:
df = train.copy()
df_pref_view = df.pivot_table(index = ['USER_ID_hash'], values = ['VIEW'],
                             columns= ['user_pref'], aggfunc = np.mean) 
df_pref_view.columns = [c[1] if c[1] else c[0] for c in df_pref_view.columns.tolist()]
df_pref_view = df_pref_view.reset_index()
df_pref_view.fillna(0, inplace=True)

In [None]:
df_Case_view = df.pivot_table(index = ['USER_ID_hash'], values = ['VIEW'],
                             columns= ['Case'], aggfunc = np.mean) 
df_Case_view.columns = [c[1] if c[1] else c[0] for c in df_Case_view.columns.tolist()]
df_Case_view = df_Case_view.reset_index()
df_Case_view.fillna(0, inplace=True)

In [None]:
train = pd.merge(train, df_Case_view, how='left', on='USER_ID_hash')
train = pd.merge(train, df_pref_view, how='left', on ='USER_ID_hash')

test = pd.merge(test, df_Case_view, how='left', on='USER_ID_hash')
test = pd.merge(test, df_pref_view, how='left', on='USER_ID_hash')

--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------
## B. 모델링
--------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------

### B1. 데이터 생성
--------------------------------------------------------------------------------------------------------

#### 1) dummy list 생성(train & test의 일관성을 위함)

In [None]:
# ls_dummy = ['Case']
# ls_dummy = ['user_pref', 'spot_small', 'spot_pref', 'spot_large', 'Case']

#### 2) train data dummy 처리

In [None]:
# train = pd.get_dummies(train, columns = ls_dummy)

#### 3) test data 생성 -> 아래 로케이션 부분은 슬랙에서설명한 부분 참조

In [None]:
test

In [None]:
area_test = pd.read_csv('coupon_data_project2/test_location3.csv') # 쿠폰 사용 가능 지역

In [None]:
# area_test 삽입
test['key'] = test['COUPON_ID_hash'] + test['USER_ID_hash'] 
area_test['key'] = area_test['COUPON_ID_hash'] + area_test['USER_ID_hash'] 
test.insert(2, 'distance', test['key'].map(area_test.set_index('key')['distance']))
test.insert(2, 'PREF_in', test['key'].map(area_test.set_index('key')['PREF_in']))
test.drop(labels = ['key'], axis=1, inplace=True)

In [None]:
zz = test[['distance', 'PREF_in','spot_pref', 'user_pref']]

In [None]:
zz.distance.isnull().sum() / len(zz)

In [None]:
zz[zz['user_pref']==zz['spot_pref']]['distance'].sum()

#### 4) test data dummy처리

In [None]:
# test = pd.get_dummies(test, columns = ls_dummy)
test.drop(labels = ['A'], axis=1, inplace=True)

#### 5)  test & train set columns 비교 -> 지역이 문제임. 지역은 개인 판단하에 위에 drop부분에서 삭제해주시길

5-1) PURCHASE_FLG: train의 y값으로 활용될 것임

5-2) VIEW_DATE: 향후 활용 가능성이 있음. 우선은 mod_ls에서 걸러짐.

5-3) VIEW: 향후 활용 가능성 있음(가중치 넣는 식). 우선은 mod_ls에서 걸러짐

In [None]:
compare_not_test = [i for i in train.columns if i not in test.columns]
compare_not_train = [i for i in test.columns if i not in train.columns]
print('only_train: {}  \n'.format(compare_not_test))
print('only_test: {}'.format(compare_not_train))

### B2. train data set
--------------------------------------------------------------------------------------------------------

In [None]:
to_be_removed_train = {'PURCHASE_FLG', 'USER_ID_hash','COUPON_ID_hash', "VIEW", 'purchase_date', 'spot_pref', 
                       'user_pref', 'VIEW_DATE', 'DISPFROM', 'GENRE_NAME', 'CAPSULE_TEXT', 'Case'}
ls_train = [i for i in list(train.columns) if i not in to_be_removed_train]
X_train = train.filter(ls_train)
y_train = train.PURCHASE_FLG

In [None]:
X_train[:2]

### B3. 모델링: xgboost
--------------------------------------------------------------------------------------------------------

In [None]:
# from sklearn import clone
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
import xgboost

#### 1) train 에 활용할 컬럼 선정(test 컬럼과 일치시킴)

In [None]:
mod_ls = [i for i in X_train.columns if i in test.columns]
X_train = train.filter(mod_ls)
y_train = train.PURCHASE_FLG

#### 2) parameter 지정(parameter는 우수사례 벤치마킹, 논리 및 개선여부 검토해봐야함)

In [None]:
model_xgb = xgboost.XGBClassifier(n_estimators=300, max_depth=3,
                                 objective = 'reg:logistic',
                                 subsample= 0.85,
                                 colsample_bytree=0.8,
                                 random_state=12345,
                                 min_child_weight=1,
                                 learning_rate=0.05)

In [None]:
model_xgb = model_xgb.fit(X_train, y_train)

### B5. Predict

#### 1) predict 후 sum을통해 1이 몇개인지 확인 -> 할때마다 0이 나옴 .. 아래 확률로 접근해야 함

In [None]:
# predict
y_pred_xgb = model_xgb.predict(test.filter(mod_ls))
y_pred_xgb.sum()

#### 2) proba를 생성해서 test set과 merge

In [None]:
y_hat_proba = model_xgb.predict_proba(test.filter(mod_ls))
df_y_hat_proba= pd.DataFrame(y_hat_proba, columns=['n','y'])

In [None]:
test_xgb = test.filter(['USER_ID_hash', 'COUPON_ID_hash'])

In [None]:
result_proba_df = pd.concat([test_xgb, pd.DataFrame(df_y_hat_proba)],1)

#### 3) 기준을 잡기 위해 확률의 평균을 확인

In [None]:
result_proba_df.y.mean()

#### 4) 모델 개선 및 현황 파악을 위한 feature importance 점검

In [None]:
importances = model_xgb.feature_importances_
df_imp = pd.DataFrame()
for i, j in zip(mod_ls, list(importances)):
    data = {'columns': i, 'importance': np.round(j*100,1)} 
    df_imp = df_imp.append(data, ignore_index=True)

df_imp.sort_values(by='importance', ascending=False)

#### 5) 확률을 선정 -> 3번의 기준으로 어림잡아 선정 -> 최종 제출시에는 각 유저별 상위 10개로 지정하는게 좋겠음

In [None]:
# result_proba_df2 = result_proba_df[result_proba_df['y']>0.08].sort_values(by='y', ascending=False)
result_proba_df2 = result_proba_df.sort_values(by=['USER_ID_hash','y'], ascending=False)

#### 6) 제출양식에 맞춰 lookup_table을 형성

In [None]:
lookup_table = result_proba_df2.groupby('USER_ID_hash').apply(lambda x: list(x.COUPON_ID_hash)[:10])
lookup_table = pd.DataFrame(lookup_table, columns = ['COUPON_ID_hash']).reset_index()
lookup_table

lookup_table.rename(columns={'COUPON_ID_hash':'PURCHASED_COUPONS'},inplace=True)

#### 7) 매칭 및 양식에 맞춘 마무리 작업

In [None]:
submission.insert(2, 'COUPON', submission['USER_ID_hash'].map(lookup_table.set_index('USER_ID_hash')['PURCHASED_COUPONS']))

In [None]:
submission.drop('PURCHASED_COUPONS', axis=1, inplace=True)
submission.rename(columns={'COUPON':'PURCHASED_COUPONS'}, inplace=True)

submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].astype('str')
submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].apply(lambda x: x.replace('[',''))
submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].apply(lambda x: x.replace(']',''))
submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].apply(lambda x: x.replace("'",''))
submission['PURCHASED_COUPONS'] = submission['PURCHASED_COUPONS'].apply(lambda x: x.replace(',',''))
# submission.drop(labels=['index'], axis=1, inplace=True)

### B7. 검증(그래프 같은것들??)

### B8. submission

In [None]:
submission.to_csv('test_submission_xgb.csv', index=False)

In [None]:
submission


In [None]:
from sklearn.metrics import classification_report



In [None]:
classification_report(y_test, y_pred_xgb)

In [114]:

from scipy import spatial
def cosine_similarity(vector_1, vector_2):

    return 1 - spatial.distance.cosine(vector_1, vector_2)

vector_1 = pd.DataFrame(np.array([[1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5],
                     [6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10],
                                 [1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5],
                     [6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10],
                                 [1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5],
                     [6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10],[1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5],
                     [6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10],
                                 [1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5],
                     [6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10],
                                 [1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5],
                     [6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10]]))
vector_2 = pd.DataFrame(np.array([[45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44],
                     [34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34],
                                 [45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44],
                     [34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34],
                                 [45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44],
                     [34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34],[45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44],
                     [34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34],
                                 [45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44],
                     [34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34],
                                 [45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44],
                     [34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34]]))


In [124]:
len(vector_1)

12

In [123]:
%%time

matrix = []
for idx1 in range(len(vector_1)):
    row = []
    for idx2 in range(len(vector_2)):
        row.append(cosine_similarity(vector_1.iloc[idx1], vector_2.iloc[idx2]))
    matrix.append(row)

CPU times: user 75.6 ms, sys: 1.65 ms, total: 77.3 ms
Wall time: 77.2 ms


In [126]:
144 * 0.02 / 36

0.08

In [128]:
7090630 * 22782 * 0.02 / 36 / 60 / 60

24928.81676851852

In [102]:
%%time

matrix = []
for idx1 in range(len(vector_1)):
    row = []
    for idx2 in range(len(vector_2)):
        row.append(TS_SS(vector_1.iloc[idx1], vector_2.iloc[idx2]))
    matrix.append(row)

CPU times: user 38 ms, sys: 1.76 ms, total: 39.8 ms
Wall time: 38.8 ms


In [19]:
vec1 = np.array([[1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5,1,2,3,4,5],
                     [6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10,6,7,8,9,10]]).flatten()
vec2 = np.array([[45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44,45,23,56,12,44],
                     [34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34,34,34,12,45,34]]).flatten()

import math

def Cosine(vec1, vec2) :
    result = InnerProduct(vec1,vec2) / (VectorSize(vec1) * VectorSize(vec2))
    return result

def VectorSize(vec) :
    return math.sqrt(sum(math.pow(v,2) for v in vec))

def InnerProduct(vec1, vec2) :
    return sum(v1*v2 for v1,v2 in zip(vec1,vec2))

def Euclidean(vec1, vec2) :
    return math.sqrt(sum(math.pow((v1-v2),2) for v1,v2 in zip(vec1, vec2)))

def Theta(vec1, vec2) :
    return math.acos(Cosine(vec1,vec2)) + 10

def Triangle(vec1, vec2) :
    theta = math.radians(Theta(vec1,vec2))
    return (VectorSize(vec1) * VectorSize(vec2) * math.sin(theta)) / 2

def Magnitude_Difference(vec1, vec2) :
    return abs(VectorSize(vec1) - VectorSize(vec2))

def Sector(vec1, vec2) :
    ED = Euclidean(vec1, vec2)
    MD = Magnitude_Difference(vec1, vec2)
    theta = Theta(vec1, vec2)
    return math.pi * math.pow((ED+MD),2) * theta/360

def TS_SS(vec1, vec2) :
    return Triangle(vec1, vec2) * Sector(vec1, vec2)


print(Euclidean(vec1,vec2))
print(Cosine(vec1,vec2))
print(TS_SS(vec1,vec2))



TS_SS(vec1, vec2)

247.04655431719746
0.7966772313651609
27261231.098073907


27261231.098073907