In [1]:
from tqdm import tqdm

In [2]:
from imblearn.combine import *
from imblearn.over_sampling import *
from imblearn.under_sampling import *

In [3]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
import datetime as dt

# import matplotlib as mpl
# mpl.use('Agg')
# import matplotlib.pylab as plt
# from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()
''
import warnings

warnings.filterwarnings('ignore')

In [82]:
# train 
detail_train = pd.read_csv('./coupon_data_project2/coupon_detail_train_translated_en.csv', parse_dates=['I_DATE'])
visit_train = pd.read_csv('./coupon_data_project2/coupon_visit_train.csv', parse_dates=['I_DATE'])

area_train = pd.read_csv('./coupon_data_project2/coupon_area_train_translated_en.csv')
coupon_list_train = pd.read_csv('./coupon_data_project2/coupon_list_train_translated_en.csv', parse_dates=['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND'])

# base data
location = pd.read_csv('./coupon_data_project2/train_location.csv')
user_list = pd.read_csv('./coupon_data_project2/user_list_translated_en.csv', parse_dates=['WITHDRAW_DATE', 'REG_DATE'])

# test data
area_test = pd.read_csv('./coupon_data_project2/test_location.csv')
coupon_list_test = pd.read_csv('./coupon_data_project2/coupon_list_test_translated_en.csv', parse_dates=['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND'])

# submisiion
submission = pd.read_csv('./coupon_data_project2/sample_submission.csv')

In [5]:
# 테이블 현황 파악을 위한 함수 
def summary_table(table):
    df = pd.DataFrame()
    for i in table.columns:
        name = i
        dtype = table[i].dtype.name
        null = table[i].isnull().sum()
        act = table.shape[0] - null
        unique = len(table[i].unique())
        data = {'name': name, 'dtype': dtype, 'null': null, 'act': act, 'unique': unique}
        df = df.append(data, ignore_index=True)
    return df

## detail 만 detail_train으로 저장하기

## coupon list

In [83]:
coupon_list = pd.merge(coupon_list_test, coupon_list_train, how='outer')

In [84]:
coupon_list_test.shape, coupon_list_train.shape, coupon_list.shape

((310, 24), (19413, 24), (19723, 24))

In [85]:
# 캡슐과 장르 통합 및 명칭 변경
coupon_list['Case'] = coupon_list['CAPSULE_TEXT'] + coupon_list['GENRE_NAME']
coupon_list['Case'] = coupon_list['Case'].apply(lambda x: "HOTEL" if x == 'Guest houseHotel and Japanese hotel' 
                          or x == 'HotelHotel and Japanese hotel'
                          or x == 'Japanese hotelHotel and Japanese hotel'
                          or x == 'Japanse guest houseHotel and Japanese hotel'
                          or x == 'LodgeHotel and Japanese hotel'
                          or x == 'Public hotelHotel and Japanese hotel'
                          or x == 'Resort innHotel and Japanese hotel'
                          or x == 'Vacation rentalHotel and Japanese hotel'
                          else "NAIL" if x == 'Nail and eye salonNail and eye salon'
                          else "HAIR" if x == 'Hair salonHair salon'
                          else "FOOD" if x == 'FoodFood'
                          else "SPA" if x == 'SpaSpa'
                          else "BEAUTY" if x == 'BeautyBeauty'
                          else "CLASS" if x == 'ClassLesson'
                          else "CORRESPONDENCE" if x == 'Correspondence courseLessonClassLesson'
                          else "DELIVERY" if x == 'Delivery serviceDelivery service'
                          else "EVENT" if x == 'EventOther coupon'
                          else "GIFT" if x == 'Gift cardGift card'
                          else "HEALTH" if x == 'Health and medicalHealth and medical'
                          else "LEISURE" if x == 'LeisureLeisure'
                          else "LESSON" if x == 'LessonLesson'
                          else "OTHER" if x == 'OtherOther coupon'
                          else "RELAXATION" if x == 'RelaxationRelaxation'
                          else "WEB" if x == 'Web serviceOther coupon'
                          else 'OTHER'
                          )

In [86]:
# 실판매가 게산
coupon_list['Price'] = coupon_list['CATALOG_PRICE'] - coupon_list['DISCOUNT_PRICE']

In [87]:
# 실판매가 정규화
coupon_list["lnDPRICE"] = np.log1p(coupon_list["Price"])
coupon_list["mDPRICE"] = coupon_list.groupby("Case")["lnDPRICE"].transform(np.mean)
coupon_list["sDPRICE"] = coupon_list.groupby("Case")["lnDPRICE"].transform(np.std)
coupon_list["zprice"] = (coupon_list["lnDPRICE"] - coupon_list["mDPRICE"]) / coupon_list["sDPRICE"]

In [88]:
# 지역명 변경
coupon_list.rename(columns = {"LARGE_AREA_NAME": "spot_large", 
                              "ken_name": "spot_pref", 
                              "SMALL_AREA_NAME": "spot_small"}, inplace=True)

In [89]:
# usable: nan -> 1, 2 -> 0
coupon_list['USABLE_DATE_MON'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_TUE'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_WED'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_THU'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_FRI'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_SAT'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_SUN'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_HOLIDAY'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)
coupon_list['USABLE_DATE_BEFORE_HOLIDAY'].replace([0,2,1,np.nan],[0,0,1,1], inplace=True)

In [90]:
coupon_list.columns

Index(['CAPSULE_TEXT', 'GENRE_NAME', 'PRICE_RATE', 'CATALOG_PRICE',
       'DISCOUNT_PRICE', 'DISPFROM', 'DISPEND', 'DISPPERIOD', 'VALIDFROM',
       'VALIDEND', 'VALIDPERIOD', 'USABLE_DATE_MON', 'USABLE_DATE_TUE',
       'USABLE_DATE_WED', 'USABLE_DATE_THU', 'USABLE_DATE_FRI',
       'USABLE_DATE_SAT', 'USABLE_DATE_SUN', 'USABLE_DATE_HOLIDAY',
       'USABLE_DATE_BEFORE_HOLIDAY', 'spot_large', 'spot_pref', 'spot_small',
       'COUPON_ID_hash', 'Case', 'Price', 'lnDPRICE', 'mDPRICE', 'sDPRICE',
       'zprice'],
      dtype='object')

In [91]:
coupon_list['VALIDPERIOD'] = coupon_list['VALIDPERIOD'].fillna(179)

In [None]:
# coupon_list.drop(labels = ['CAPSULE_TEXT'], axis=1, inplace=True )
# coupon_list.drop(labels = ['sDPRICE'], axis=1, inplace=True )

In [None]:
# coupon_list.drop(labels = ['CAPSULE_TEXT'], axis=1, inplace=True )
# coupon_list.drop(labels = ['GENRE_NAME'], axis=1, inplace=True )
# coupon_list.drop(labels = ['CATALOG_PRICE'], axis=1, inplace=True )
# coupon_list.drop(labels = ['DISCOUNT_PRICE'], axis=1, inplace=True )
# coupon_list.drop(labels = ['DISPFROM'], axis=1, inplace=True )
# coupon_list.drop(labels = ['DISPEND'], axis=1, inplace=True )
# coupon_list.drop(labels = ['VALIDFROM'], axis=1, inplace=True )
# coupon_list.drop(labels = ['VALIDEND'], axis=1, inplace=True )
# coupon_list.drop(labels = ['lnDPRICE'], axis=1, inplace=True )
# coupon_list.drop(labels = ['mDPRICE'], axis=1, inplace=True )
# coupon_list.drop(labels = ['Price'], axis=1, inplace=True )
# coupon_list.drop(labels = ['spot_pref'], axis=1, inplace=True )  # 판단이슈 
# coupon_list.drop(labels = ['spot_small'], axis=1, inplace=True ) # 판단이슈 
# coupon_list.drop(labels = ['spot_large'], axis=1, inplace=True ) # 판단이슈 
# coupon_list.drop(labels = ['large_area_name'], axis=1, inplace=True ) # 판단이슈 

In [92]:
summary_table(coupon_list)

Unnamed: 0,act,dtype,name,null,unique
0,19723.0,object,CAPSULE_TEXT,0.0,24.0
1,19723.0,object,GENRE_NAME,0.0,13.0
2,19723.0,int64,PRICE_RATE,0.0,71.0
3,19723.0,int64,CATALOG_PRICE,0.0,2435.0
4,19723.0,int64,DISCOUNT_PRICE,0.0,1118.0
5,19723.0,datetime64[ns],DISPFROM,0.0,386.0
6,19723.0,datetime64[ns],DISPEND,0.0,381.0
7,19723.0,int64,DISPPERIOD,0.0,18.0
8,13480.0,datetime64[ns],VALIDFROM,6243.0,388.0
9,13480.0,datetime64[ns],VALIDEND,6243.0,521.0


In [17]:
coupon_list.columns

Index(['CAPSULE_TEXT', 'GENRE_NAME', 'PRICE_RATE', 'CATALOG_PRICE',
       'DISCOUNT_PRICE', 'DISPFROM', 'DISPEND', 'DISPPERIOD', 'VALIDFROM',
       'VALIDEND', 'VALIDPERIOD', 'USABLE_DATE_MON', 'USABLE_DATE_TUE',
       'USABLE_DATE_WED', 'USABLE_DATE_THU', 'USABLE_DATE_FRI',
       'USABLE_DATE_SAT', 'USABLE_DATE_SUN', 'USABLE_DATE_HOLIDAY',
       'USABLE_DATE_BEFORE_HOLIDAY', 'spot_large', 'spot_pref', 'spot_small',
       'COUPON_ID_hash', 'Price', 'lnDPRICE', 'mDPRICE', 'sDPRICE', 'zprice'],
      dtype='object')

In [19]:
summary_table(coupon_list)

Unnamed: 0,act,dtype,name,null,unique
0,19723.0,object,CAPSULE_TEXT,0.0,24.0
1,19723.0,object,GENRE_NAME,0.0,13.0
2,19723.0,int64,PRICE_RATE,0.0,71.0
3,19723.0,int64,CATALOG_PRICE,0.0,2435.0
4,19723.0,int64,DISCOUNT_PRICE,0.0,1118.0
5,19723.0,datetime64[ns],DISPFROM,0.0,386.0
6,19723.0,datetime64[ns],DISPEND,0.0,381.0
7,19723.0,int64,DISPPERIOD,0.0,18.0
8,13480.0,datetime64[ns],VALIDFROM,6243.0,388.0
9,13480.0,datetime64[ns],VALIDEND,6243.0,521.0


In [93]:
# dummy 처리 할 column 명 넣어줌
ls_dummy = ['Case','spot_pref','spot_large','spot_small']

In [94]:
coupon_list = pd.get_dummies(coupon_list, columns = ls_dummy)

In [95]:
# train set과 test set을 다시 분리
coupon_list_train = coupon_list[311:]
coupon_list_test = coupon_list[:310]

# detail

In [96]:
detail_train = pd.read_csv('./coupon_data_project2/coupon_detail_train_translated_en.csv', parse_dates=['I_DATE'])


In [97]:
detail_train.rename(columns = {'I_DATE': 'purchase_date'}, inplace=True)
detail_train.rename(columns = {'SMALL_AREA_NAME': 'resid_small'}, inplace=True)

In [98]:
detail_train.drop(labels = ['ITEM_COUNT'], axis=1, inplace=True)
detail_train.drop(labels = ['PURCHASEID_hash'], axis=1, inplace=True)
detail_train.drop(labels = ['resid_small'], axis=1, inplace=True)
detail_train.drop(labels = ['purchase_date'], axis=1, inplace=True)

In [99]:
detail_train.head()

Unnamed: 0,USER_ID_hash,COUPON_ID_hash
0,d9dca3cb44bab12ba313eaa681f663eb,34c48f84026e08355dc3bd19b427f09a
1,560574a339f1b25e57b0221e486907ed,767673b7a777854a92b73b0934ddfae7
2,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2
3,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2
4,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2


In [100]:
detail_train = pd.merge(detail_train, coupon_list, how='left', on='COUPON_ID_hash')
detail_train = detail_train.merge(user_list, on = 'USER_ID_hash', how = 'outer')

In [101]:
len(detail_train["USER_ID_hash"].unique())

22873

In [31]:
len(np.unique(coupon_list_train.COUPON_ID_hash.values))

19412

## 필요없는 변수 지우기

In [102]:
list(detail_train.columns)

['USER_ID_hash',
 'COUPON_ID_hash',
 'CAPSULE_TEXT',
 'GENRE_NAME',
 'PRICE_RATE',
 'CATALOG_PRICE',
 'DISCOUNT_PRICE',
 'DISPFROM',
 'DISPEND',
 'DISPPERIOD',
 'VALIDFROM',
 'VALIDEND',
 'VALIDPERIOD',
 'USABLE_DATE_MON',
 'USABLE_DATE_TUE',
 'USABLE_DATE_WED',
 'USABLE_DATE_THU',
 'USABLE_DATE_FRI',
 'USABLE_DATE_SAT',
 'USABLE_DATE_SUN',
 'USABLE_DATE_HOLIDAY',
 'USABLE_DATE_BEFORE_HOLIDAY',
 'Price',
 'lnDPRICE',
 'mDPRICE',
 'sDPRICE',
 'zprice',
 'Case_BEAUTY',
 'Case_CLASS',
 'Case_DELIVERY',
 'Case_EVENT',
 'Case_FOOD',
 'Case_GIFT',
 'Case_HAIR',
 'Case_HEALTH',
 'Case_HOTEL',
 'Case_LEISURE',
 'Case_LESSON',
 'Case_NAIL',
 'Case_OTHER',
 'Case_RELAXATION',
 'Case_SPA',
 'Case_WEB',
 'spot_pref_Aichi',
 'spot_pref_Akita',
 'spot_pref_Aomori',
 'spot_pref_Chiba',
 'spot_pref_Ehime',
 'spot_pref_Fukui',
 'spot_pref_Fukuoka',
 'spot_pref_Fukushima',
 'spot_pref_Gifu',
 'spot_pref_Gunma',
 'spot_pref_Hiroshima',
 'spot_pref_Hokkaido',
 'spot_pref_Hyogo',
 'spot_pref_Ibaraki',
 'sp

In [103]:
del_ls = ['CAPSULE_TEXT','GENRE_NAME','CATALOG_PRICE',
 'DISCOUNT_PRICE', 'DISPFROM',
 'DISPEND', 'VALIDFROM',
 'VALIDEND','Price',
 'lnDPRICE',
 'mDPRICE',
 'sDPRICE','REG_DATE',
 'SEX_ID',
 'AGE',
 'WITHDRAW_DATE',
 'PREF_NAME']
train = detail_train.drop(del_ls, 1)

In [104]:
summary_table(train)

Unnamed: 0,act,dtype,name,null,unique
0,169087.0,object,USER_ID_hash,0.0,22873.0
1,168996.0,object,COUPON_ID_hash,91.0,19369.0
2,168996.0,float64,PRICE_RATE,91.0,72.0
3,168996.0,float64,DISPPERIOD,91.0,19.0
4,168996.0,float64,VALIDPERIOD,91.0,181.0
5,168996.0,float64,USABLE_DATE_MON,91.0,3.0
6,168996.0,float64,USABLE_DATE_TUE,91.0,3.0
7,168996.0,float64,USABLE_DATE_WED,91.0,3.0
8,168996.0,float64,USABLE_DATE_THU,91.0,3.0
9,168996.0,float64,USABLE_DATE_FRI,91.0,3.0


### test 생성

In [105]:
test = coupon_list_test.filter(train.columns)

In [106]:
compare_not_test = [i for i in train.columns if i not in test.columns]
compare_not_train = [i for i in test.columns if i not in train.columns]
print('only_train: {}  \n'.format(compare_not_test))
print('only_test: {}'.format(compare_not_train))

only_train: ['USER_ID_hash']  

only_test: []


## 유사도 구하기

In [107]:
train_drop_c = train.drop('COUPON_ID_hash', axis=1)
user = train_drop_c.groupby(by='USER_ID_hash').apply(np.mean)

In [108]:
test_features = test.drop('COUPON_ID_hash', axis=1)
test_features.columns

Index(['PRICE_RATE', 'DISPPERIOD', 'VALIDPERIOD', 'USABLE_DATE_MON',
       'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU',
       'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN',
       ...
       'spot_small_Tochigi', 'spot_small_Tokushima', 'spot_small_Tottori',
       'spot_small_Toyama', 'spot_small_Triple', 'spot_small_Wakayama',
       'spot_small_Yamagata', 'spot_small_Yamaguchi', 'spot_small_Yamanashi',
       'spot_small_Yokohama'],
      dtype='object', length=140)

In [109]:
list(test_features.columns)

['PRICE_RATE',
 'DISPPERIOD',
 'VALIDPERIOD',
 'USABLE_DATE_MON',
 'USABLE_DATE_TUE',
 'USABLE_DATE_WED',
 'USABLE_DATE_THU',
 'USABLE_DATE_FRI',
 'USABLE_DATE_SAT',
 'USABLE_DATE_SUN',
 'USABLE_DATE_HOLIDAY',
 'USABLE_DATE_BEFORE_HOLIDAY',
 'zprice',
 'Case_BEAUTY',
 'Case_CLASS',
 'Case_DELIVERY',
 'Case_EVENT',
 'Case_FOOD',
 'Case_GIFT',
 'Case_HAIR',
 'Case_HEALTH',
 'Case_HOTEL',
 'Case_LEISURE',
 'Case_LESSON',
 'Case_NAIL',
 'Case_OTHER',
 'Case_RELAXATION',
 'Case_SPA',
 'Case_WEB',
 'spot_pref_Aichi',
 'spot_pref_Akita',
 'spot_pref_Aomori',
 'spot_pref_Chiba',
 'spot_pref_Ehime',
 'spot_pref_Fukui',
 'spot_pref_Fukuoka',
 'spot_pref_Fukushima',
 'spot_pref_Gifu',
 'spot_pref_Gunma',
 'spot_pref_Hiroshima',
 'spot_pref_Hokkaido',
 'spot_pref_Hyogo',
 'spot_pref_Ibaraki',
 'spot_pref_Ishikawa',
 'spot_pref_Iwate',
 'spot_pref_Kagawa',
 'spot_pref_Kagoshima',
 'spot_pref_Kanagawa',
 'spot_pref_Kochi',
 'spot_pref_Kumamoto',
 'spot_pref_Kyoto',
 'spot_pref_Mie',
 'spot_pref_Miya

In [39]:
from sklearn.preprocessing import *

In [40]:
user.shape, test_features.shape

((22873, 148), (310, 148))

In [113]:
user = user.fillna(0)

In [114]:
user.isnull().sum() , \
test_features.isnull().sum()

(PRICE_RATE                                          0
 DISPPERIOD                                          0
 VALIDPERIOD                                         0
 USABLE_DATE_MON                                     0
 USABLE_DATE_TUE                                     0
 USABLE_DATE_WED                                     0
 USABLE_DATE_THU                                     0
 USABLE_DATE_FRI                                     0
 USABLE_DATE_SAT                                     0
 USABLE_DATE_SUN                                     0
 USABLE_DATE_HOLIDAY                                 0
 USABLE_DATE_BEFORE_HOLIDAY                          0
 zprice                                              0
 Case_BEAUTY                                         0
 Case_CLASS                                          0
 Case_DELIVERY                                       0
 Case_EVENT                                          0
 Case_FOOD                                           0
 Case_GIFT

In [119]:
del_ls = ['VALIDPERIOD',
 'USABLE_DATE_MON',
 'USABLE_DATE_TUE',
 'USABLE_DATE_WED',
 'USABLE_DATE_THU',
 'USABLE_DATE_FRI',
 'USABLE_DATE_SAT',
 'USABLE_DATE_SUN',
         ]

vector_1 = vector_1.drop(del_ls, 1)
vector_2 = vector_2.drop(del_ls, 1)

In [118]:
vector_1 = user.copy()
vector_2 = test_features.copy()

vector_1 = scale(vector_1)
vector_1 = pd.DataFrame(data = vector_1, index = user.index, columns = user.columns )

vector_2 = scale(vector_2)
vector_2 = pd.DataFrame(data = vector_2, index = test_features.index, columns = test_features.columns )

In [126]:
list(vector_1.columns)

['PRICE_RATE',
 'DISPPERIOD',
 'USABLE_DATE_HOLIDAY',
 'USABLE_DATE_BEFORE_HOLIDAY',
 'zprice',
 'Case_BEAUTY',
 'Case_CLASS',
 'Case_DELIVERY',
 'Case_EVENT',
 'Case_FOOD',
 'Case_GIFT',
 'Case_HAIR',
 'Case_HEALTH',
 'Case_HOTEL',
 'Case_LEISURE',
 'Case_LESSON',
 'Case_NAIL',
 'Case_OTHER',
 'Case_RELAXATION',
 'Case_SPA',
 'Case_WEB',
 'spot_pref_Aichi',
 'spot_pref_Akita',
 'spot_pref_Aomori',
 'spot_pref_Chiba',
 'spot_pref_Ehime',
 'spot_pref_Fukui',
 'spot_pref_Fukuoka',
 'spot_pref_Fukushima',
 'spot_pref_Gifu',
 'spot_pref_Gunma',
 'spot_pref_Hiroshima',
 'spot_pref_Hokkaido',
 'spot_pref_Hyogo',
 'spot_pref_Ibaraki',
 'spot_pref_Ishikawa',
 'spot_pref_Iwate',
 'spot_pref_Kagawa',
 'spot_pref_Kagoshima',
 'spot_pref_Kanagawa',
 'spot_pref_Kochi',
 'spot_pref_Kumamoto',
 'spot_pref_Kyoto',
 'spot_pref_Mie',
 'spot_pref_Miyagi',
 'spot_pref_Miyazaki',
 'spot_pref_Nagano',
 'spot_pref_Nagasaki',
 'spot_pref_Nara',
 'spot_pref_Niigata',
 'spot_pref_Oita',
 'spot_pref_Okayama',
 '

In [45]:
import math

def Cosine(vec1, vec2) :
    result = InnerProduct(vec1,vec2) / (VectorSize(vec1) * VectorSize(vec2))
    return result

def VectorSize(vec) :
    return math.sqrt(sum(math.pow(v,2) for v in vec))

def InnerProduct(vec1, vec2) :
    return sum(v1*v2 for v1,v2 in zip(vec1,vec2))

def Euclidean(vec1, vec2) :
    return math.sqrt(sum(math.pow((v1-v2),2) for v1,v2 in zip(vec1, vec2)))

def Theta(vec1, vec2) :
    return math.acos(Cosine(vec1,vec2)) + 10

def Triangle(vec1, vec2) :
    theta = math.radians(Theta(vec1,vec2))
    return (VectorSize(vec1) * VectorSize(vec2) * math.sin(theta)) / 2

def Magnitude_Difference(vec1, vec2) :
    return abs(VectorSize(vec1) - VectorSize(vec2))

def Sector(vec1, vec2) :
    ED = Euclidean(vec1, vec2)
    MD = Magnitude_Difference(vec1, vec2)
    theta = Theta(vec1, vec2)
    return math.pi * math.pow((ED+MD),2) * theta/360

def TS_SS(vec1, vec2) :
    return Triangle(vec1, vec2) * Sector(vec1, vec2)


In [121]:
w = [1.2,1.4,0.8,0.8,1.5,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.8,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5]

In [123]:
for i in range(len(w)):
    vector_1[vector_1.columns[i]] = w[i]*vector_1[vector_1.columns[i]]

for i in range(len(w)):
    vector_2[vector_2.columns[i]] = w[i]*vector_2[vector_2.columns[i]]

In [124]:
from scipy import spatial
def cosine_similarity(vector_1, vector_2):

    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [125]:
matrix = []
for idx1 in tqdm(range(len(user))):
    
    row = []
    for idx2 in range(len(test_features)):
        row.append(cosine_similarity(vector_1.iloc[idx1], vector_2.iloc[idx2]))
    matrix.append(row)

100%|██████████| 22873/22873 [45:34<00:00,  8.36it/s]


In [127]:
coupons_ids = test['COUPON_ID_hash']
result_index = vector_1.index
result_columns = [coupons_ids[i] for i in range(0, 310)]
result_df = pd.DataFrame(index= result_index, columns=result_columns,data=matrix)

In [128]:
submission = pd.DataFrame(columns=['USER_ID_hash','PURCHASED_COUPONS'])
submission['USER_ID_hash'] = result_df.index
submission['PURCHASED_COUPONS'] = result_df.T.apply(lambda x: " ".join(x.sort_values(ascending=False)[:11].index)).values

In [129]:
submission.to_csv('cosine4.csv',index = False,header=True)


In [133]:
matrix[:5]

[[0.9457029169630761,
  0.2540783635002217,
  0.18840298385298737,
  0.22886988745074766,
  0.1299644939605359,
  0.2195417084905228,
  0.24771435842974265,
  0.13078274693711212,
  -0.013813470992398935,
  0.22274797138557456,
  0.28245966716408366,
  0.06670230072389594,
  0.1854694204952334,
  0.4500959302836465,
  0.3095544960357566,
  0.04734505326120109,
  0.9361124607687874,
  -0.04888234525553514,
  -0.060141772390845505,
  0.040997832940965595,
  0.17708595711070907,
  -0.11095710583786755,
  -0.20200145527604008,
  -0.3546571148029454,
  -0.11596678148021922,
  0.027748804569613483,
  0.10505116958093652,
  -0.07784286471609558,
  -0.12656009671469426,
  0.3401514296454162,
  -0.11856229874000501,
  -0.04372817061370249,
  0.17969960999241008,
  -0.06801670496138001,
  -0.17075509764982377,
  -0.17116418623473528,
  -0.06612245461383548,
  0.0010720627212871259,
  0.3377667049177411,
  0.5978212854228967,
  -0.3536279023907265,
  0.028534713319642746,
  -0.048840305892636104,