In [1]:
import Collobrative_Filtering as CF
import pandas as pd
import numpy as np
import inspect
import os
import pickle
from scipy import sparse 
from sklearn.metrics import pairwise

In [2]:
# Load Data
# Resolve the input workbooks against the working directory (the notebook's
# folder when the kernel is started by Jupyter). The earlier
# inspect.getfile(inspect.currentframe()) lookup only landed on that folder
# because the cell's pseudo file name was relative; newer ipykernel releases
# may compile cells under a temporary file path, which would send the lookup
# to a temp directory instead.
file_direction = os.getcwd()

# Keep the paths in their own names so the frame variables never hold a string.
offertagging_path = os.path.join(file_direction, 'OFFER_LOG_WITH_COUNT.xlsx')
usertagging_path = os.path.join(file_direction, 'MJ_TAG.xlsx')
offers_path = os.path.join(file_direction, 'OfferTag_1120.xlsx')

offertagging = pd.read_excel(offertagging_path)
usertagging = pd.read_excel(usertagging_path)
offers = pd.read_excel(offers_path)

In [3]:
# Keep the three per-offer columns and discard offers whose AvgClick is missing.
offer_avgclick = offers[['OFFER', 'AvgClick', 'CampaignName']].dropna(subset=['AvgClick'])

In [4]:
# Preview the first rows of the frame read from OFFER_LOG_WITH_COUNT.xlsx.
offertagging.head()

Unnamed: 0,ID,OFFER_ID,COUNT
0,1463,OFF0023,28
1,1218,OFF0022,18
2,1218,OFF0025,18
3,559,OFF0023,17
4,900,OFF0023,15


In [5]:
# Preview the offers that still have a non-null AvgClick.
offer_avgclick.head()

Unnamed: 0,OFFER,AvgClick,CampaignName
0,OFF0001,1.26,夜間換匯
1,OFF0002,1.58,臺幣活存轉外幣定存
3,OFF0004,1.25,擁有外幣帳戶三大理由
4,OFF0005,1.36,連假攻略
5,OFF0006,1.29,戰勝通膨


In [6]:
# Sample standard deviation (ddof=1) of COUNT for every offer, as a two-column
# frame (OFFER_ID, StdClick). Missing results are replaced with 0.
count_by_offer = offertagging.groupby(['OFFER_ID'])['COUNT']
offer_std = count_by_offer.std(ddof=1).reset_index(name='StdClick')
offer_std = offer_std.fillna(0)

In [7]:
# Preview the per-offer standard deviation of COUNT.
offer_std.head()

Unnamed: 0,OFFER_ID,StdClick
0,OFF0001,0.0
1,OFF0002,0.421637
2,OFF0004,0.516398
3,OFF0005,0.446496
4,OFF0006,0.0


In [8]:
# Join the per-offer spread with the per-offer average; only offers present in
# both tables survive the inner join. OFFER (the right-hand key) is dropped by
# the final column selection.
offer_info = pd.merge(offer_std, offer_avgclick,
                      how='inner', left_on='OFFER_ID', right_on='OFFER')
offer_info = offer_info[['OFFER_ID', 'StdClick', 'AvgClick', 'CampaignName']]

In [9]:
# Swap an exact-zero StdClick for a tiny positive value so it can be used as a
# divisor later without producing a division by zero.
has_zero_std = offer_info['StdClick'] == 0
offer_info.loc[has_zero_std, 'StdClick'] = 1e-6

In [10]:
# Preview the merged per-offer table (StdClick, AvgClick, CampaignName).
offer_info.head()

Unnamed: 0,OFFER_ID,StdClick,AvgClick,CampaignName
0,OFF0001,1e-06,1.26,夜間換匯
1,OFF0002,0.421637,1.58,臺幣活存轉外幣定存
2,OFF0004,0.516398,1.25,擁有外幣帳戶三大理由
3,OFF0005,0.446496,1.36,連假攻略
4,OFF0006,1e-06,1.29,戰勝通膨


In [11]:
# Attach the per-offer statistics to every row; the left join keeps rows whose
# OFFER_ID has no entry in offer_info (their new columns are NaN).
# Note: this rebinds offertagging, so running the cell twice merges twice.
offertagging = pd.merge(offertagging, offer_info, how='left', on='OFFER_ID')

In [12]:
# Preview the rows after the per-offer columns were merged in.
offertagging.head()

Unnamed: 0,ID,OFFER_ID,COUNT,StdClick,AvgClick,CampaignName
0,1463,OFF0023,28,5.091008,4.41,日圓走勢
1,1218,OFF0022,18,3.271379,3.43,人民幣走勢
2,1218,OFF0025,18,3.566223,3.41,澳幣走勢
3,559,OFF0023,17,5.091008,4.41,日圓走勢
4,900,OFF0023,15,5.091008,4.41,日圓走勢


In [13]:
# Standardize each row's COUNT against its offer's AvgClick and StdClick:
#   (COUNT - AvgClick) / StdClick
# Computed on whole columns instead of a row-wise apply(), which called a
# Python lambda once per row for the same arithmetic. Rows with missing
# AvgClick/StdClick yield NaN.
offertagging['StandarizeClick'] = (
    (offertagging['COUNT'] - offertagging['AvgClick']) / offertagging['StdClick']
)

In [14]:
# Preview the rows with the new StandarizeClick column.
offertagging.head()

Unnamed: 0,ID,OFFER_ID,COUNT,StdClick,AvgClick,CampaignName,StandarizeClick
0,1463,OFF0023,28,5.091008,4.41,日圓走勢,4.63366
1,1218,OFF0022,18,3.271379,3.43,人民幣走勢,4.453779
2,1218,OFF0025,18,3.566223,3.41,澳幣走勢,4.091163
3,559,OFF0023,17,5.091008,4.41,日圓走勢,2.472987
4,900,OFF0023,15,5.091008,4.41,日圓走勢,2.080138


In [15]:
# Reshape to one row per ID and one column per OFFER_ID holding COUNT
# (pivot_table's default aggregation applies if an ID/offer pair repeats);
# pairs that never occur become 0. offertagging is rebound from the long
# table to this wide matrix.
# NOTE(review): StandarizeClick is computed just above but COUNT is what gets
# pivoted, while the later Preference_Dist thresholds (2, 1, 0, -1, -2) look
# like z-score cut-offs -- confirm which column was intended.
click_matrix = offertagging.pivot_table(values='COUNT', index=['ID'], columns='OFFER_ID')
offertagging = click_matrix.fillna(0)

In [16]:
# Hold out the first ten IDs (in index order) as Test; everything else is Train.
held_out_index = offertagging.index[:10]
is_held_out = offertagging.index.isin(held_out_index)
Test = offertagging[is_held_out]
Train = offertagging[~is_held_out]

In [17]:
# Show which IDs ended up in the held-out Test frame.
Test.index

Int64Index([9, 10, 38, 48, 57, 71, 72, 78, 86, 90], dtype='int64', name=u'ID')

In [18]:
# Show the held-out matrix as a bare array (rows follow Test.index, columns
# follow the pivoted OFFER_ID columns).
Test.values

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
         0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,

In [19]:
def sort_output(data):
    """Return the positions of ``data`` ordered from largest to smallest value.

    ``data`` must support ``len()`` and integer indexing. Positions whose
    values compare equal keep their original relative order.
    """
    positions = list(range(len(data)))
    positions.sort(key=data.__getitem__, reverse=True)
    return positions

In [20]:
# Take an independent copy of the held-out matrix. Test.values can share
# memory with Test, and a later cell overwrites Preference_Dist element by
# element; without the copy those writes would also change Test.
Preference_Dist = Test.values.copy()

In [22]:
# Save Preference_Dist as it stands at this point to RealDist.pickle.
with open('RealDist.pickle', 'wb') as real_dist_file:
    pickle.dump(Preference_Dist, real_dist_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Replace every score in Preference_Dist with a fixed weight, in place.
# The first lower bound a score reaches decides its weight:
#   score >=  2 -> 0.995
#   score >=  1 -> 0.975
#   score >=  0 -> 0.8
#   score >= -1 -> 0.2
#   score >= -2 -> 0.025
#   otherwise   -> 0.005   (also NaN, which fails every comparison)
# np.select picks the first true condition per element, matching the original
# if/elif chain, and does it without a Python-level double loop.
#
# Not idempotent: every weight above lies in [0, 1), so running this cell a
# second time maps all entries to 0.8.
# NOTE(review): if Preference_Dist holds the zero-filled COUNT pivot, no entry
# is negative and the last three buckets are never used -- confirm the input.
Preference_Dist[...] = np.select(
    [
        Preference_Dist >= 2,
        Preference_Dist >= 1,
        Preference_Dist >= 0,
        Preference_Dist >= -1,
        Preference_Dist >= -2,
    ],
    [0.995, 0.975, 0.8, 0.2, 0.025],
    default=0.005
)

In [None]:
# Display the array after the weight mapping.
Preference_Dist

In [None]:
# Save the current Preference_Dist array to Preference.pickle.
with open('Preference.pickle', 'wb') as preference_file:
    pickle.dump(Preference_Dist, preference_file, pickle.HIGHEST_PROTOCOL)

In [22]:
# Read Train_Data.csv from the working directory -- the same place the
# relative 'Train_Data.csv' path used when the file is written back resolves
# to. The earlier inspect.getfile(inspect.currentframe()) lookup depended on
# the cell's pseudo file name being relative; newer ipykernel releases may use
# a temporary file path, which would point at a temp directory instead.
file_direction = os.getcwd()
train_data_path = os.path.join(file_direction, 'Train_Data.csv')
offertagging = pd.read_csv(train_data_path)

In [23]:
# Remove the rows belonging to the ten held-out IDs (the same IDs shown in the
# Test.index output earlier in this notebook).
held_out_ids = [9, 10, 38, 48, 57, 71, 72, 78, 86, 90]
belongs_to_held_out = offertagging['ID'].isin(held_out_ids)
offertagging = offertagging[~belongs_to_held_out]

In [None]:
# Write the filtered rows back to Train_Data.csv (relative to the working
# directory), without the index column. This is the same file name the frame
# was read from, so the original contents are overwritten.
offertagging.to_csv('Train_Data.csv',index=False)

In [None]:
# Load the offer mapping table from disk. Unpickling can execute arbitrary
# code, so this must only ever be pointed at a trusted file.
with open('offer_maptb.pickle', 'rb') as maptb_file:
    offer_maptb = pickle.load(maptb_file)

In [None]:
# Invert offer_maptb: map each entry offer_maptb[pos] back to its position pos,
# for pos in 0 .. len(offer_maptb) - 1. If an entry repeats, the highest
# position wins. Entries are looked up by position (not iterated) so this works
# for any container that supports offer_maptb[pos].
IndexMapOffer = dict(
    (offer_maptb[pos], pos) for pos in range(len(offer_maptb))
)

In [None]:
# Save the inverted mapping to IndexMapOffer.pickle.
with open('IndexMapOffer.pickle', 'wb') as index_map_file:
    pickle.dump(IndexMapOffer, index_map_file, pickle.HIGHEST_PROTOCOL)