# 1. Sampling Data

random หยิบข้อมูลมาป้อนให้ model โดยสิ่งที่จะ random ประกอบด้วย 
- Test time : เวลาล่าสุดที่เรามีข้อมูล (ข้อมูลที่อยู่หลังจากเวลานี้จะแยกเก็บไว้ตรวจความแม่นของ model) > ได้ Order ที่ CREATE_DATE < เวลาที่สุ่มได้
- User : สุ่ม User แค่ครั้งละ N คน
- Map user-order กับ item
- สร้าง pivot table 

In [73]:
import pandas as pd
import datetime
import random
import numpy as np
from random import randint

from sklearn.metrics.pairwise import cosine_similarity

order = pd.read_csv('../../../2_data/explored/order.csv', parse_dates=['CREATE_DATE','UPDATE_DATE'])
item = pd.read_csv('../../../2_data/explored/order_item.csv')

In [74]:
def sampling_sb_data(order, item, test_time, sample_number):
    # Original Data Set     
    print('ORIGINAL ORDERS :',len(order))
    
    # Order < Test Time
    test_day = max(order['CREATE_DATE']) - datetime.timedelta(days=test_time)
    new_order = order[order['CREATE_DATE'] < test_day]
    answer = order[order['CREATE_DATE'] >= test_day]
    print('NEW ORDERS :',len(new_order))
    print('NEW ANSWER :',len(answer))
    
    # Sample Customer & Order
    customer_id = new_order['CUSTOMER_ID'].unique()
    print('UNIQUE CUSTOMERS :',len(customer_id))
    sample_customer_id = np.unique(random.sample(population=list(customer_id), k=sample_number))
    print('SAMPLE CUSTOMERS :',len(sample_customer_id))
    sample_order = new_order[new_order['CUSTOMER_ID'].isin(list(sample_customer_id))]
    sample_answer = answer[answer['CUSTOMER_ID'].isin(list(sample_customer_id))] 
    print('SAMPLE ORDERS :',len(sample_order))
    print('SAMPLE ANSWER :',len(sample_answer))
    
    # Merge order-item
    order_item = pd.merge(sample_order, item, on='ORDER_ID')
    answer_item = pd.merge(sample_answer, item, on='ORDER_ID')
    print('SAMPLE ORDER ITEM :',len(order_item))
    print('SAMPLE ANSWER ITEM :',len(answer_item))
    
    new_order_item = order_item.copy()
    new_answer_item = answer_item.copy()
    del new_order_item['ORDER_ID']
    del new_order_item['BPCODE']
    del new_order_item['CREATE_DATE']
    del new_order_item['UPDATE_DATE']
    del new_order_item['STATUS']
    del new_order_item['ORDER_ITEM_ID']
    del new_order_item['TOTAL_PRICE']
    
    del new_answer_item['ORDER_ID']
    del new_answer_item['BPCODE']
    del new_answer_item['CREATE_DATE']
    del new_answer_item['UPDATE_DATE']
    del new_answer_item['STATUS']
    del new_answer_item['ORDER_ITEM_ID']
    del new_answer_item['TOTAL_PRICE']
    
    grouped = new_order_item.groupby(['CUSTOMER_ID','MATNR']).sum().reset_index()
    grouped2 = new_answer_item.groupby(['CUSTOMER_ID','MATNR']).sum().reset_index()
    grouped['QUANTITY'] = 1
    grouped2['QUANTITY'] = 1
    
    # Pivot table
    pivot_table = grouped.pivot(index='CUSTOMER_ID', columns='MATNR', values='QUANTITY')
    print('PIVOT SHAPE :',pivot_table.shape)
    
    return [pivot_table,grouped2]

In [75]:
def sampling_sb_data_clean(order, item, test_time, sample_number):    
    # Order < Test Time
    test_day = max(order['CREATE_DATE']) - datetime.timedelta(days=test_time)
    new_order = order[order['CREATE_DATE'] < test_day]
    answer = order[order['CREATE_DATE'] >= test_day]
    
    # Sample Customer & Order
    customer_id = new_order['CUSTOMER_ID'].unique()
    sample_customer_id = np.unique(random.sample(population=list(customer_id), k=sample_number))
    sample_order = new_order[new_order['CUSTOMER_ID'].isin(list(sample_customer_id))]
    sample_answer = answer[answer['CUSTOMER_ID'].isin(list(sample_customer_id))] 
    
    # Merge order-item
    order_item = pd.merge(sample_order, item, on='ORDER_ID')
    answer_item = pd.merge(sample_answer, item, on='ORDER_ID')
    
    new_order_item = order_item.copy()
    new_answer_item = answer_item.copy()
    del new_order_item['ORDER_ID']
    del new_order_item['BPCODE']
    del new_order_item['CREATE_DATE']
    del new_order_item['UPDATE_DATE']
    del new_order_item['STATUS']
    del new_order_item['ORDER_ITEM_ID']
    del new_order_item['TOTAL_PRICE']
    
    del new_answer_item['ORDER_ID']
    del new_answer_item['BPCODE']
    del new_answer_item['CREATE_DATE']
    del new_answer_item['UPDATE_DATE']
    del new_answer_item['STATUS']
    del new_answer_item['ORDER_ITEM_ID']
    del new_answer_item['TOTAL_PRICE']
    
    grouped = new_order_item.groupby(['CUSTOMER_ID','MATNR']).sum().reset_index()
    grouped2 = new_answer_item.groupby(['CUSTOMER_ID','MATNR']).sum().reset_index()
    grouped['QUANTITY'] = 1
    grouped2['QUANTITY'] = 1
    
    # Pivot table
    pivot_table = grouped.pivot(index='CUSTOMER_ID', columns='MATNR', values='QUANTITY')
    
    return [pivot_table,grouped2]

In [76]:
[sample_pivot_table,answer] = sampling_sb_data(order,item,180,100)

ORIGINAL ORDERS : 11916
NEW ORDERS : 8004
NEW ANSWER : 3912
UNIQUE CUSTOMERS : 4558
SAMPLE CUSTOMERS : 100
SAMPLE ORDERS : 177
SAMPLE ANSWER : 10
SAMPLE ORDER ITEM : 272
SAMPLE ANSWER ITEM : 10
PIVOT SHAPE : (100, 201)


# 2. Create Model From Sampling

สร้าง model จาก Sampling แบบข้อมูลชุดเดียว

In [77]:
def cosine_similarity_between_item(pivot_table_df):
    cv = pivot_table_df.copy().values

    # filter NaN value to 0   
    cv[np.isnan(cv)] = 0

    cv = cosine_similarity(cv.T)        
    column_name_list = list(pivot_table_df)
    cv_df = pd.DataFrame(cv, index=column_name_list, columns=column_name_list)
    return cv_df

def prefiltering_of_neighbors(old_similar_table, thershold):
    new_similar_table = old_similar_table.copy()
    # Negative filtering & Thershold filtering
    new_similar_table[new_similar_table < thershold] = 0
    return new_similar_table

def cs_classification_predicted_score(pivot_table,weight,output_set):
    # initial variable
    score_table = pivot_table.copy()
    number_of_item = score_table.shape[1]
    
    # get NaN index
    nan_index = np.argwhere(np.isnan(score_table).values)
    
    # for loop to predict each NaN and fill value to it       
    for nan_pos in nan_index:
        r_score = np.array([])
        u,i = nan_pos[0],nan_pos[1]
        ru = score_table.values[u,:]
        wi = weight.values[i]
        wi[i] = 0
        
        for r in output_set:
            r_array,delta_r = np.array([r]*number_of_item),np.array([0]*number_of_item)
            delta_r[r_array == ru]  = 1
            vir = sum(wi*delta_r)
            r_score = np.append(r_score, vir)
        
        score_table.values[u][i] = output_set[np.argmax(r_score)]
        
    return score_table

In [78]:
similarity_weight = cosine_similarity_between_item(sample_pivot_table)
prefiltering_sw = prefiltering_of_neighbors(similarity_weight, 0.1)
predicted_table = cs_classification_predicted_score(sample_pivot_table,prefiltering_sw,[0,1])

In [79]:
predicted_table.head()

MATNR,19008833,19015840,19023002,19023492,19026316,19030280,19036407,19036595,19036598,19036616,...,59012408,59012432,59012678,59012763,59012841,59013028,59013067,59013828,59013829,59014423
CUSTOMER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7322,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Evaluate Model

สร้าง item(order) list ของ users แต่ละคนจากข้อมูลที่ตัดแบ่งไว้ตอนแรกก่อน

In [80]:
[sample_pivot_table,answer] = sampling_sb_data(order,item,180,100)
similarity_weight = cosine_similarity_between_item(sample_pivot_table)
prefiltering_sw = prefiltering_of_neighbors(similarity_weight, 0.1)
predicted_table = cs_classification_predicted_score(sample_pivot_table,prefiltering_sw,[0,1])

ORIGINAL ORDERS : 11916
NEW ORDERS : 8004
NEW ANSWER : 3912
UNIQUE CUSTOMERS : 4558
SAMPLE CUSTOMERS : 100
SAMPLE ORDERS : 199
SAMPLE ANSWER : 11
SAMPLE ORDER ITEM : 343
SAMPLE ANSWER ITEM : 12
PIVOT SHAPE : (100, 239)


หา True Positive

In [81]:
def true_positive(predicted_pivot, answer):
    predicted_table = pd.DataFrame(predicted_pivot.unstack()).groupby(['CUSTOMER_ID','MATNR']).sum().reset_index()
    predicted_item = predicted_table[predicted_table[0]==1]
    upper = len(pd.merge(answer, predicted_item, how='inner', on=['CUSTOMER_ID','MATNR']))
    return upper

In [82]:
true_positive(predicted_table,answer)

0

หา False Positive

In [83]:
def false_positive(predicted_pivot, answer):
    predicted_table = pd.DataFrame(predicted_pivot.unstack()).groupby(['CUSTOMER_ID','MATNR']).sum().reset_index()
    predicted_item = predicted_table[predicted_table[0]==1]
    upper = pd.merge(predicted_item, answer, how='left', on=['CUSTOMER_ID','MATNR'])
    return sum(np.isnan(upper['QUANTITY']))

In [102]:
false_positive(predicted_table,answer)

558

หา False Negative

In [108]:
def false_negative(predicted_pivot, answer):
    predicted_table = pd.DataFrame(predicted_pivot.unstack()).groupby(['CUSTOMER_ID','MATNR']).sum().reset_index()
    predicted_item = predicted_table[predicted_table[0]==1]
    upper = pd.merge(answer, predicted_item, how='left', on=['CUSTOMER_ID','MATNR'])
    return sum(np.isnan(upper[0]))

In [109]:
false_negative(predicted_table,answer)

10

ได้ $$ \text{Precision} = \frac{\text{tp}}{\text{tp} + \text{fp}} $$

In [85]:
def precision(predicted_pivot,answer):
    pcs = true_positive(predicted_pivot,answer)/(true_positive(predicted_pivot,answer) + false_positive(predicted_pivot,answer))
    return pcs

In [110]:
precision(predicted_table,answer)

0.0

In [111]:
def recall(predicted_pivot,answer):
    pcs = true_positive(predicted_pivot,answer)/(true_positive(predicted_pivot,answer) + false_negative(predicted_pivot,answer))
    return pcs

In [112]:
recall(predicted_table,answer)

0.0

พิจารณาช่วงวันระหว่างมากสุดถึงน้อยสุด ต้องการสุ่มให้มีหัว-หางเหลืออย่างน้อย 100 ตัว ดังนั้นจะได้ 100 < test_days < 737

In [94]:
(max(order['CREATE_DATE']) - min(order['CREATE_DATE'])).days

837

In [96]:
len(order['CUSTOMER_ID'].unique())

6539

ส่วน CUSTOMER มี 6539 กะว่าเอา 200-300 คน

ทำ iteration สัก 10 ครั้งเพื่อพิจารณา precission

In [115]:
precision_array = np.array([])
recall_array = np.array([])

for i in range(10):
    [sample_pivot_table1,answer1] = sampling_sb_data_clean(order,item,randint(200, 300),randint(200, 300))
    similarity_weight1 = cosine_similarity_between_item(sample_pivot_table1)
    prefiltering_sw1 = prefiltering_of_neighbors(similarity_weight1, 0.1)
    predicted_table1 = cs_classification_predicted_score(sample_pivot_table1,prefiltering_sw1,[0,1])
    precision_array = np.append(precision_array,precision(predicted_table1,answer1))
    recall_array = np.append(recall_array,recall(predicted_table1,answer1))
    print(i)

print(precision_array)

0
1
2
3
4
5
6
7
8
9
[ 0.00120773  0.00147493  0.00162734  0.0012848   0.00384986  0.
  0.00191571  0.          0.00139227  0.        ]


In [116]:
precision_array

array([ 0.00120773,  0.00147493,  0.00162734,  0.0012848 ,  0.00384986,
        0.        ,  0.00191571,  0.        ,  0.00139227,  0.        ])

In [117]:
recall_array

array([ 0.02941176,  0.02083333,  0.08      ,  0.03846154,  0.0952381 ,
        0.        ,  0.04411765,  0.        ,  0.05479452,  0.        ])

In [119]:
F1score = 2*precision_array*recall_array/(precision_array+recall_array)
F1score

  """Entry point for launching an IPython kernel.


array([ 0.00232019,  0.00275482,  0.00318979,  0.00248653,  0.00740056,
               nan,  0.00367197,         nan,  0.00271555,         nan])

In [120]:
np.average(precision_array)

0.0012752628924566204

In [121]:
np.average(recall_array)

0.036285689934561813

In [122]:
np.nanmean(F1score)

0.0035056289533448907