# 1. Sampling Data

random หยิบข้อมูลมาป้อนให้ model โดยสิ่งที่จะ random ประกอบด้วย 
- Test time : เวลาล่าสุดที่เรามีข้อมูล (ข้อมูลที่อยู่หลังจากเวลานี้จะแยกเก็บไว้ตรวจความแม่นของ model) → ได้ Order ที่ CREATE_DATE < เวลาที่สุ่มได้
- User : สุ่ม User แค่ครั้งละ N คน
- Map user-order กับ item
- สร้าง pivot table 

In [190]:
import pandas as pd
import datetime
import random
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the order table (CREATE_DATE / UPDATE_DATE parsed as datetimes) and the
# order-item table from CSV files addressed relative to the working directory.
order = pd.read_csv('../../../2_data/explored/order.csv', parse_dates=['CREATE_DATE','UPDATE_DATE'])
item = pd.read_csv('../../../2_data/explored/order_item.csv')

In [168]:
def sampling_sb_data(order, item, test_time, sample_number, seed=None):
    """Build a customer x item purchase pivot table from a random customer sample.

    Orders created within the last ``test_time`` days (counted back from the
    newest ``CREATE_DATE``) are excluded so they can be held out. From the
    remaining orders, ``sample_number`` distinct customers are drawn at random
    and their orders are joined to ``item`` on ``ORDER_ID``.

    Progress counts are printed at every step.

    Args:
        order: DataFrame with at least ``ORDER_ID``, ``CUSTOMER_ID`` and a
            datetime ``CREATE_DATE`` column.
        item: DataFrame with at least ``ORDER_ID`` and ``MATNR`` columns.
            Assumes ``CUSTOMER_ID`` and ``MATNR`` each appear in only one of
            the two frames, so the merge does not suffix them.
        test_time: Number of days at the end of the data to exclude.
        sample_number: Number of distinct customers to draw.
        seed: Optional seed for a reproducible draw. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        DataFrame indexed by ``CUSTOMER_ID`` with one column per ``MATNR``.
        A cell is 1 when the customer has that item in the sampled orders and
        NaN otherwise.

    Raises:
        ValueError: If ``sample_number`` is larger than the number of distinct
            customers left after the date filter (raised by ``random.sample``).
    """
    # Original data set
    print('ORIGINAL ORDERS :',len(order))

    # Keep only orders created before the hold-out cut-off.
    test_day = order['CREATE_DATE'].max() - datetime.timedelta(days=test_time)
    new_order = order[order['CREATE_DATE'] < test_day]
    print('NEW ORDERS :',len(new_order))

    # Sample customers, then keep every order that belongs to them.
    customer_id = new_order['CUSTOMER_ID'].unique()
    print('UNIQUE CUSTOMERS :',len(customer_id))
    rng = random if seed is None else random.Random(seed)
    sample_customer_id = rng.sample(list(customer_id), sample_number)
    print('SAMPLE CUSTOMERS :',len(sample_customer_id))
    sample_order = new_order[new_order['CUSTOMER_ID'].isin(sample_customer_id)]
    print('SAMPLE ORDERS :',len(sample_order))

    # Merge order-item
    order_item = pd.merge(sample_order, item, on='ORDER_ID')
    print('SAMPLE ITEM :',len(order_item))

    # Only the distinct (customer, item) pairs matter: select the two key
    # columns explicitly instead of deleting a fixed list of other columns,
    # so extra or missing columns in the CSV schema cannot break this step.
    pairs = order_item[['CUSTOMER_ID', 'MATNR']].dropna().drop_duplicates()
    pairs = pairs.assign(QUANTITY=1)

    # Pivot table
    pivot_table = pairs.pivot(index='CUSTOMER_ID', columns='MATNR', values='QUANTITY')
    print('PIVOT SHAPE :',pivot_table.shape)

    return pivot_table

In [186]:
# Exclude the most recent 180 days of orders and sample 100 customers.
sample_pivot_table = sampling_sb_data(order,item,180,100)

ORIGINAL ORDERS : 11916
NEW ORDERS : 9548
UNIQUE CUSTOMERS : 5412
SAMPLE CUSTOMERS : 100
SAMPLE ORDERS : 170
SAMPLE ITEM : 252
PIVOT SHAPE : (100, 191)


# 2. Create Model From Sampling

In [185]:
# Number of whole days between the earliest and the latest order CREATE_DATE.
(max(order['CREATE_DATE']) - min(order['CREATE_DATE'])).days

837

สร้าง model จาก Sampling แบบข้อมูลชุดเดียว

In [193]:
def cosine_similarity_between_item(pivot_table_df):
    """Return the pairwise cosine similarity between the columns of a table.

    NaN cells are treated as 0 before the similarity is computed. The input
    frame itself is not modified.

    Args:
        pivot_table_df: Numeric DataFrame whose columns are the items to
            compare.

    Returns:
        Square DataFrame whose index and columns are both the column labels
        of ``pivot_table_df``.
    """
    ratings = pivot_table_df.values
    # np.where builds a new array, so the caller's frame is left untouched.
    ratings = np.where(np.isnan(ratings), 0, ratings)

    item_labels = list(pivot_table_df.columns)
    # Transpose so that each row handed to cosine_similarity is one item.
    similarity = cosine_similarity(ratings.transpose())
    return pd.DataFrame(similarity, index=item_labels, columns=item_labels)

def prefiltering_of_neighbors(old_similar_table, thershold):
    """Return a copy of the similarity table with weak entries set to 0.

    Every cell strictly below ``thershold`` becomes 0; cells equal to or
    above it, and NaN cells, are kept as they are. The input table is not
    modified.

    Args:
        old_similar_table: DataFrame of similarity weights.
        thershold: Cut-off value; entries below it are zeroed.

    Returns:
        New DataFrame with the same labels as ``old_similar_table``.
    """
    below_cutoff = old_similar_table < thershold
    return old_similar_table.mask(below_cutoff, 0)

def cs_classification_predicted_score(pivot_table,weight,output_set):
    """Fill every NaN cell of a table with the best-supported label.

    For a missing cell at (row ``u``, column ``i``) each candidate label ``r``
    in ``output_set`` is scored by summing ``weight[i, j]`` over the columns
    ``j`` whose value in row ``u`` equals ``r``. The label with the highest
    score is written into the cell; ties go to the label listed first.

    Neither ``pivot_table`` nor ``weight`` is modified.

    Args:
        pivot_table: Numeric DataFrame with NaN marking the cells to predict.
        weight: DataFrame of column-to-column weights; row ``i`` must hold one
            weight per column of ``pivot_table``, in the same order.
        output_set: Sequence of candidate labels (e.g. ``[0, 1]``).

    Returns:
        DataFrame with the same index and columns as ``pivot_table`` and no
        NaN cells left (a plain copy when nothing was missing).
    """
    # Work on private arrays: writing through DataFrame.values is not
    # guaranteed to reach the frame and used to zero the caller's weights.
    scores = np.array(pivot_table.values, dtype=float)
    weights = np.asarray(weight.values)

    nan_positions = np.argwhere(np.isnan(scores))
    if len(nan_positions) == 0:
        return pivot_table.copy()

    # NOTE(review): cells are filled in row-major order and `ru` is a live
    # view of the row, so a value predicted earlier in a row takes part in
    # the later predictions for that row. Kept as in the original; confirm
    # whether only the observed values were meant to vote.
    for u, i in nan_positions:
        ru = scores[u]
        wi = weights[i]
        # The target cell is still NaN here, so it never equals any label
        # and its self-similarity cannot contribute to a score.
        r_score = [wi[ru == r].sum() for r in output_set]
        scores[u, i] = output_set[int(np.argmax(r_score))]

    return pd.DataFrame(scores, index=pivot_table.index, columns=pivot_table.columns)

In [205]:
# Item-to-item cosine similarity computed from the sampled pivot table.
similarity_weight = cosine_similarity_between_item(sample_pivot_table)
# Zero out every similarity below 0.1.
prefiltering_sw = prefiltering_of_neighbors(similarity_weight, 0.1)
# Fill each missing cell of the pivot table with a predicted label, 0 or 1.
predicted_table = cs_classification_predicted_score(sample_pivot_table,prefiltering_sw,[0,1])

In [207]:
# Preview the first rows of the predicted table.
predicted_table.head()

MATNR,19014293,19015920,19026021,19029462,19035150,19036583,19036607,19036622,19041156,19041255,...,59008494,59010260,59010273,59010274,59011465,59011466,59011674,59012062,59013828,59014281
CUSTOMER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Evaluate Model