# 필요 모듈 import & 데이터 zip 파일 압축 풀기

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile # zip 파일 풀기

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import os
files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        files.append(filename.split('.zip')[0])
        with zipfile.ZipFile(os.path.join(dirname, filename),"r") as z:
            z.extractall()

# 데이터 불러오기

In [2]:
files

['departments.csv',
 'sample_submission.csv',
 'order_products__train.csv',
 'order_products__prior.csv',
 'orders.csv',
 'products.csv',
 'aisles.csv']

In [3]:
departments = pd.read_csv('./'+files[0])
order_products_train = pd.read_csv('./'+files[2], dtype = {'order_id' : np.int32, 'product_id' : np.int32,
                                                           'add_to_cart_order' : np.int16, 'reordered' : np.int16})
order_products_prior = pd.read_csv('./'+files[3], dtype = {'order_id' : np.int32, 'product_id' : np.int32,
                                                           'add_to_cart_order' : np.int16, 'reordered' : np.int16})
orders = pd.read_csv('./'+files[4], dtype = {'order_id' : np.int32,'user_id' : np.int32,
                                             'order_number' : np.int16,'order_dow' : np.int16,
                                             'order_hour_of_day' : np.int16})
products = pd.read_csv('./'+files[5])
aisles = pd.read_csv('./'+files[6])

# 쿼리데이터 만들기
- products_detail : products, aisles, department 합친 것
- order_prior/train : order_products_prior/train 에 user_id 추가한 것
- train/test_users : 학습/테스트 에 사용되는 유저 목록
- order_prior_train : train 유저의 과거 주문들
- order_prior_train_all : order_prior_train + order_train
- order_prior_detail : order_prior + orders **by order_id**

In [4]:
products_detail = pd.merge(products, aisles, on='aisle_id')
products_detail = pd.merge(products_detail, departments, on='department_id')

order_prior = pd.merge(order_products_prior, orders[['order_id', 'user_id']], on='order_id')
order_train = pd.merge(order_products_train, orders[['order_id', 'user_id']], on='order_id')

train_users = orders[orders['eval_set']=='train']['user_id'].values
# train_users = np.sort(train_users) # train_user 에 해당하는 유저 목록
test_users = orders[orders['eval_set']=='test']['user_id'].values
# test_users = np.sort(test_users)

order_prior_train = order_prior[order_prior['user_id'].isin(train_users)] # train 유저의 과거 주문
# order_prior_train_all = pd.concat([order_prior_train, order_train]) # train 유저에 대한 order 정보 취합

In [5]:
## 전체 물건에 대해서 과거 시점들의 평균값들을 구해보자
order_prior_detail = pd.merge(order_prior, orders.drop(['user_id','eval_set'], axis=1), on='order_id')
order_prior_detail

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,3,5,9,8.0
1,2,28985,2,1,202279,3,5,9,8.0
2,2,9327,3,0,202279,3,5,9,8.0
3,2,45918,4,1,202279,3,5,9,8.0
4,2,30035,5,0,202279,3,5,9,8.0
...,...,...,...,...,...,...,...,...,...
32434484,3421083,39678,6,1,25247,24,2,6,21.0
32434485,3421083,11352,7,0,25247,24,2,6,21.0
32434486,3421083,4600,8,0,25247,24,2,6,21.0
32434487,3421083,24852,9,1,25247,24,2,6,21.0


In [6]:
product_reorder = order_prior_detail.groupby('product_id').sum()[['reordered']]
len(product_reorder),len(products_detail)

(49677, 49688)

In [7]:
# 과거에 한번도 주문되지 않은 데이터들 
ordered_product = product_reorder.index
all_product = products_detail.product_id
not_ordered_product = [a for a in all_product if a not in ordered_product]
not_ordered_product_detail = products_detail[products_detail['product_id'].isin(not_ordered_product)]

# 주문되지 않은 물건이 train(가장 최근 주문_학습용)에서 주문 된 게 있을까? - Yes
display(order_train[order_train['product_id'].isin(not_ordered_product)])

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id
318095,774603,25383,1,0,160879
549385,1351798,49540,14,0,20878
745014,1832341,45971,2,0,20604
879484,2169250,3718,9,0,153763
919518,2269288,27499,31,0,15012
920492,2271346,37703,15,0,169335
1014221,2507248,27499,9,0,46748
1216229,3004244,43725,25,0,169709
1365688,3376312,36233,6,0,145958


In [8]:
# 평균 구매 시간대, 요일, add_to_cart_order 은 groupby 써서 가면 될 듯
product_mean = order_prior_detail.groupby('product_id').mean()[['add_to_cart_order', 'order_dow','order_hour_of_day']].reset_index()
product_mean.head()

Unnamed: 0,product_id,add_to_cart_order,order_dow,order_hour_of_day
0,1,5.801836,2.776458,13.238121
1,2,9.888889,2.922222,13.277778
2,3,6.415162,2.736462,12.104693
3,4,9.507599,2.683891,13.714286
4,5,6.466667,2.733333,10.666667


In [9]:
# 재주문이 어느정도로 되는지 확인해보자
a = order_prior_detail[order_prior_detail['days_since_prior_order'].notna()].groupby('reordered').count()['product_id']
a.apply(lambda x : x/a.sum())

reordered
0    0.369934
1    0.630066
Name: product_id, dtype: float64

# 관점 Ⅰ 기존에 샀던 물건들 중에서 재구매를 할까? 
- 기존에 샀던 물건들에 대한 특성이 필요함
    - 재구매율
    - 과거 구매 물품 중 구매 횟수
    - 물건 구매 주기
    - 물건 구매 평균 시간
- 기존에 샀던 물건들 끼리의 유사도는?
    - 유사도
    - 유사한 것 끼리 중복해서 사는 정도가 있었나?
        - 만약, 초코쿠키를 샀었는데 다음엔 초코칩쿠키를 샀다던지

### 함수화 하기 이전에, user_id==1 인 경우를 먼저 파악해보자

In [10]:
user1_orders = order_prior_train[order_prior_train['user_id']==1]
user1_orders = pd.merge(user1_orders,orders[['order_id','order_number','order_dow','order_hour_of_day','days_since_prior_order']], on='order_id')
user1_orders.sort_values(by=['order_number','add_to_cart_order'], inplace=True)

user1_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
35,2539329,196,1,0,1,1,2,8,
36,2539329,14084,2,0,1,1,2,8,
37,2539329,12427,3,0,1,1,2,8,
38,2539329,26088,4,0,1,1,2,8,
39,2539329,26405,5,0,1,1,2,8,


In [11]:
# 유저가 과거에 산 물건 종류
user1_prior_products = user1_orders.product_id.unique()
# 유저가 과거에 주문한 횟수
user1_orders_count = len(user1_orders.order_id.unique())
# 구매한 물건 별로 재주문 횟수
user1_products = user1_orders.groupby('product_id').sum()[['reordered']].reset_index()
user1_products = pd.merge(user1_products, products_detail[['product_id', 'product_name']], on='product_id')

# 기존 물건 별 재주문율
user1_products['reordered_ratio'] = user1_products['reordered'].apply(lambda x : x /(len(user1_orders.order_id.unique())-1))
# 과거에 총 몇 번 구매를 했는가? 
user1_products['order_count'] = user1_products['reordered']+1
# 구매 물건 총 개수 중 이 물건은 어느 정도 비중을 차지하는 가? 
user1_products['order_ratio'] = user1_products['order_count'].apply(lambda x : x/len(user1_orders))

# 해당 물건의 물건 평균 주기는 어떻게 되는가? 
user1_products['product_buy_term'] = user1_products['product_id'].apply(lambda x : user1_orders[(user1_orders['product_id'] == x) \
                                                                                                & (user1_orders['days_since_prior_order'] > 0)]['days_since_prior_order'].mean() \
                                                                       if len(user1_orders[(user1_orders['product_id'] == x)]) > 1 else np.nan)
# 해당 물건을 사는 데 있어서 평균 시간대는 어떻게 되는가?
user1_products['product_buy_time'] = user1_products['product_id'].apply(lambda x : user1_orders[(user1_orders['product_id'] == x)]['order_hour_of_day'].mean())

user1_products.head()

Unnamed: 0,product_id,reordered,product_name,reordered_ratio,order_count,order_ratio,product_buy_term,product_buy_time
0,196,9,Soda,1.0,10,0.169492,22.0,10.3
1,10258,8,Pistachios,0.888889,9,0.152542,22.0,10.555556
2,10326,0,Organic Fuji Apples,0.0,1,0.016949,,15.0
3,12427,9,Original Beef Jerky,1.0,10,0.169492,22.0,10.3
4,13032,2,Cinnamon Toast Crunch,0.222222,3,0.050847,21.666667,8.0


In [12]:
# user_1 이 최종적으로 산 물건은
user1_buy_final = order_train[order_train['user_id']==1].product_id.values

# user_1이 기존에 산 물건들 끼리의 유사도를 구해보자 product_name + aisle or department어서 생각해야할 듯
products_detail['product_name_lower'] = products_detail['product_name'].apply(lambda x : x.lower())
products_detail['name+aisle'] = products_detail['product_name_lower'] + ' ' + products_detail['aisle']
products_detail['name+department'] = products_detail['product_name_lower'] + ' ' + products_detail['department']
products_detail['name+department+aisle'] = products_detail['product_name_lower'] + ' ' + products_detail['department'] + ' ' + products_detail['aisle']

# user_1 이 과거에 산 물건 이름
user1_prior_products_name = products_detail[products_detail['product_id'].isin(user1_prior_products)]['name+department']

In [13]:
count_vect = CountVectorizer(min_df=0, ngram_range=(1,2))
product_mat = count_vect.fit_transform(user1_prior_products_name)

product_sim = cosine_similarity(product_mat, product_mat)
product_similarity = pd.DataFrame(product_sim, index = products_detail[products_detail['product_id'].isin(user1_prior_products)]['product_name'],\
                                  columns = products_detail[products_detail['product_id'].isin(user1_prior_products)]['product_name'])

In [14]:
user1_products['best_similar'] = user1_products['product_name'].apply(lambda x : product_similarity[x].sort_values(ascending=False).index[1])
user1_products['best_similarity'] = user1_products['product_name'].apply(lambda x : product_similarity[x].sort_values(ascending=False)[1])
user1_products

Unnamed: 0,product_id,reordered,product_name,reordered_ratio,order_count,order_ratio,product_buy_term,product_buy_time,best_similar,best_similarity
0,196,9,Soda,1.0,10,0.169492,22.0,10.3,Zero Calorie Cola,0.218218
1,10258,8,Pistachios,0.888889,9,0.152542,22.0,10.555556,Original Beef Jerky,0.218218
2,10326,0,Organic Fuji Apples,0.0,1,0.016949,,15.0,Honeycrisp Apples,0.507093
3,12427,9,Original Beef Jerky,1.0,10,0.169492,22.0,10.3,Pistachios,0.218218
4,13032,2,Cinnamon Toast Crunch,0.222222,3,0.050847,21.666667,8.0,Original Beef Jerky,0.0
5,13176,1,Bag of Organic Bananas,0.111111,2,0.033898,21.5,11.0,Organic Fuji Apples,0.251976
6,14084,0,Organic Unsweetened Vanilla Almond Milk,0.0,1,0.016949,,8.0,Organic Unsweetened Almond Milk,0.836242
7,17122,0,Honeycrisp Apples,0.0,1,0.016949,,15.0,Organic Fuji Apples,0.507093
8,25133,7,Organic String Cheese,0.777778,8,0.135593,23.0,11.0,Organic Unsweetened Almond Milk,0.402015
9,26088,1,Aged White Cheddar Popcorn,0.111111,2,0.033898,15.0,7.5,Pistachios,0.19245


- 상품 구매의 흐름 데이터도 넣어서 좀 더 확인해볼 가치가 있음

# 관점 Ⅱ 기존에 샀던 물건들 바탕으로 새로운 물건을 산다면 어떤 물건을 살 것인가?
- 재구매 가능성이 높은 물건과 유사한 물건을 사지 않았을까?
    - product name의 유사도 파악

In [15]:
# 새로운 물건이 있나요? 
user1_buy_new = []
for prod in user1_buy_final:
    if prod not in user1_products.product_id.values:
        print(prod)
        user1_buy_new.append(prod)

27845


In [16]:
# 결과적으로 유저 1이 산 목록들 전부 취합
user1_buy_all = user1_prior_products.tolist() + user1_buy_new

In [17]:
count_vect = CountVectorizer(min_df=0, ngram_range=(1,2))
product_mat = count_vect.fit_transform(products_detail[products_detail['product_id'].isin(user1_buy_all)]['name+department'])

product_sim = cosine_similarity(product_mat, product_mat)
product_similarity = pd.DataFrame(product_sim, index = products_detail[products_detail['product_id'].isin(user1_buy_all)]['product_name'],\
                                  columns = products_detail[products_detail['product_id'].isin(user1_buy_all)]['product_name'])

In [18]:
user1_buy_new_name = products_detail[products_detail['product_id'].isin(user1_buy_new)]['product_name'].values


In [19]:
sum(product_similarity[products_detail[products_detail['product_id'].isin(user1_buy_new)]['product_name'].values].values>0.5)[0]-1 # 유사도가 0.5 보다 큰게 2개 있다

2

# 관점들을 전부 함수화

In [20]:
#관점 1
def make_user_df(user_id):
    user_orders = order_prior_train[order_prior_train['user_id']==user_id]
    user_orders = pd.merge(user_orders,orders[['order_id','order_number','order_dow','order_hour_of_day','days_since_prior_order']], on='order_id')
    user_orders.sort_values(by=['order_number','add_to_cart_order'], inplace=True)
    
    # 유저가 과거에 산 물건 종류 (id 로)
    user_prior_products = user_orders.product_id.unique()
    # 유저가 과거에 주문한 횟수 
    user_orders_count = len(user_orders.order_id.unique())
    # 구매한 물건 별로 재주문 횟수
    user_products = user_orders.groupby('product_id').sum()[['reordered']].reset_index()
    user_products = pd.merge(user_products, products_detail[['product_id', 'product_name']], on='product_id')

    # 기존 물건 별 재주문율
    user_products['reordered_ratio'] = user_products['reordered'].apply(lambda x : x /(user_orders_count-1))
    # 과거에 총 몇 번 구매를 했는가? 
    user_products['order_count'] = user_products['reordered']+1
    # 구매 물건 총 개수 중 이 물건은 어느 정도 비중을 차지하는 가? 
    user_products['order_ratio'] = user_products['order_count'].apply(lambda x : x/len(user_orders))

    # 해당 물건의 물건 평균 주기는 어떻게 되는가? 
    user_products['product_buy_term'] = user_products['product_id'].apply(lambda x : user_orders[(user_orders['product_id'] == x) \
                                                                                                    & (user_orders['days_since_prior_order'] > 0)]['days_since_prior_order'].mean() \
                                                                           if len(user_orders[(user_orders['product_id'] == x)]) > 1 else np.nan)
    # 해당 물건을 사는 데 있어서 평균 시간대는 어떻게 되는가?
    user_products['product_buy_time'] = user_products['product_id'].apply(lambda x : user_orders[(user_orders['product_id'] == x)]['order_hour_of_day'].mean())
    
    # 과거에 산 물건 이름 + 대분류
    user_prior_products_name = products_detail[products_detail['product_id'].isin(user_prior_products)]['name+department']
    
    # 기존 물건들 사이의 유사도 
    count_vect = CountVectorizer(min_df=0, ngram_range=(1,2))
    product_mat = count_vect.fit_transform(user_prior_products_name)

    product_sim = cosine_similarity(product_mat, product_mat)
    product_similarity = pd.DataFrame(product_sim, index = products_detail[products_detail['product_id'].isin(user_prior_products)]['product_name'],\
                                  columns = products_detail[products_detail['product_id'].isin(user_prior_products)]['product_name'])
    # print(len(user_prior_products_name)) 
    user_products['best1_similar'] = user_products['product_name'].apply(lambda x : product_similarity[x].sort_values(ascending=False).index[1] \
                                                                         if len(user_prior_products_name)>1 else product_similarity[x].index[0])
    user_products['best1_similarity'] = user_products['product_name'].apply(lambda x : product_similarity[x].sort_values(ascending=False)[1]\
                                                                           if len(user_prior_products_name)>1 else product_similarity[x][0])
    
#     user_products['best2_similar'] = user_products['product_name'].apply(lambda x : product_similarity[x].sort_values(ascending=False).index[2])
#     user_products['best2_similarity'] = user_products['product_name'].apply(lambda x : product_similarity[x].sort_values(ascending=False)[2])
    
    return user_products

In [21]:
# 관점2
def why_new_product(user_id):
    prior_products = order_prior[order_prior['user_id']==user_id].product_id.unique()
    final_products = order_train[order_train['user_id']==user_id].product_id.values
    
    user_buy_new = []
    for prod in final_products:
        if prod not in prior_products:
            user_buy_new.append(prod)
    
    user_buy_all = prior_products.tolist() + user_buy_new
    count_vect = CountVectorizer(min_df=0, ngram_range=(1,2))
    product_mat = count_vect.fit_transform(products_detail[products_detail['product_id'].isin(user_buy_all)]['name+department'])

    product_sim = cosine_similarity(product_mat, product_mat)
    product_similarity = pd.DataFrame(product_sim, index = products_detail[products_detail['product_id'].isin(user_buy_all)]['product_name'],\
                                  columns = products_detail[products_detail['product_id'].isin(user_buy_all)]['product_name'])
    
    return product_similarity[products_detail[products_detail['product_id'].isin(user_buy_new)]['product_name']],user_buy_new

In [22]:
why_new_product(1)[1]

[27845]

## '재구매율' 은 어느정도로 영향을 줄까?

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [24]:
user1_buy_new = order_train[order_train['user_id']==10].product_id.values
user1_df = make_user_df(10)

print(user1_buy_new)
user1_df.product_id.values

[29650 48720 24654 10177]


array([  260,  1529,  4920,  5450,  5646,  5769,  5818,  7632,  7746,
        8988,  9339,  9871, 11068, 11782, 13083, 13198, 13212, 13512,
       13629, 13829, 14992, 15011, 15290, 15392, 15937, 16185, 16797,
       16857, 17794, 17828, 18441, 18656, 19019, 19678, 20632, 20920,
       20995, 21174, 21833, 22035, 22825, 23165, 23541, 23879, 24184,
       24852, 25931, 26209, 26940, 27104, 27156, 28535, 28842, 28928,
       28986, 30305, 30489, 31506, 31717, 32299, 32537, 34126, 34128,
       34358, 35725, 35973, 36695, 36735, 36865, 37687, 38293, 39877,
       40604, 40706, 41220, 42342, 42625, 42647, 42736, 43014, 43352,
       44359, 44910, 45007, 45664, 46979, 47042, 47380, 47526, 47591,
       47626, 47788, 48204, 48775])

In [25]:
user1_df['final_reordered'] = user1_df['product_id'].apply(lambda x : 1 if x in user1_buy_new else 0)
x = user1_df[['reordered_ratio']]
y = user1_df['final_reordered']

model = RandomForestClassifier()
model.fit(x,y)

print(f'Accuracy : {accuracy_score(model.predict(x), y)}')
print(f'F1-score : {f1_score(model.predict(x), y)}')
print(f'Recall : {recall_score(model.predict(x), y)}')
print(f'Precision : {precision_score(model.predict(x), y)}')

Accuracy : 1.0
F1-score : 0.0
Recall : 0.0
Precision : 0.0


  average, "true nor predicted", 'F-score is', len(true_sum)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
model.predict(x), y.values

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]))

In [27]:
def reordered_ratio_effect(user_id):
    user_buy_new = order_train[order_train['user_id']==user_id].product_id.values
    user_df = make_user_df(user_id)
    
    user_df['final_reordered'] = user_df['product_id'].apply(lambda x : 1 if x in user_buy_new else 0)
    x = user_df[['reordered_ratio']]
    y = user_df['final_reordered']
    
    model = RandomForestClassifier()
    model.fit(x,y)
    
    print(f'Accuracy : {accuracy_score(model.predict(x), y)}')
    print(f'F1-score : {f1_score(model.predict(x), y)}')
    print(f'Recall : {recall_score(model.predict(x), y)}')
    print(f'Precision : {precision_score(model.predict(x), y)}')
    
    return accuracy_score(model.predict(x),y), f1_score(model.predict(x),y)

In [28]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
order_prior.groupby('user_id').count()[['product_id']]

Unnamed: 0_level_0,product_id
user_id,Unnamed: 1_level_1
1,59
2,195
3,88
4,18
5,37
...,...
206205,32
206206,285
206207,223
206208,677


In [30]:
# user 표본을 구하기 위해서 user 정보 테이블을 만들어보자
# user 당 주문 횟수
# user 당 평균 구매 물건 수

tmp = order_prior.groupby('user_id').count()[['product_id']].reset_index()
tmp = tmp[tmp['user_id'].isin(train_users)]
train_user = orders.groupby('user_id').count()[['order_id']].reset_index()
train_user = train_user[train_user['user_id'].isin(train_users)]
train_user = train_user.reset_index(drop=True)
train_user = pd.merge(train_user, tmp, on='user_id')
train_user.columns = ['user_id','주문 횟수', '구매 물건 수']
train_user['평균 구매 물건 수'] = train_user['구매 물건 수'] / train_user['주문 횟수']
train_user.head()

Unnamed: 0,user_id,주문 횟수,구매 물건 수,평균 구매 물건 수
0,1,11,59,5.363636
1,2,15,195,13.0
2,5,5,37,7.4
3,7,21,206,9.809524
4,8,4,49,12.25


In [31]:
test = []
for i,user in enumerate([259,358,513,540]):
    # print(f'user_{user}>>')
    try:
        acc, f1 = reordered_ratio_effect(user)
        acc_f1.append([acc,f1])
    except:
        print(f'user_{user} 에서 에러 발생')
        break
print('finished')

Accuracy : 1.0
F1-score : 0.0
Recall : 0.0
Precision : 0.0
user_259 에서 에러 발생
finished


In [32]:
acc_f1 = []
for i,user in enumerate(train_users[:500]):
    # print(f'user_{user}>>')
    try:
        acc, f1 = reordered_ratio_effect(user)
        acc_f1.append([acc,f1])
    except:
        print(f'user_{user} 에서 에러 발생')
        break
    if (i+1) % 100 == 0:
        print(round((i+1)/500*100, 2),'% 진행')
print('finished')

Accuracy : 0.7777777777777778
F1-score : 0.7777777777777777
Recall : 0.875
Precision : 0.7
Accuracy : 0.9019607843137255
F1-score : 0.2857142857142857
Recall : 1.0
Precision : 0.16666666666666666
Accuracy : 0.8260869565217391
F1-score : 0.0
Recall : 0.0
Precision : 0.0
Accuracy : 0.9264705882352942
F1-score : 0.5454545454545454
Recall : 1.0
Precision : 0.375
Accuracy : 0.8888888888888888
F1-score : 0.0
Recall : 0.0
Precision : 0.0
Accuracy : 0.6379310344827587
F1-score : 0.46153846153846156
Recall : 0.5294117647058824
Precision : 0.4090909090909091
Accuracy : 1.0
F1-score : 0.0
Recall : 0.0
Precision : 0.0
Accuracy : 0.9655172413793104
F1-score : 0.8571428571428571
Recall : 1.0
Precision : 0.75
Accuracy : 0.9577464788732394
F1-score : 0.4
Recall : 1.0
Precision : 0.25
Accuracy : 0.9879518072289156
F1-score : 0.8
Recall : 1.0
Precision : 0.6666666666666666
Accuracy : 0.896551724137931
F1-score : 0.5714285714285715
Recall : 1.0
Precision : 0.4
Accuracy : 0.9509803921568627
F1-score : 0.0

In [33]:
# user 500 명에 대한 재구매율 feature 효율
user500 = pd.DataFrame(acc_f1, columns = ['ACC', 'F1'])
user500['user_id'] = train_users[:500]
user500 = user500[['user_id', 'ACC','F1']]
user500.describe()

Unnamed: 0,user_id,ACC,F1
count,500.0,500.0,500.0
mean,392.678,0.910283,0.376837
std,225.990118,0.080852,0.331607
min,1.0,0.5625,0.0
25%,196.5,0.866667,0.0
50%,395.5,0.931034,0.4
75%,582.5,0.971252,0.666667
max,781.0,1.0,1.0


In [34]:
train_user = pd.merge(user500, train_user, on='user_id')

In [35]:
# 정확도 1, F1-score 0
k = train_user[(train_user['F1']==0) & (train_user['ACC']==1)].user_id.values
for i in k:
    # 마지막에 산 물건
    new_buy = order_train[order_train['user_id']==i].product_id.values
    
    # 기존 구매내역
    user_df = make_user_df(i)
    prior_buy = user1_df.product_id
    
    print(prior_buy.isin(new_buy).sum())

0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
2
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [36]:
train_user.sort_values(by='F1',ascending=False)

Unnamed: 0,user_id,ACC,F1,주문 횟수,구매 물건 수,평균 구매 물건 수
422,669,1.000000,1.0,29,289,9.965517
217,345,1.000000,1.0,4,13,3.250000
408,636,1.000000,1.0,22,30,1.363636
320,509,1.000000,1.0,12,156,13.000000
448,708,1.000000,1.0,15,40,2.666667
...,...,...,...,...,...,...
248,393,0.913386,0.0,11,215,19.545455
249,395,0.944444,0.0,4,18,4.500000
252,401,0.868421,0.0,5,45,9.000000
255,404,0.977273,0.0,7,46,6.571429
