<a href="https://colab.research.google.com/github/JongHyun2332/r1/blob/master/%EC%A4%91%EA%B0%84%EC%A0%80%EC%9E%A51.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **데이터 불러오기 및 기본 설정**

In [3]:
# Basic setup: imports, inline plotting, warning suppression
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
color = sns.color_palette()
import warnings
warnings.filterwarnings('ignore')
# Garbage Collector to free up memory
import gc                         
gc.enable() 

In [4]:
# Load the data: each CSV is read from the current working directory
# (relative paths) into its own DataFrame.
orders = pd.read_csv('orders.csv')
order_products_prior = pd.read_csv('order_products__prior.csv')
order_products_train = pd.read_csv('order_products__train.csv')
products = pd.read_csv('products.csv')
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')

In [5]:
# Reduce DataFrame memory usage by downcasting column dtypes
def int_memory_reduce(data):
    """Downcast every integer column of ``data`` to the smallest fitting dtype.

    Columns that hold only non-negative values are downcast to an unsigned
    type. Columns that contain negative values are downcast to a signed type
    instead, because ``downcast='unsigned'`` would leave them at their
    original width. Integer columns of any width are considered, not only
    the platform-default ``int``.

    The sizes reported by ``mem_usage`` before and after the conversion are
    printed. NOTE(review): ``mem_usage`` is not defined in this function; it
    is assumed to be provided elsewhere in the notebook.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with its integer columns replaced by the
        downcast versions.
    """
    def _downcast(column):
        # Unsigned dtypes cannot represent negatives, so use the signed
        # family whenever the column contains at least one negative value.
        target = 'integer' if (column < 0).any() else 'unsigned'
        return pd.to_numeric(column, downcast=target)

    # 'integer' matches every NumPy integer width and signedness, whereas
    # 'int' only matches the platform-default integer type.
    data_int = data.select_dtypes(include=['integer'])
    converted_int = data_int.apply(_downcast)
    print(f"Before : {mem_usage(data_int)} -> After : {mem_usage(converted_int)}")
    data[converted_int.columns] = converted_int
    return data

In [6]:
def float_memory_reduce(data):
    """Downcast the float columns of ``data`` to the smallest float dtype.

    The columns selected by ``include=['float']`` are passed through
    ``pd.to_numeric(..., downcast='float')`` and written back into ``data``.
    The sizes reported by ``mem_usage`` before and after are printed.
    NOTE(review): ``mem_usage`` is assumed to be defined elsewhere in the
    notebook.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with its float columns downcast.
    """
    floats = data.select_dtypes(include=['float'])
    shrunk = floats.apply(lambda column: pd.to_numeric(column, downcast='float'))
    print(f"Before : {mem_usage(floats)} -> After : {mem_usage(shrunk)}")
    # Write the downcast columns back over the originals.
    data[shrunk.columns] = shrunk
    return data

def object_memory_reduce(data):
    """Convert low-cardinality object columns of ``data`` to ``category``.

    An object column is converted when fewer than half of its values are
    distinct; the remaining object columns are left unchanged. A frame with
    zero rows is returned without conversion, which avoids dividing by zero
    when computing the distinct-value ratio.

    The sizes reported by ``mem_usage`` before and after are printed.
    NOTE(review): ``mem_usage`` is assumed to be defined elsewhere in the
    notebook.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the qualifying object columns stored
        as ``category``.
    """
    gl_obj = data.select_dtypes(include=['object'])
    # Start from a copy so that columns which are not converted carry over
    # unchanged and the caller's frame is only touched by the final write.
    converted_obj = gl_obj.copy()
    num_total_values = len(gl_obj)
    for col in gl_obj.columns:
        # dropna=False keeps missing values in the count, matching
        # len(Series.unique()).
        num_unique_values = gl_obj[col].nunique(dropna=False)
        # Guard against 0 / 0 on a zero-row frame.
        if num_total_values and num_unique_values / num_total_values < 0.5:
            converted_obj[col] = gl_obj[col].astype('category')
    print(f"Before : {mem_usage(gl_obj)} -> After : {mem_usage(converted_obj)}")
    data[converted_obj.columns] = converted_obj
    return data

In [7]:
# Inspect the data (uncomment a line to run it)
#orders.head()
#order_products_prior.head()
#order_products_train.head()
#products.head()
#aisles.head()
#departments.head()
#order_products_train.shape
#order_products_train.isnull().sum()

# **데이터 양 줄이기**
데이터 타입을 줄여 메모리 사용량은 낮췄으나, 데이터 양 자체가 워낙 많아서 이후 profiling 단계에서 시간이 너무 오래 걸림(30분이 넘어도 끝나지 않아 중단함).
데이터가 충분히 크기 때문에 일부 유저만 랜덤하게 추출하여 사용.

In [8]:
# Use only 20% of the original data: draw 20% of the distinct user_ids and
# keep every order that belongs to one of them. random_state fixes the draw;
# changing it selects a different set of users.
orders = orders[
    orders['user_id'].isin(
        orders['user_id'].drop_duplicates().sample(frac=0.20, random_state=23)
    )
]

# **Feature 설정**

1. 물건 재주문율
2. 구매횟수
3. 장바구니에 담은 횟수
4. 장바구니에 담은 평균 횟수
5. 장바구니에 담긴 순서의 평균
6. 구매비율 (특정 상품 구매 횟수 / 구매횟수)
7. 장바구니에 담고 구매한 비율
8. 장바구니에 담고 구매하지 않은 비율

1. 유저의 재주문 횟수
2. 유저의 재주문율
3. 

# 1, 2 유저의 재주문 횟수, 재주문율

1. 유저의 재주문 횟수

In [89]:
# In orders, order_number is the order count (it appears to run 1, 2, 3, ...
# per user in the preview below — confirm against the dataset description).
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
50,2086598,6,prior,1,5,18.0,
51,298250,6,prior,2,4,16.0,6.0
52,998866,6,prior,3,2,18.0,12.0
53,1528013,6,test,4,3,16.0,22.0
79,280530,9,prior,1,1,17.0,


In [90]:
# Attach the purchased products to each order. The inner join keeps only the
# orders whose order_id also appears in order_products_prior.
# The largest order_number per user is used further down as that user's
# order count.
prior_orders = orders.merge(order_products_prior, how='inner', on='order_id')
prior_orders.head(n=10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,68288,10,prior,2,5,15.0,30.0,46979.0,1.0,1.0
1,68288,10,prior,2,5,15.0,30.0,47380.0,2.0,0.0
2,68288,10,prior,2,5,15.0,30.0,20995.0,3.0,0.0
3,68288,10,prior,2,5,15.0,30.0,43014.0,4.0,0.0
4,68288,10,prior,2,5,15.0,30.0,15011.0,5.0,0.0
5,68288,10,prior,2,5,15.0,30.0,27156.0,6.0,0.0
6,68288,10,prior,2,5,15.0,30.0,13512.0,7.0,0.0
7,68288,10,prior,2,5,15.0,30.0,40604.0,8.0,0.0
8,68288,10,prior,2,5,15.0,30.0,13198.0,9.0,0.0
9,68288,10,prior,2,5,15.0,30.0,9339.0,10.0,0.0


In [118]:
# Highest order_number seen for each user, stored as users_order_time.
users_order_number = (
    prior_orders
    .groupby('user_id')['order_number']
    .max()
    .to_frame('users_order_time')
    .reset_index()
)
# final_data starts out as this per-user table (same object, not a copy).
final_data = users_order_number
users_order_number.head()

Unnamed: 0,user_id,users_order_time
0,10,4
1,19,9
2,68,2
3,75,12
4,90,62


2. 유저의 재주문율

In [104]:
# Total number of orders per user, taken as the largest order_number observed.
total_orders = (
    prior_orders
    .groupby(by='user_id')['order_number']
    .agg('max')
    .to_frame(name='total_orders')
    .reset_index()
)
total_orders.head(n=5)

Unnamed: 0,user_id,total_orders
0,10,4
1,19,9
2,68,2
3,75,12
4,90,62


In [93]:
# For every (user, product) pair, find the order_number of the order in which
# the user bought that product for the first time.
product_first_order_num = (
    prior_orders
    .groupby(['user_id', 'product_id'])['order_number']
    .min()
    .to_frame('first_order_number')
    .reset_index()
)
product_first_order_num.head()

Unnamed: 0,user_id,product_id,first_order_number
0,10,260.0,4
1,10,1529.0,2
2,10,5646.0,2
3,10,5818.0,2
4,10,7632.0,2


In [94]:
# Total number of orders per user (largest order_number per user).
# NOTE: an identical statement already computes total_orders in an earlier
# cell; this recomputation is redundant but gives the same result.
total_orders = (
    prior_orders
    .groupby('user_id')['order_number']
    .max()
    .rename('total_orders')
    .reset_index()
)
total_orders.head()

Unnamed: 0,user_id,total_orders
0,10,4
1,19,9
2,68,2
3,75,12
4,90,62


In [95]:
# Combine total_orders with product_first_order_num into user_product_df.
# The right join keeps every (user, product) row of product_first_order_num.
user_product_df = total_orders.merge(
    product_first_order_num,
    how='right',
    on='user_id',
)
user_product_df.head()

Unnamed: 0,user_id,total_orders,product_id,first_order_number
0,10,4,260.0,4
1,10,4,1529.0,2
2,10,4,5646.0,2
3,10,4,5818.0,2
4,10,4,7632.0,2


In [96]:
# Order range: how many orders the user placed from the first purchase of the
# product onwards. The +1 counts the order in which it was first bought.
user_product_df['order_range'] = (
    user_product_df['total_orders']
    .sub(user_product_df['first_order_number'])
    .add(1)
)
user_product_df.head()

Unnamed: 0,user_id,total_orders,product_id,first_order_number,order_range
0,10,4,260.0,4,1
1,10,4,1529.0,2,3
2,10,4,5646.0,2,3
3,10,4,5818.0,2,3
4,10,4,7632.0,2,3


In [97]:
# Number of times each user has bought each product: count the order rows
# per (user, product) pair.
number_of_times = (
    prior_orders
    .groupby(['user_id', 'product_id'])['order_id']
    .count()
    .to_frame('times_bought')
    .reset_index()
)
number_of_times.head()

Unnamed: 0,user_id,product_id,times_bought
0,10,260.0,1
1,10,1529.0,2
2,10,5646.0,1
3,10,5818.0,1
4,10,7632.0,1


In [116]:
# Add the per-pair columns of user_product_df to number_of_times, matching on
# both user_id and product_id.
uxp_ratio = number_of_times.merge(
    user_product_df,
    how='left',
    on=['user_id', 'product_id'],
)
uxp_ratio.head()

Unnamed: 0,user_id,product_id,times_bought,total_orders,first_order_number,order_range
0,10,260.0,1,4,4,1
1,10,1529.0,2,4,2,3
2,10,5646.0,1,4,2,3
3,10,5818.0,1,4,2,3
4,10,7632.0,1,4,2,3


In [117]:
# Reorder ratio per (user, product) pair: times the product was bought divided
# by the number of orders placed since it was first bought.
uxp_ratio['users_reorder_ratio'] = uxp_ratio['times_bought'].div(uxp_ratio['order_range'])
uxp_ratio.head()

Unnamed: 0,user_id,product_id,times_bought,total_orders,first_order_number,order_range,users_reorder_ratio
0,10,260.0,1,4,4,1,1.0
1,10,1529.0,2,4,2,3,0.666667
2,10,5646.0,1,4,2,3,0.333333
3,10,5818.0,1,4,2,3,0.333333
4,10,7632.0,1,4,2,3,0.333333


In [119]:
# Build final_data by attaching the per-(user, product) columns of uxp_ratio
# to the per-user table users_order_number.
# The merge starts from users_order_number rather than from final_data itself,
# so re-running this cell gives the same frame instead of merging uxp_ratio
# into an already-merged final_data (which would add suffixed duplicate
# columns). When the cells are run once in order the result is unchanged,
# because final_data was set to users_order_number beforehand.
final_data = pd.merge(users_order_number, uxp_ratio, on=['user_id'], how='left')
final_data.head()

Unnamed: 0,user_id,users_order_time,product_id,times_bought,total_orders,first_order_number,order_range,users_reorder_ratio
0,10,4,260.0,1,4,4,1,1.0
1,10,4,1529.0,2,4,2,3,0.666667
2,10,4,5646.0,1,4,2,3,0.333333
3,10,4,5818.0,1,4,2,3,0.333333
4,10,4,7632.0,1,4,2,3,0.333333
