# 五、創建額外檔案
<hr style="border:2px solid gray">

In [1]:
# 載入所需套件
import pandas as pd
import pickle
import re

In [2]:
# 讀取數據
order_products_prior_df = pd.read_csv('./datasets/order_products__prior.csv')
order_products_train_df = pd.read_csv('./datasets/order_products__train.csv')
orders_df = pd.read_csv('./datasets/orders.csv')
products_df = pd.read_csv('./datasets/products.csv')
aisles_df = pd.read_csv('./datasets/aisles.csv')
departments_df = pd.read_csv('./datasets/departments.csv')

In [3]:
# 合併先前訂單和過去購買的產品數據
prd = orders_df.merge(order_products_prior_df, on='order_id', how='inner')
prd = prd.merge(products_df, on='product_id', how='left')
prd = prd.merge(aisles_df, on='aisle_id', how='left')
prd = prd.merge(departments_df, on='department_id', how='left')
print("Shape of prd :", prd.shape)
prd.head()

Shape of prd : (32434489, 15)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,soft drinks,beverages
1,2539329,1,prior,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,soy lactosefree,dairy eggs
2,2539329,1,prior,1,2,8,,12427,3,0,Original Beef Jerky,23,19,popcorn jerky,snacks
3,2539329,1,prior,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,23,19,popcorn jerky,snacks
4,2539329,1,prior,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,paper goods,household


# 5-1. 創建產品 ID 及其名稱的 pkl 檔
<hr style="border:2px solid gray">

In [4]:
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


In [5]:
# 將 '%' 替換為 'percent'
products_df['product_name'] = products_df['product_name'].str.replace('%', 'percent')
# 將 '-' 替換為 ' '
products_df['product_name'] = products_df['product_name'].str.replace('%', ' ')
# 清理產品名稱，去除逗號和其他不需要的符號
products_df['product_name'] = products_df['product_name'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x))

In [6]:
products_id_name = products_df[['product_id', 'product_name']]
products_id_name

Unnamed: 0,product_id,product_name
0,1,Chocolate Sandwich Cookies
1,2,AllSeasons Salt
2,3,Robust Golden Unsweetened Oolong Tea
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...
4,5,Green Chile Anytime Sauce
...,...,...
49683,49684,Vodka Triple Distilled Twist of Vanilla
49684,49685,En Croute Roast Hazelnut Cranberry
49685,49686,Artisan Baguette
49686,49687,Smartblend Healthy Metabolism Dry Cat Food


In [7]:
products_id_name.to_pickle("./datasets/products_id_name.pkl")

# 5-2. 創建該禮拜及時間前十熱銷產品名稱的 pkl 檔
<hr style="border:2px solid gray">

In [8]:
x = prd.groupby(['order_dow', 'order_hour_of_day', 'product_name']).size().reset_index(name='count')
x = x.sort_values(by=['order_dow', 'order_hour_of_day', 'count'], ascending=False)
top10 = x.groupby(['order_dow', 'order_hour_of_day']).head(10).reset_index(drop=True)
top10[(top10['order_dow'] == 0) & (top10['order_hour_of_day'] == 10)]

Unnamed: 0,order_dow,order_hour_of_day,product_name,count
1570,0,10,Banana,8075
1571,0,10,Bag of Organic Bananas,5656
1572,0,10,Organic Baby Spinach,4659
1573,0,10,Organic Strawberries,4238
1574,0,10,Organic Hass Avocado,3444
1575,0,10,Organic Avocado,3269
1576,0,10,Large Lemon,2927
1577,0,10,Limes,2838
1578,0,10,Organic Yellow Onion,2333
1579,0,10,Organic Garlic,2251


In [9]:
top10.to_pickle("./datasets/top10_products.pkl")

# 5-3. 創建用戶最後一次購買的日期 pkl 檔
<hr style="border:2px solid gray">

給定用戶最後一次訂購的日期為部署的日期 2024-06-14。

In [10]:
user_last_purchase = pd.DataFrame(columns = ['user_id','date'])
user_last_purchase['user_id'] = orders_df['user_id'].unique()
user_last_purchase['date'] = '2024-06-14'
user_last_purchase

Unnamed: 0,user_id,date
0,1,2024-06-14
1,2,2024-06-14
2,3,2024-06-14
3,4,2024-06-14
4,5,2024-06-14
...,...,...
206204,206205,2024-06-14
206205,206206,2024-06-14
206206,206207,2024-06-14
206207,206208,2024-06-14


In [11]:
user_last_purchase.to_pickle("./datasets/user_last_purchase.pkl")