In [5]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch on Colab's data_table formatter for DataFrame cell outputs.
from google.colab import data_table
data_table.enable_dataframe_formatter()

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw event log from Drive; event_time is parsed into datetimes while reading.
df = pd.read_csv(
    '/content/drive/MyDrive/cp2_log_data.csv',
    parse_dates=['event_time'],
)


In [None]:
# Working frame used to turn events into ratings.
# Select the three needed columns first and copy only those: copying the whole
# log and then discarding most of it briefly holds two full copies in memory.
df_v = df[['user_id', 'category_code', 'event_type']].copy()

In [None]:
# Keep one row per distinct (user, category, event type) combination,
# so repeating the same kind of event is counted once.
df_v = df_v.drop_duplicates()

# Number of remaining combinations for each event type.
df_v['event_type'].value_counts()

view        5794964
purchase     403409
cart         376926
Name: event_type, dtype: int64

In [None]:
# Weight assigned to each event type; purchase weighs most, view least.
# Reference: https://www.kaggle.com/code/gspmoreira/recommender-systems-in-python-101/notebook
event_type_strength = {
    'view': 1.5,
    'cart': 1.6,
    'purchase': 1.9,
}

# This cell overwrites event_type in place. Convert only while the column still
# holds event names; otherwise re-running the cell would look up the float
# weights in the mapping and fail.
if not pd.api.types.is_numeric_dtype(df_v['event_type']):
    df_v['event_type'] = df_v['event_type'].map(event_type_strength)

# .map() yields NaN for a value missing from the mapping; stop here instead of
# letting it silently contribute nothing to the summed score.
assert df_v['event_type'].notna().all(), 'event_type contains values without a weight'

In [None]:
# Sum the event weights per (user, category) to get a preference score.
df_v_g = df_v.groupby(['user_id', 'category_code'], as_index=False)['event_type'].sum()
df_v_g



Unnamed: 0,user_id,category_code,event_type
0,33869381,kids.carriage,1.5
1,184265397,furniture.living_room.chair,1.5
2,184265397,unknown.unknown.unknown,1.5
3,195082191,electronics.audio.headphone,1.5
4,200673532,computers.components.motherboard,1.5
...,...,...,...
5795272,566280536,furniture.living_room.sofa,1.5
5795273,566280663,electronics.smartphone,1.5
5795274,566280676,furniture.bedroom.bed,1.5
5795275,566280697,electronics.camera.video,1.5


In [None]:
# Attach the per-(user, category) score to every row of the original log.
# Both frames carry an event_type column, so pandas suffixes them:
# event_type_x is the raw event name, event_type_y the aggregated score.
df_v_m = df.merge(df_v_g, how='left', on=['user_id', 'category_code'], sort=False)

df_v_m



Unnamed: 0.1,Unnamed: 0,event_time,event_type_x,product_id,category_id,category_code,brand,price,user_id,user_session,division1,division2,division3,event_type_y
0,0,2019-10-01 00:00:00+00:00,view,44600062,2103807459595387724,unknown.unknown.unknown,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,unknown,unknown,unknown,3.4
1,1,2019-10-01 00:00:00+00:00,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.20,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,appliances,environment,water_heater,1.5
2,2,2019-10-01 00:00:01+00:00,view,17200506,2053013559792632471,furniture.living_room.sofa,unknown,543.10,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,furniture,living_room,sofa,1.5
3,3,2019-10-01 00:00:01+00:00,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,computers,notebook,notebook,1.5
4,4,2019-10-01 00:00:04+00:00,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,electronics,smartphone,smartphone,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38672585,42448759,2019-10-31 23:59:58+00:00,view,2300275,2053013560530830019,electronics.camera.video,gopro,527.40,537931532,22c57267-da98-4f28-9a9c-18bb5b385193,electronics,camera,video,1.5
38672586,42448760,2019-10-31 23:59:58+00:00,view,10800172,2053013554994348409,unknown.unknown.unknown,redmond,61.75,527322328,5054190a-46cb-4211-a8f1-16fc1a060ed8,unknown,unknown,unknown,1.5
38672587,42448761,2019-10-31 23:59:58+00:00,view,5701038,2053013553970938175,auto.accessories.player,kenwood,128.70,566280422,05b6c62b-992f-4e8e-91f7-961bcb4719cd,auto,accessories,player,1.5
38672588,42448762,2019-10-31 23:59:59+00:00,view,21407424,2053013561579406073,electronics.clocks,tissot,689.85,513118352,4c14bf2a-2820-4504-929d-046356a5a204,electronics,clocks,clocks,1.5


In [None]:
# Keep only the rows whose top-level category (division1) is one of the targets.
# A single isin() membership test replaces six equality scans OR-ed together.
target_divisions = ['electronics', 'unknown', 'appliances',
                    'computers', 'apparel', 'furniture']
df_cat = df_v_m.loc[df_v_m['division1'].isin(target_divisions)]


In [None]:
# Renumber the filtered rows from 0, discarding the old index labels.
df_cat = df_cat.reset_index(drop=True)
df_cat



Unnamed: 0.1,Unnamed: 0,event_time,event_type_x,product_id,category_id,category_code,brand,price,user_id,user_session,division1,division2,division3,event_type_y
0,0,2019-10-01 00:00:00+00:00,view,44600062,2103807459595387724,unknown.unknown.unknown,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,unknown,unknown,unknown,3.4
1,1,2019-10-01 00:00:00+00:00,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.20,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,appliances,environment,water_heater,1.5
2,2,2019-10-01 00:00:01+00:00,view,17200506,2053013559792632471,furniture.living_room.sofa,unknown,543.10,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,furniture,living_room,sofa,1.5
3,3,2019-10-01 00:00:01+00:00,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,computers,notebook,notebook,1.5
4,4,2019-10-01 00:00:04+00:00,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,electronics,smartphone,smartphone,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35960178,42448758,2019-10-31 23:59:58+00:00,view,2702331,2053013563911439225,appliances.kitchen.refrigerators,lg,527.43,524356542,153f9818-4d32-4e8b-ba9f-f355094e8ae4,appliances,kitchen,refrigerators,1.5
35960179,42448759,2019-10-31 23:59:58+00:00,view,2300275,2053013560530830019,electronics.camera.video,gopro,527.40,537931532,22c57267-da98-4f28-9a9c-18bb5b385193,electronics,camera,video,1.5
35960180,42448760,2019-10-31 23:59:58+00:00,view,10800172,2053013554994348409,unknown.unknown.unknown,redmond,61.75,527322328,5054190a-46cb-4211-a8f1-16fc1a060ed8,unknown,unknown,unknown,1.5
35960181,42448762,2019-10-31 23:59:59+00:00,view,21407424,2053013561579406073,electronics.clocks,tissot,689.85,513118352,4c14bf2a-2820-4504-929d-046356a5a204,electronics,clocks,clocks,1.5


In [None]:
# event_type_y holds the aggregated score; expose it under the name "rating".
df_cat = df_cat.rename({'event_type_y': 'rating'}, axis='columns')

In [None]:
# Remove the columns that are no longer needed.
unused_columns = ['event_time', 'event_type_x', 'category_code', 'user_session', 'division3']
df_cat = df_cat.drop(columns=unused_columns)

# **함수화 진행**

In [4]:
import pandas as pd
import numpy as np
# 함수

def Surprise_dataset(df, number, event_type_strength=None, divisions=None):
    """Rate every (user, category) pair and write the rated rows to a CSV file.

    The rating of a (user_id, category_id) pair is the sum of the weights of
    the *distinct* event types that user produced in that category, so
    repeating the same kind of event does not raise the score. With the
    default weights a rating lies between 1.5 (view only) and 5.0
    (view + cart + purchase).

    Weighting idea taken from:
    https://www.kaggle.com/code/gspmoreira/recommender-systems-in-python-101/notebook

    Args:
        df: Event log containing at least the columns ``user_id``,
            ``category_id``, ``event_type``, ``division1``, ``event_time``
            and ``user_session``. The frame itself is not modified.
        number: Suffix of the output file name; the file is written to
            ``'last' + str(number) + '.csv'`` in the current working directory.
        event_type_strength: Optional mapping of event type to weight.
            Defaults to ``{'view': 1.5, 'cart': 1.6, 'purchase': 1.9}``.
        divisions: Optional iterable of ``division1`` values to keep.
            Defaults to electronics, unknown, appliances, computers,
            apparel and furniture.

    Returns:
        None. The result is only written to disk, without the index and
        without the ``event_time``, ``event_type`` and ``user_session``
        columns, and with a trailing ``rating`` column.

    Raises:
        KeyError: If ``df`` contains an event type that has no weight.
    """
    if event_type_strength is None:
        event_type_strength = {'view': 1.5, 'cart': 1.6, 'purchase': 1.9}
    if divisions is None:
        divisions = ('electronics', 'unknown', 'appliances',
                     'computers', 'apparel', 'furniture')

    keys = ['user_id', 'category_id']

    # Selecting columns already produces a new frame, so the input is never
    # touched and there is no need to copy the whole log first.
    events = df[keys + ['event_type']].drop_duplicates()

    # Fail loudly on an unweighted event type: .map() alone would turn it
    # into NaN and the sum below would quietly ignore it.
    unweighted = set(events['event_type'].unique()) - set(event_type_strength)
    if unweighted:
        raise KeyError(
            'no weight defined for event type(s): '
            + ', '.join(sorted(map(str, unweighted)))
        )

    # One score per (user, category): the sum of the distinct event weights.
    ratings = (
        events.assign(rating=events['event_type'].map(event_type_strength))
        .groupby(keys, as_index=False)['rating']
        .sum()
    )

    # Attach the score to every original row. The aggregate is already named
    # "rating", so the merge needs no _x/_y suffixes.
    rated = df.merge(ratings, how='left', on=keys, sort=False)

    # Keep only the requested top-level categories.
    rated = rated[rated['division1'].isin(list(divisions))]

    # Drop the columns that are not needed in the output file.
    rated = rated.drop(columns=['event_time', 'event_type', 'user_session'])

    rated.to_csv('last' + str(number) + '.csv', mode='w', index=False)
    

In [6]:
# Use df2 (users with 20+ log events and no purchase) and df3 (users with a purchase), both built in CP_EDA.ipynb

df2 = pd.read_csv('/content/drive/MyDrive/df2.csv')
df3 = pd.read_csv('/content/drive/MyDrive/df3.csv')

In [None]:
# Display df2 to check the loaded columns.
df2



Unnamed: 0,event_time,event_type,product_id,category_id,brand,price,user_id,user_session,division1,division2
0,2019-10-01 00:00:01+00:00,view,1307067,2053013558920217191,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,computers,notebook
1,2019-10-01 00:00:04+00:00,view,1004237,2053013555631882655,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,electronics,smartphone
2,2019-10-01 00:00:13+00:00,view,3900746,2053013552326770905,haier,102.38,555444559,98b88fa0-d8fa-4b9d-8a71-3dd403afab85,appliances,environment
3,2019-10-01 00:00:19+00:00,view,1306631,2053013558920217191,hp,580.89,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,computers,notebook
4,2019-10-01 00:00:19+00:00,view,1005135,2053013555631882655,apple,1747.79,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,electronics,smartphone
...,...,...,...,...,...,...,...,...,...,...
15528267,2019-10-31 23:59:58+00:00,view,2702331,2053013563911439225,lg,527.43,524356542,153f9818-4d32-4e8b-ba9f-f355094e8ae4,appliances,kitchen
15528268,2019-10-31 23:59:58+00:00,view,2300275,2053013560530830019,gopro,527.40,537931532,22c57267-da98-4f28-9a9c-18bb5b385193,electronics,camera
15528269,2019-10-31 23:59:58+00:00,view,10800172,2053013554994348409,redmond,61.75,527322328,5054190a-46cb-4211-a8f1-16fc1a060ed8,unknown,unknown
15528270,2019-10-31 23:59:59+00:00,view,21407424,2053013561579406073,tissot,689.85,513118352,4c14bf2a-2820-4504-929d-046356a5a204,electronics,clocks


In [7]:
# 데이터셋 평점화 작업
Surprise_dataset(df2, 2)
Surprise_dataset(df3, 3)

In [None]:
# Read back the ratings file produced for df2. It is written relative to the
# working directory, so it is read the same way instead of assuming that the
# working directory is /content.
df2_2 = pd.read_csv('last2.csv')

In [None]:
# Display the rated rows read back from the CSV file.
df2_2



Unnamed: 0,product_id,category_id,brand,price,user_id,division1,division2,rating
0,1307067,2053013558920217191,lenovo,251.74,550050854,computers,notebook,1.5
1,1004237,2053013555631882655,apple,1081.98,535871217,electronics,smartphone,1.5
2,3900746,2053013552326770905,haier,102.38,555444559,appliances,environment,1.5
3,1306631,2053013558920217191,hp,580.89,550050854,computers,notebook,1.5
4,1005135,2053013555631882655,apple,1747.79,535871217,electronics,smartphone,1.5
...,...,...,...,...,...,...,...,...
14333031,2702331,2053013563911439225,lg,527.43,524356542,appliances,kitchen,1.5
14333032,2300275,2053013560530830019,gopro,527.40,537931532,electronics,camera,1.5
14333033,10800172,2053013554994348409,redmond,61.75,527322328,unknown,unknown,1.5
14333034,21407424,2053013561579406073,tissot,689.85,513118352,electronics,clocks,1.5


In [None]:
# Read back the ratings file produced for df3. It is written relative to the
# working directory, so it is read the same way instead of assuming that the
# working directory is /content.
df3_3 = pd.read_csv('last3.csv')
df3_3



Unnamed: 0,product_id,category_id,brand,price,user_id,division1,division2,rating
0,44600062,2103807459595387724,shiseido,35.79,541312140,unknown,unknown,1.5
1,1480613,2053013561092866779,pulser,908.62,512742880,computers,desktop,1.5
2,44600062,2103807459595387724,shiseido,35.79,541312140,unknown,unknown,1.5
3,1480714,2053013561092866779,pulser,921.49,512742880,computers,desktop,1.5
4,1004739,2053013555631882655,xiaomi,197.55,519530528,electronics,smartphone,1.5
...,...,...,...,...,...,...,...,...
11122726,13104838,2053013553526341921,remain,207.98,525530673,unknown,unknown,1.5
11122727,1005014,2053013555631882655,samsung,503.09,533326659,electronics,smartphone,3.1
11122728,1002524,2053013555631882655,apple,531.26,565404816,electronics,smartphone,1.5
11122729,17200505,2053013559792632471,unknown,543.10,557880508,furniture,living_room,1.5
