In [1]:
import pandas as pd
import numpy as np
import openpyxl
from typing import List

### 주말 고객 데이터로 평일 고객 행동을 예측할 수 있게 샘플 데이터를 Raking하는 코드입니다.
- 해당 방법으로 평일 고객 별점 4.552510671877401이 예측되었고, 실제 별점은 4.430188656725918입니다.
- 계산된 weight matrix를 제시했습니다.

In [2]:
df = pd.read_excel('C:/2023_ADST_NaverPlaceReview.xlsx')
df

Unnamed: 0,place_id,name,store_category,user,user_review_cnt,user_photo_cnt,user_avg_rating,date,user_visit_cnt,review_type,review_category,rating
0,11707372,마포옥,"곰탕,설렁탕",화이팅,62,1,4.8,2021-02-06,1,영수증,전체,5.0
1,11707372,마포옥,"곰탕,설렁탕",heeeeeey,146,0,5.0,2021-02-05,1,영수증,전체,5.0
2,11707372,마포옥,"곰탕,설렁탕",hjian1967,55,0,4.8,2021-01-31,1,영수증,전체,5.0
3,11707372,마포옥,"곰탕,설렁탕",이마리,37,4,4.6,2021-01-30,1,영수증,전체,5.0
4,11707372,마포옥,"곰탕,설렁탕",karwp,821,0,3.9,2021-01-29,1,영수증,전체,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
364217,1395030685,옹근달,"카페,디저트",하루,32,10,3.8,2020-05-19,1,영수증,위치,4.5
364218,1395030685,옹근달,"카페,디저트",박숴,844,383,4.3,2020-08-22,1,영수증,전망,4.0
364219,1395030685,옹근달,"카페,디저트",태히,284,5,3.8,2020-08-06,1,영수증,전망,3.0
364220,1395030685,옹근달,"카페,디저트",sh6230,14,4,3.5,2020-07-03,1,영수증,전망,5.0


In [3]:
df.columns

Index(['place_id', 'name', 'store_category', 'user', 'user_review_cnt',
       'user_photo_cnt', 'user_avg_rating', 'date', 'user_visit_cnt',
       'review_type', 'review_category', 'rating'],
      dtype='object')

In [4]:
#범주형 값 치환
df=df.assign(review_grouped= np.where(df['review_type']=='영수증', 0, 
                                     np.where(df['review_type']=='주문',1,2)))

df=df.assign(avg_grouped = np.where(df['user_avg_rating']==5, 5, 
                                 np.where(df['user_avg_rating']>=4, 4,
                                          np.where(df['user_avg_rating']>=3, 3,
                                                   np.where(df['user_avg_rating']>=2, 2,
                                                           np.where(df['user_avg_rating']>=1, 1, 0))))))

In [5]:
print(df['review_grouped'].unique())
print(df['avg_grouped'].unique())

[0 1 2]
[4 5 3 2 0 1]


In [6]:
df['date'] = pd.to_datetime(df['date'])

# 주말과 평일 구분
group_s = df[df['date'].dt.dayofweek.isin([5, 6])]  # 토요일(5) 또는 일요일(6)
group_t = df[~df['date'].dt.dayofweek.isin([5, 6])]  # 토요일(5) 또는 일요일(6)이 아닌 날

# covariates 설정
z_s = group_s[['avg_grouped','review_grouped']]
z_t = group_t[['avg_grouped','review_grouped']]

In [7]:
# 그룹S의 weight 1회 조정하는 함수 정의
def sraking(w: List[float], 
            dat: List[int], 
            pdist: List[float]) -> List[float]:
    
    n = len(w)
    nu = len(pdist)
    frac = np.sum(w) / np.sum(pdist)
    ratio = np.array(pdist) * frac

    subtotal = np.zeros(nu)
    for i in range(nu):
        subtotal[i] = np.sum(w[dat == i])

    fw = np.zeros(n)
    for j in range(n):
        v = dat[j] - 1
        fw[j] = w[j] * ratio[v] / subtotal[v]

    return fw

In [8]:
#그룹 T에서 reference로 사용할 marginal total 비율을 구하는 함수 정의
def ref_marginal(dat):
    uq_prop = []
    for i in dat.columns:
        uq_prop.append(np.array(z_t[i].value_counts().sort_index()))
    return uq_prop

In [9]:
z_1 = np.array(z_s['avg_grouped'])
z_2 = np.array(z_s['review_grouped'])

ref_z_1 = ref_marginal(z_t)[0]
ref_z_2 = ref_marginal(z_t)[1]

print(ref_z_1)
print(ref_z_2)

[   161    287   2750  40185 161366  46607]
[219393  16721  15242]


In [10]:
w = np.ones(len(z_s))

for j in range(5):
    w = sraking(w, z_1, ref_z_1)
    w = sraking(w, z_2, ref_z_2)
    
A = np.array(group_s['rating'])
print(f"평균 별점은 {np.average(A,weights=w)}")


평균 별점은 4.552510671877401


In [11]:
B = np.array(group_t['rating'])
np.average(B)

4.430188656725918

In [12]:
z_s_w = z_s.assign(w = w)
z_s_w

Unnamed: 0,avg_grouped,review_grouped,w
0,4,0,2.511963
2,4,0,2.511963
3,4,0,2.511963
6,3,0,0.000456
7,3,0,0.000456
...,...,...,...
364213,4,0,2.511963
364214,4,0,2.511963
364216,4,0,2.511963
364218,4,0,2.511963


In [13]:
z_s_w = z_s.assign(w = w)
table = pd.pivot_table(z_s_w, 
                       values='w', 
                       index=['avg_grouped'], 
                       columns=['review_grouped'], 
                       aggfunc=np.mean, 
                       fill_value=0)
table

review_grouped,0,1,2
avg_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.027258,27.332416,0.016912
1,0.00279,2.797729,0.001731
2,3.895097,3905.69594,2.416729
3,0.000456,0.457621,0.000283
4,2.511963,2518.798244,1.558558
5,0.015113,15.153832,0.009377
