# DATA Preprocessing & Personalize Test ver 1
* 정기 일정 : 22/11/24 ~ 22/12/13
* 작성자 : 전유빈

## DATA LOAD

In [16]:
import pandas as pd
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

raw_item = pd.read_csv('s3://poc-descente/raw_data/item.csv', encoding='cp949')
raw_order = pd.read_csv('s3://poc-descente/raw_data/order.csv', encoding='cp949')
raw_users = pd.read_csv('s3://poc-descente/raw_data/users_1125.csv', encoding='cp949')

  exec(code_obj, self.user_global_ns, self.user_ns)


## Data Filtering
#### 고객사 협의사항
- Brand : Y (영애슬릿) 현재 운영하지 않으므로 제외
- 인당 구매 개수 비정상 고객 ID (비회원 구매) 제외
- PROD_CD : 첫글자 7(17년도)이하 및 TK 상품 제외
- 가격 0 인 제품 제외
- 상품 코드가 아닌 상품 명으로 추천

In [17]:
## item
item = raw_item[raw_item['ITEM_NM'] != 'TICKET'] ## TK(TICKET)상품 제외 (가격 0인 제품도 같이 제외되었음)
item = item[item['PROD_CD'].str.split('',n = 3, expand = True)[2].isin(['0','M','N','O'])] ## 연도 정보 0, M, N, O에 해당되는 데이터만 추출
print(item.shape)

## user
users = raw_users[raw_users['AGE'] != 2022]  ## 이상치 데이터 제외
print(users.shape)

## order
raw_order['BRAND_CD'] = raw_order['PROD_CD'].str.split('', n=2, expand=True)[1] ## 브랜드코드 추출
order = raw_order[raw_order['BRAND_CD']!='Y'] ## 브랜드 Y(영애슬릿) 제외
order = order[order['USR_ID']!=raw_order['USR_ID'].value_counts().head(1).index[0]] ## 인당 구매개수 비정상 고객(비회원 구매) 제외
order = order[order['PROD_CD'].str[:2] != 'TK'] ## TK(TICKET)상품 제외
order = order[order['PROD_CD'] != 2800] ## 이상치 데이터 제외

print(order.shape)

(25780, 13)
(482809, 10)
(371863, 11)


#### Personalize 요구사항
- 1,000개 이상의 Interaction 데이터셋 (충족 완료)
- 최소 25개의 UserID(충족 완료)
- UserID당 최소 2개의 상호작용 필요 (필터링 진행)

1) 전체 데이터셋 학습

2) 추천모델 고려사항 반영
* 인당 5번 구매 미만 고객 제외 후 진행
* 동시 구매 된 아이템 건수 5회 미만일 경우 제외
3) 교집합 데이터만 학습

## 기본 전처리

In [18]:
order['REG_DT'] = pd.to_datetime(order['REG_DT'])
item['REG_DT'] = pd.to_datetime(item['REG_DT'])
users['REG_DT'] = pd.to_datetime(users['REG_DT'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## 전체 데이터셋 학습 (order 데이터)

In [5]:
total_order = pd.DataFrame()

total_order['TIMESTAMP'] = [datetime.datetime.timestamp(i) for i in order['REG_DT']]
total_order['TIMESTAMP'] = total_order['TIMESTAMP'].astype('long')
total_order['USER_ID'] = order['USR_ID'].astype('str')
total_order['ITEM_ID'] = order['PROD_CD'].astype('str')
total_order['EVENT_TYPE'] = 'Purchase'
total_order.to_csv('s3://poc-descente/train_data/interactions/order_cd_1.csv')

temp = pd.DataFrame(total_order['USER_ID'].value_counts())
total_order_2 = total_order[total_order['USER_ID'].isin(temp[temp['USER_ID'] >= 2].index)].copy()
total_order_2.to_csv('s3://poc-descente/train_data/interactions/order_cd_2.csv')

temp = pd.DataFrame(total_order['USER_ID'].value_counts())
total_order_5 = total_order[total_order['USER_ID'].isin(temp[temp['USER_ID'] >= 5].index)].copy()
total_order_5.to_csv('s3://poc-descente/train_data/interactions/order_cd_5.csv')

In [6]:
order = pd.merge(item[['PROD_CD','PROD_NM']], order, on = 'PROD_CD', how = 'right') ## 상품명 가져오기
order = order[~order['PROD_NM'].isna()] ## 상품명이 매핑되지 않는 데이터 11,893건 삭제

total_order = pd.DataFrame()

total_order['TIMESTAMP'] = [datetime.datetime.timestamp(i) for i in order['REG_DT']]
total_order['TIMESTAMP'] = total_order['TIMESTAMP'].astype('long')
total_order['USER_ID'] = order['USR_ID'].astype('str')
total_order['ITEM_ID'] = order['PROD_NM'].astype('str')
total_order['EVENT_TYPE'] = 'Purchase'
total_order.to_csv('s3://poc-descente/train_data/interactions/order_nm_1.csv')

temp = pd.DataFrame(total_order['USER_ID'].value_counts())
total_order_2 = total_order[total_order['USER_ID'].isin(temp[temp['USER_ID'] >= 2].index)].copy()
total_order_2.to_csv('s3://poc-descente/train_data/interactions/order_nm_2.csv')

temp = pd.DataFrame(total_order['USER_ID'].value_counts())
total_order_5 = total_order[total_order['USER_ID'].isin(temp[temp['USER_ID'] >= 5].index)].copy()
total_order_5.to_csv('s3://poc-descente/train_data/interactions/order_nm_5.csv')

## 전체 데이터 학습 (user 데이터)

In [7]:
users['GRADE'] = users['DST_GRADE'] + ' | ' +users['LCS_GRADE'] + ' | ' + users['UMB_GRADE'] + ' | ' +users['DSG_GRADE'] + ' | ' + users['LCG_GRADE'] + ' | ' +users['MSW_GRADE']
total_users = users[['USR_ID', 'GENDER', 'AGE', 'GRADE']].copy()
total_users.rename({'USR_ID' : 'USER_ID'}, axis = 1, inplace = True)
total_users.to_csv('s3://poc-descente/train_data/users/users_1.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
total_users_60 = total_users[total_users['AGE'] < 60].copy()
total_users_60.to_csv('s3://poc-descente/train_data/users/users_60.csv')

## 전체 데이터 학습 (item 데이터)

In [19]:
item['CATEGORY_S'] = item['CATEGORY_S'].str.replace(',','|')
item['CATEGORY_S'] = item['CATEGORY_S'].str.replace('/','|')
item['CATEGORY_S'].fillna("",inplace = True)
item['CATEGORY_S'] = ['|'.join(list(set(i))) for i in list(item['CATEGORY_S'].str.split("|"))]
    

In [78]:
# # item['ITEM_NM'] = np.where(item['ITEM_NM'] == '신발', 'SHOES', item['ITEM_NM'])
# item['ITEM_NM'] = np.where(item['ITEM_NM'].isin(['캐디백', '트롤리백', '보스턴백', '미니백/파우치',
#                                                  'BACKPACK', 'TROLLY BOSTON BAG','HALF BAG',
#                                                  'CADDYBAG', 'BOSTONBAG']), 'BAG', item['ITEM_NM'])
# item['ITEM_NM'] = np.where(item['ITEM_NM'].isin(['캐디백', '트롤리백', '보스턴백', '미니백/파우치',
#                                                  'BACKPACK', 'TROLLY BOSTON BAG','HALF BAG',
#                                                  'CADDYBAG', 'BOSTONBAG']), 'BAG', item['ITEM_NM'])

## Personalize

In [50]:
import boto3
## 모델 총 3 + 8 = 11개
recipeArn_lst = [ ## recommender는 HPO 없음
    'arn:aws:personalize:::recipe/aws-ecomm-popular-items-by-purchases', ## 고객이 품목을 구매한 횟수를 기준으로 아이템 추천 (userid만 사용, itemid 사용하지 않음)
    'arn:aws:personalize:::recipe/aws-ecomm-frequently-bought-together', ## 고객이 자주 구매하는 아이템과 셀러가 지정한 아이템 추천(userid, itemid 모두 사용)
    'arn:aws:personalize:::recipe/aws-ecomm-recommended-for-you' ## 지정된 사용자 기반으로 항목에 대한 개인화된 추천. 
                                                                ## Amazon Personalize Personalize가 지정한 userId 기준으로 사용자가 구매한 품목을 자동으로 필터링
                                                                ## 사용자 기반으로 개인화 된 추천 (View 데이터가 있을 경우 성능이 더 높음)
]

solution_lst = [
    'arn:aws:personalize:::recipe/aws-user-personalization', ## 맞춤형 추천 시나리오 (Interaction, users, item 데이터 모두 사용)
    'arn:aws:personalize:::recipe/aws-popularity-count', ## 모든 사용자의 행동 데이터 기반으로 가장 인기있는 항목 추천 (사용자와 상호작용이 가장 많은 품목) HPO 적용 불가
    # 'arn:aws:personalize:::recipe/aws-hrnn', ## 계층적 반복 신경망으로 콘솔에서는 더이상 지원하지 않음, hrnn의 업그레이드 버전이 user-personalization
    'arn:aws:personalize:::recipe/aws-hrnn-metadata', ## 고품질 메타데이터를 사용할 수 있을 때 비 메타데이터 모델보다 정확도가 높음
    'arn:aws:personalize:::recipe/aws-hrnn-coldstart', ## 인기 추세를 파악하여 관련성이 적은 항목을 필터링으로 제외 (item dataset 필요)
    'arn:aws:personalize:::recipe/aws-personalized-ranking', ## 아이템의 순위를 정하여 추천하며 비정형 텍스트 메타데이터도 사용 가능 (한국어 지원 불가)
    'arn:aws:personalize:::recipe/aws-similar-items', ## 지정한 항목과 유사한 항목에 대한 추천
    # 'arn:aws:personalize:::recipe/aws-sims' ## sims의 업그레이드 버전이 similar-items
]
client = boto3.client('personalize')

In [51]:
data_name = 'order_nm_5'
base_name = f'{data_name}_recom'
num = 0
for model in recipeArn_lst:
    try:
        print(model.split('/')[-1])
        response = client.create_recommender(
            name=f'{base_name}_{num}',
            datasetGroupArn=f'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/{data_name}',
            recipeArn= model
        )
        num += 1
    except Exception as e:
        num += 1
        print(e)
        pass
    
for solution in solution_lst:
    try:
        print(solution.split('/')[-1])
        if solution.split('/')[-1] == 'aws-popularity-count':
            hpo_mode = False
        else:
            hpo_mode = True
        response = client.create_solution(
            name = f'{base_name}_{num}',
            performHPO = hpo_mode,
            recipeArn = solution,
            datasetGroupArn = f'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/{data_name}')
        
        response = client.create_solution_version(
            solutionArn=f'arn:aws:personalize:ap-northeast-2:217278714774:solution/{base_name}_{num}',
            trainingMode='FULL'    
        )
        num += 1
    except Exception as e:
        print(e)
        num += 1


aws-ecomm-popular-items-by-purchases
aws-ecomm-frequently-bought-together
aws-ecomm-recommended-for-you
aws-user-personalization
aws-popularity-count
aws-hrnn-metadata
aws-hrnn-coldstart
An error occurred (InvalidInputException) when calling the CreateSolution operation: This recipe requires an ITEMS dataset. Please update your datasetGroup or choose another recipe.
aws-personalized-ranking
aws-similar-items


In [57]:
data = []
model = []
ndcg_5 = []
ndcg_10 = []
ndcg_25 = []
precision_5 = []
precision_10 = []
precision_25 = []
coverage = []

for results in client.list_recommenders()['recommenders']:
    response = client.describe_recommender(recommenderArn=results['recommenderArn'])
    data.append('_'.join(results['name'].split('_')[:3]))
    model.append(results['recipeArn'].split('/')[-1])
    ndcg_5.append(response['recommender']['modelMetrics']['normalized_discounted_cumulative_gain_at_5'])
    ndcg_10.append(response['recommender']['modelMetrics']['normalized_discounted_cumulative_gain_at_10'])
    ndcg_25.append(response['recommender']['modelMetrics']['normalized_discounted_cumulative_gain_at_25'])
    precision_5.append(response['recommender']['modelMetrics']['precision_at_5'])
    precision_10.append(response['recommender']['modelMetrics']['precision_at_10'])
    precision_25.append(response['recommender']['modelMetrics']['precision_at_25'])
    coverage.append(response['recommender']['modelMetrics']['coverage'])
    
for results in client.list_solution_versions()['solutionVersions']:
    try:
        response = client.get_solution_metrics(solutionVersionArn=results['solutionVersionArn'])
        ndcg_5.append(response['metrics']['normalized_discounted_cumulative_gain_at_5'])
        ndcg_10.append(response['metrics']['normalized_discounted_cumulative_gain_at_10'])
        ndcg_25.append(response['metrics']['normalized_discounted_cumulative_gain_at_25'])
        precision_5.append(response['metrics']['precision_at_5'])
        precision_10.append(response['metrics']['precision_at_10'])
        precision_25.append(response['metrics']['precision_at_25'])
        coverage.append(response['metrics']['coverage'])
        response = client.describe_solution_version(solutionVersionArn=results['solutionVersionArn'])
        data.append('_'.join(response['solutionVersion']['solutionVersionArn'].split('/')[-2].split('_')[:3]))
        model.append(response['solutionVersion']['recipeArn'].split('/')[-1])       
    except Exception as e:
        print(results['solutionVersionArn'])
        print(e)
    
for results in client.list_solution_versions(nextToken = client.list_solution_versions()['nextToken'])['solutionVersions']:
        response = client.get_solution_metrics(solutionVersionArn=results['solutionVersionArn'])
        ndcg_5.append(response['metrics']['normalized_discounted_cumulative_gain_at_5'])
        ndcg_10.append(response['metrics']['normalized_discounted_cumulative_gain_at_10'])
        ndcg_25.append(response['metrics']['normalized_discounted_cumulative_gain_at_25'])
        precision_5.append(response['metrics']['precision_at_5'])
        precision_10.append(response['metrics']['precision_at_10'])
        precision_25.append(response['metrics']['precision_at_25'])
        coverage.append(response['metrics']['coverage'])
        response = client.describe_solution_version(solutionVersionArn=results['solutionVersionArn'])
        data.append('_'.join(response['solutionVersion']['solutionVersionArn'].split('/')[-2].split('_')[:3]))
        model.append(response['solutionVersion']['recipeArn'].split('/')[-1])

In [59]:
results_df = pd.DataFrame(columns = ['data', 'model','ndcg_5','ndcg_10','ndcg_25','precision_5','precision_10','precision_25','coverage'])

for i in list(results_df):
    results_df[i] = globals()[i]

In [63]:
results_df[results_df['data'] == 'order_nm_5'].sort_values('model').to_csv('temp.csv')

In [52]:
# results_df.to_csv('order_results.csv')

In [65]:
## 리소스 정리하기
data_lst = client.list_dataset_groups()

for data_group in data_lst['datasetGroups']:
    try:
        recom_lst = client.list_recommenders(datasetGroupArn = data_group['datasetGroupArn'])['recommenders']
        for recom in recom_lst: 
            try: 
                client.stop_recommender(recommenderArn=recom['recommenderArn']) 
            except Exception as e:
                # print(e)
                pass
        
#         for recom in recom_lst: client.delete_recommender(recommenderArn=recom['recommenderArn'])     
    except:
        pass
#     try:    
#         sol_lst = client.list_solutions(datasetGroupArn = data_group['datasetGroupArn'])['solutions']
#         for sol in sol_lst: client.delete_solution(solutionArn=sol['solutionArn'])        
#     except:
#         pass
#     try:        
#         dataset_lst = client.list_datasets(datasetGroupArn = data_group['datasetGroupArn'])['datasets']                           
#         for dataset in dataset_lst: client.delete_dataset(datasetArn=dataset['datasetArn'])                
#     except:
#         pass
#     try:
#         client.delete_dataset_group(datasetGroupArn=data_group['datasetGroupArn'])
#     except:
#         pass