# DATA Preprocessing & Personalize Test ver 1
* 정기 일정 : 22/11/24 ~ 22/12/13
* 작성자 : 전유빈

## DATA LOAD

In [2]:
import pandas as pd
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import boto3
client = boto3.client('personalize')

raw_order = pd.read_csv('s3://poc-descente/raw_data/order.csv', encoding='cp949')
raw_users = pd.read_csv('s3://poc-descente/raw_data/users_1125.csv', encoding='cp949')
raw_item = pd.read_csv('s3://poc-descente/raw_data/item_기타항목 분류 추가.csv', encoding='cp949')
item_meta = pd.read_excel('s3://poc-descente/raw_data/데상트 아이템 분류.xlsx', header=2)

## Data Filtering
#### 고객사 협의사항
- Brand : Y (영애슬릿) 현재 운영하지 않으므로 제외
- 인당 구매 개수 비정상 고객 ID (비회원 구매) 제외
- PROD_CD : 첫글자 7(17년도)이하 및 TK 상품 제외
- 가격 0 인 제품 제외
- 상품 코드가 아닌 상품 명으로 추천

In [243]:
## item
item = raw_item[raw_item['ITEM_NM'] != 'TICKET'] ## TK(TICKET)상품 제외 (가격 0인 제품도 같이 제외되었음)
item = item[item['PROD_CD'].str.split('',n = 3, expand = True)[2].isin(['0','M','N','O'])] ## 연도 정보 0, M, N, O에 해당되는 데이터만 추출
print(item.shape)

## user
users = raw_users[raw_users['AGE'] != 2022]  ## 이상치 데이터 제외
print(users.shape)

## order
raw_order['BRAND_CD'] = raw_order['PROD_CD'].str.split('', n=2, expand=True)[1] ## 브랜드코드 추출
order = raw_order[raw_order['BRAND_CD']!='Y'] ## 브랜드 Y(영애슬릿) 제외
order = order[order['USR_ID']!=raw_order['USR_ID'].value_counts().head(1).index[0]] ## 인당 구매개수 비정상 고객(비회원 구매) 제외
order = order[order['PROD_CD'].str[:2] != 'TK'] ## TK(TICKET)상품 제외
order = order[order['PROD_CD'] != 2800] ## 이상치 데이터 제외

print(order.shape)

(25780, 13)
(482809, 10)
(371863, 11)


#### Personalize 요구사항
- 1,000개 이상의 Interaction 데이터셋 (충족 완료)
- 최소 25개의 UserID(충족 완료)
- UserID당 최소 2개의 상호작용 필요 (필터링 진행)

1) 전체 데이터셋 학습

2) 추천모델 고려사항 반영
* 인당 5번 구매 미만 고객 제외 후 진행
* 동시 구매 된 아이템 건수 5회 미만일 경우 제외
3) 교집합 데이터만 학습

## 기본 전처리

In [244]:
order['REG_DT'] = pd.to_datetime(order['REG_DT'])
item['REG_DT'] = pd.to_datetime(item['REG_DT'])
users['REG_DT'] = pd.to_datetime(users['REG_DT'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## 전체 데이터셋 학습 (order 데이터)

In [61]:
total_order = pd.DataFrame()

total_order['TIMESTAMP'] = [datetime.datetime.timestamp(i) for i in order['REG_DT']]
total_order['TIMESTAMP'] = total_order['TIMESTAMP'].astype('long')
total_order['USER_ID'] = order['USR_ID'].astype('str')
total_order['ITEM_ID'] = order['PROD_CD'].astype('str')
total_order['EVENT_TYPE'] = 'Purchase'
total_order.to_csv('s3://poc-descente/train_data/interactions/order_cd_1.csv')

temp = pd.DataFrame(total_order['USER_ID'].value_counts())
total_order_2 = total_order[total_order['USER_ID'].isin(temp[temp['USER_ID'] >= 2].index)].copy()
total_order_2.to_csv('s3://poc-descente/train_data/interactions/order_cd_2.csv')

temp = pd.DataFrame(total_order['USER_ID'].value_counts())
total_order_5 = total_order[total_order['USER_ID'].isin(temp[temp['USER_ID'] >= 5].index)].copy()
total_order_5.to_csv('s3://poc-descente/train_data/interactions/order_cd_5.csv')

In [62]:
order = pd.merge(item[['PROD_CD','PROD_NM']], order, on = 'PROD_CD', how = 'right') ## 상품명 가져오기
order = order[~order['PROD_NM'].isna()] ## 상품명이 매핑되지 않는 데이터 11,893건 삭제

total_order = pd.DataFrame()

total_order['TIMESTAMP'] = [datetime.datetime.timestamp(i) for i in order['REG_DT']]
total_order['TIMESTAMP'] = total_order['TIMESTAMP'].astype('long')
total_order['USER_ID'] = order['USR_ID'].astype('str')
total_order['ITEM_ID'] = order['PROD_NM'].astype('str')
total_order['EVENT_TYPE'] = 'Purchase'
total_order.to_csv('s3://poc-descente/train_data/interactions/order_nm_1.csv')

temp = pd.DataFrame(total_order['USER_ID'].value_counts())
total_order_2 = total_order[total_order['USER_ID'].isin(temp[temp['USER_ID'] >= 2].index)].copy()
total_order_2.to_csv('s3://poc-descente/train_data/interactions/order_nm_2.csv')

temp = pd.DataFrame(total_order['USER_ID'].value_counts())
total_order_5 = total_order[total_order['USER_ID'].isin(temp[temp['USER_ID'] >= 5].index)].copy()
total_order_5.to_csv('s3://poc-descente/train_data/interactions/order_nm_5.csv')

## 전체 데이터 학습 (user 데이터)

In [338]:
users

Unnamed: 0,USR_ID,GENDER,AGE,DST_GRADE,LCS_GRADE,UMB_GRADE,DSG_GRADE,LCG_GRADE,MSW_GRADE,REG_DT
0,6WHuuJZ3nR2vwlBeY7GLAg==,W,74,bronze,vip,bronze,bronze,bronze,bronze,2021-05-04
1,rhvPq0R8ONQjwtyu+iZGcQ==,W,63,bronze,bronze,bronze,bronze,bronze,bronze,2021-05-04
2,6IcDN5NhZp+QwBR4ACAgaA==,M,34,bronze,bronze,bronze,bronze,bronze,bronze,2021-05-04
3,1oRStZxM2T8enn5AFaIe4Q==,W,45,bronze,bronze,bronze,bronze,bronze,bronze,2021-05-04
4,qkMdfOuMizhZNDNcN/Ta8w==,W,47,bronze,bronze,bronze,bronze,bronze,bronze,2021-05-04
...,...,...,...,...,...,...,...,...,...,...
482846,XyTZhY+n6MOAiLkWxjO7XQ==,W,50,bronze,bronze,bronze,bronze,bronze,bronze,2022-11-23
482847,MLhm0/SNAQRGoz58OPssXw==,W,34,bronze,bronze,bronze,bronze,bronze,bronze,2022-11-23
482848,fFgdB2D8RxHsWxox8Z9KCw==,W,62,bronze,bronze,bronze,bronze,bronze,bronze,2022-11-23
482849,j2vqG2b7KGBYTZe7zMHANw==,W,38,bronze,bronze,bronze,bronze,bronze,bronze,2022-11-23


In [137]:
users['GRADE'] = users['DST_GRADE'] + ' | ' +users['LCS_GRADE'] + ' | ' + users['UMB_GRADE'] + ' | ' +users['DSG_GRADE'] + ' | ' + users['LCG_GRADE'] + ' | ' +users['MSW_GRADE']
total_users = users[['USR_ID', 'GENDER', 'AGE', 'GRADE']].copy()
total_users.rename({'USR_ID' : 'USER_ID'}, axis = 1, inplace = True)
total_users.to_csv('s3://poc-descente/train_data/users/users_1.csv')

In [157]:
total_users.shape

(482809, 4)

In [138]:
total_users_60 = total_users[total_users['AGE'] < 60].copy()
total_users_60.to_csv('s3://poc-descente/train_data/users/users_60.csv')

In [158]:
total_users_60.shape

(457059, 4)

## 전체 데이터 학습 (item 데이터)
* 상세 > 아이템명 > CATEGORY_S

In [245]:
item['아이템코드'] = item['PROD_CD'].str.split('',expand = True)[7] + item['PROD_CD'].str.split('',expand = True)[8]
item['브랜드'] = item['PROD_CD'].str.split('',expand = True)[1]
item_join = pd.merge(item, item_meta, how = 'left', left_on = ['아이템코드', '브랜드'], right_on = ['아이템코드', '브랜드'])

In [246]:
item_join[['PROD_NM','상세','아이템명','CATEGORY_S','GENDER','BRND_NM','MALL_SALE_PRC']].sort_values('MALL_SALE_PRC').shape

(25780, 7)

In [247]:
# 동일 PROD_NM, BRND_NM에 대해  다른 REG_DT와 MALL_SALE_PRC로 데이터 중복 적재
item_join[(item_join['PROD_NM'] == '메종 키즈') & (item_join['BRND_NM'] == '르꼬끄')][['PROD_NM','상세','아이템명','CATEGORY_S','GENDER','BRND_NM','MALL_SALE_PRC','REG_DT']].drop_duplicates()

Unnamed: 0,PROD_NM,상세,아이템명,CATEGORY_S,GENDER,BRND_NM,MALL_SALE_PRC,REG_DT
14397,메종 키즈,,,"르꼬끄 아동화,주니어",UNISEX,르꼬끄,27600,2020-07-01
14418,메종 키즈,,,"르꼬끄 아동화,주니어",UNISEX,르꼬끄,41400,2021-07-21
14420,메종 키즈,,,"르꼬끄 아동화,주니어",UNISEX,르꼬끄,41400,2021-01-04


In [248]:
## 등록 일자에 따라 CATEGORY_S의 값 또한 다름
item_join[(item_join['PROD_NM'] == '남녀공용 3X3 베이직 바람막이') & (item_join['BRND_NM'] == '데상트')][['PROD_NM','상세','아이템명','CATEGORY_S','GENDER','BRND_NM','MALL_SALE_PRC','REG_DT']].drop_duplicates()

Unnamed: 0,PROD_NM,상세,아이템명,CATEGORY_S,GENDER,BRND_NM,MALL_SALE_PRC,REG_DT
10832,남녀공용 3X3 베이직 바람막이,,WIND BREAK,"바람막이/자켓,재킷",UNISEX,데상트,101400,2020-02-05
18186,남녀공용 3X3 베이직 바람막이,,WIND BREAK,"농구,바람막이/자켓,재킷",UNISEX,데상트,71600,2020-07-20


In [249]:
# temp = item_join[item_join['상세'].isna()][['PROD_NM','BRND_NM','REG_DT']].drop_duplicates().value_counts(['PROD_NM','BRND_NM']).reset_index()
# temp.sort_values(0, inplace = True, ascending=False)
# temp[temp[0]>1].head()
# ###
# temp = item_join[item_join['상세'].isna()][['PROD_NM','BRND_NM','MALL_SALE_PRC']].drop_duplicates().value_counts(['PROD_NM','BRND_NM']).reset_index()
# temp.sort_values('PROD_NM', inplace = True, ascending=False)
# temp[temp[0]>1].head()

In [250]:
temp = item_join.groupby(['PROD_CD','BRND_NM','GENDER','상세','아이템명','CATEGORY_S', 'PROD_NM'])['REG_DT'].max().reset_index()
item_new = pd.merge(temp, item_join, how = 'left')
item_new.shape

(24944, 18)

In [251]:
item_rm_dupli = item_new[['PROD_NM','상세','아이템명','CATEGORY_S','GENDER','BRND_NM','MALL_SALE_PRC']].drop_duplicates()
item_rm_dupli.shape

(11464, 7)

In [252]:
item_rm_dupli.head(10)

Unnamed: 0,PROD_NM,상세,아이템명,CATEGORY_S,GENDER,BRND_NM,MALL_SALE_PRC
0,남성 기하학패턴 브이넥 니트,스웨터,KNIT PULLOVER,"니트,니트스웨터",MAN,데상트골프,143400
2,남성 조직감 변형 라운드넥 니트,스웨터,KNIT PULLOVER,"니트,니트스웨터",MAN,데상트골프,131400
4,남성 유니크넥 긴팔티셔츠,긴팔라운드티셔츠,LONG SLEEVE T SHIRT,긴팔티셔츠,MAN,데상트골프,119400
6,남성 싱귤러 메신저백,가방,BAG,파우치백,MAN,데상트골프,71400
7,남성 G-ARC 프리미엄 보아 다이얼 점퍼,자켓,BLOUSON,"바람막이,자켓,재킷",MAN,데상트골프,467400
9,남성 싱귤러 백팩,보스턴백,BOSTONBAG,보스턴백,MAN,데상트골프,149400
10,남성 MAMI 캘리그래피 스탠드 캐디백,캐디백,CADDYBAG,캐디백,MAN,데상트골프,299400
11,남성 싱귤러 컴팩트 투어백,캐디백,CADDYBAG,캐디백,MAN,데상트골프,311400
13,남성 하프 스탠드 캐디백,캐디백,CADDYBAG,캐디백,MAN,데상트골프,131400
15,남성 무브 스포츠 스탠드백,캐디백,CADDYBAG,캐디백,MAN,데상트골프,239400


In [254]:
## 공백값은 제거 후 진행
train_item = item_rm_dupli.copy()
print(train_item.shape)
train_item.dropna(axis = 0, inplace = True)
print(train_item.shape) ## 총 836개 제거
train_item['CATEGORY_S'] = train_item['CATEGORY_S'].str.replace(',','|')
train_item['CATEGORY_S'] = train_item['CATEGORY_S'].str.replace('/','|')
train_item['CATEGORY_S'] = ['|'.join(list(set(i))) for i in list(train_item['CATEGORY_S'].str.split("|"))]

train_item['아이템명'] = train_item['아이템명'].str.replace(' / ','|')
train_item['아이템명'] = train_item['아이템명'].str.replace('/','|')
train_item['아이템명'] = train_item['아이템명'].str.replace('  ',' ')

train_item['아이템명'] = ['|'.join(list(set(i))) for i in list(train_item['아이템명'].str.split("|"))]

(11464, 7)
(11464, 7)


In [257]:
total_item = pd.DataFrame()
total_item[['ITEM_ID','DETAIL','DETAIL_L2','DETAIL_L3','GENDER','BRND_NM','PRICE']] = train_item[['PROD_NM','상세','아이템명','CATEGORY_S','GENDER','BRND_NM','MALL_SALE_PRC']].copy()
total_item.to_csv('s3://poc-descente/train_data/item/item.csv')

### 데이터 필터링

In [3]:
total_order_5 = pd.read_csv('s3://poc-descente/train_data/interactions/order_nm_5.csv',index_col=False)
total_users_60 = pd.read_csv('s3://poc-descente/train_data/users/users_60.csv',index_col=False)
total_users_1 = pd.read_csv('s3://poc-descente/train_data/users/users_1.csv',index_col=False)
total_item = pd.read_csv('s3://poc-descente/train_data/item/item.csv',index_col=False)

total_order_5.drop('Unnamed: 0', inplace = True, axis = 1)
total_users_60.drop('Unnamed: 0', inplace = True, axis = 1)
total_users_1.drop('Unnamed: 0', inplace = True, axis = 1)
total_item.drop('Unnamed: 0', inplace = True, axis = 1)

print(f'order : {total_order_5.shape}')
print(f'users_1 : {total_users_1.shape}')
print(f'users_60 : {total_users_60.shape}')
print(f'item : {total_item.shape}')

order : (841327, 4)
users_1 : (482809, 4)
users_60 : (457059, 4)
item : (11464, 7)


In [34]:
(0.3116+0.2941+0.3549+0.2644+0.424)/5

0.3298

In [29]:
users_num = 60

### 교집합
total_order_5_filter = total_order_5[total_order_5['USER_ID'].isin(globals()[f'total_users_{users_num}']['USER_ID'])]
total_order_5_filter = total_order_5_filter[total_order_5_filter['ITEM_ID'].isin(total_item['ITEM_ID'])]

globals()[f'total_users_{users_num}_filter'] = globals()[f'total_users_{users_num}'][globals()[f'total_users_{users_num}']['USER_ID'].isin(total_order_5_filter['USER_ID'])]

total_item_filter = total_item[total_item['ITEM_ID'].isin(total_order_5_filter['ITEM_ID'])]

print(f'order : {total_order_5_filter.shape}')
print(f"users : {globals()[f'total_users_{users_num}_filter'].shape}")
print(f'item : {total_item_filter.shape}')

total_order_5_filter.to_csv(f's3://poc-descente/train_data/interactions/order_nm_5_user{users_num}_filter.csv')
globals()[f'total_users_{users_num}_filter'].to_csv(f's3://poc-descente/train_data/users/users_{users_num}_user{users_num}_filter.csv')
total_item_filter.to_csv(f's3://poc-descente/train_data/item/item_filter_user{users_num}_filter.csv')

### 5회 이상 구매된 제품 & 고객 필터링
temp_order = total_order_5_filter['ITEM_ID'].value_counts().reset_index()
total_order_5_filter_5 = total_order_5_filter[total_order_5_filter['ITEM_ID'].isin(temp_order[temp_order['ITEM_ID']>5]['index'])]

globals()[f'total_users_{users_num}_filter_5'] = globals()[f'total_users_{users_num}_filter'][globals()[f'total_users_{users_num}_filter']['USER_ID'].isin(total_order_5_filter_5['USER_ID'])]
total_item_filter_5 = total_item_filter[total_item_filter['ITEM_ID'].isin(total_order_5_filter_5['ITEM_ID'])]

print(f'order : {total_order_5_filter_5.shape}')
print(f"users : {globals()[f'total_users_{users_num}_filter_5'].shape}")
print(f'item : {total_item_filter_5.shape}')

total_order_5_filter_5.to_csv(f's3://poc-descente/train_data/interactions/order_nm_5_user{users_num}_filter_5.csv')
globals()[f'total_users_{users_num}_filter_5'].to_csv(f's3://poc-descente/train_data/users/users_{users_num}_user{users_num}_filter_5.csv')
total_item_filter_5.to_csv(f's3://poc-descente/train_data/item/item_filter_user{users_num}_filter_5.csv')

order : (796957, 4)
users : (58299, 4)
item : (7376, 7)
order : (793564, 4)
users : (58295, 4)
item : (6073, 7)


## Personalize

In [None]:
## 최종 모델 적용은 total_order_5 / aws_user_personalization

### 지정 모델 학습

In [27]:
roop = True
while roop == True:    
    solution_lst = [
        'arn:aws:personalize:::recipe/aws-user-personalization', ## 맞춤형 추천 시나리오 (Interaction, users, item 데이터 모두 사용)
    ]
    data_name_list = [i['name'] for i in client.list_dataset_groups()['datasetGroups']]

    for data_name in data_name_list:
        print(data_name)
        try:
            hpo_mode = True
            response = client.create_solution(
                name = data_name,
                performHPO = hpo_mode,
                recipeArn = solution_lst[0],
                datasetGroupArn = f'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/{data_name}')
            response = client.create_solution_version(
                solutionArn=f'arn:aws:personalize:ap-northeast-2:217278714774:solution/{data_name}',
                trainingMode='FULL'    
            )  
        except Exception as e:
            print(e)

order_nm_5_None_item
An error occurred (ResourceAlreadyExistsException) when calling the CreateSolution operation: Another resource with Arn arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_None_item already exists.
order_nm_5_users_1_None
An error occurred (ResourceAlreadyExistsException) when calling the CreateSolution operation: Another resource with Arn arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_1_None already exists.
order_nm_5_users_1_item
An error occurred (ResourceAlreadyExistsException) when calling the CreateSolution operation: Another resource with Arn arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_1_item already exists.
order_nm_5_users_1_item_filter
An error occurred (ResourceAlreadyExistsException) when calling the CreateSolution operation: Another resource with Arn arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_1_item_filter already exists.
order_nm_5_users_1_item_filt

KeyboardInterrupt: 

### 전체 모델 학습 관련 

In [None]:
## 모델 총 3 + 8 = 11개
recipeArn_lst = [ ## recommender는 HPO 없음
    'arn:aws:personalize:::recipe/aws-ecomm-popular-items-by-purchases', ## 고객이 품목을 구매한 횟수를 기준으로 아이템 추천 (userid만 사용, itemid 사용하지 않음)
    'arn:aws:personalize:::recipe/aws-ecomm-frequently-bought-together', ## 고객이 자주 구매하는 아이템과 셀러가 지정한 아이템 추천(userid, itemid 모두 사용)
    'arn:aws:personalize:::recipe/aws-ecomm-recommended-for-you' ## 지정된 사용자 기반으로 항목에 대한 개인화된 추천. 
                                                                ## Amazon Personalize Personalize가 지정한 userId 기준으로 사용자가 구매한 품목을 자동으로 필터링
                                                                ## 사용자 기반으로 개인화 된 추천 (View 데이터가 있을 경우 성능이 더 높음)
]

solution_lst = [
    'arn:aws:personalize:::recipe/aws-user-personalization', ## 맞춤형 추천 시나리오 (Interaction, users, item 데이터 모두 사용)
    'arn:aws:personalize:::recipe/aws-popularity-count', ## 모든 사용자의 행동 데이터 기반으로 가장 인기있는 항목 추천 (사용자와 상호작용이 가장 많은 품목) HPO 적용 불가
    # 'arn:aws:personalize:::recipe/aws-hrnn', ## 계층적 반복 신경망으로 콘솔에서는 더이상 지원하지 않음, hrnn의 업그레이드 버전이 user-personalization
    'arn:aws:personalize:::recipe/aws-hrnn-metadata', ## 고품질 메타데이터를 사용할 수 있을 때 비 메타데이터 모델보다 정확도가 높음
    'arn:aws:personalize:::recipe/aws-hrnn-coldstart', ## 인기 추세를 파악하여 관련성이 적은 항목을 필터링으로 제외 (item dataset 필요)
    'arn:aws:personalize:::recipe/aws-personalized-ranking', ## 아이템의 순위를 정하여 추천하며 비정형 텍스트 메타데이터도 사용 가능 (한국어 지원 불가)
    'arn:aws:personalize:::recipe/aws-similar-items', ## 지정한 항목과 유사한 항목에 대한 추천
    # 'arn:aws:personalize:::recipe/aws-sims' ## sims의 업그레이드 버전이 similar-items
]

In [None]:
data_name = 'order_nm_5'
base_name = f'{data_name}_recom'
num = 0
for model in recipeArn_lst:
    try:
        print(model.split('/')[-1])
        response = client.create_recommender(
            name=f'{base_name}_{num}',
            datasetGroupArn=f'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/{data_name}',
            recipeArn= model
        )
        num += 1
    except Exception as e:
        num += 1
        print(e)
        pass
    
for solution in solution_lst:
    try:
        print(solution.split('/')[-1])
        if solution.split('/')[-1] == 'aws-popularity-count':
            hpo_mode = False
        else:
            hpo_mode = True
        response = client.create_solution(
            name = f'{base_name}_{num}',
            performHPO = hpo_mode,
            recipeArn = solution,
            datasetGroupArn = f'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/{data_name}')
        
        response = client.create_solution_version(
            solutionArn=f'arn:aws:personalize:ap-northeast-2:217278714774:solution/{base_name}_{num}',
            trainingMode='FULL'    
        )
        num += 1
    except Exception as e:
        print(e)
        num += 1


aws-user-personalization
An error occurred (ResourceAlreadyExistsException) when calling the CreateSolution operation: Another resource with Arn arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_1_None already exists.


### 결과 도출 코드

In [31]:
data = []
model = []
ndcg_5 = []
ndcg_10 = []
ndcg_25 = []
precision_5 = []
precision_10 = []
precision_25 = []
coverage = []

for results in client.list_recommenders()['recommenders']:
    response = client.describe_recommender(recommenderArn=results['recommenderArn'])
    data.append('_'.join(results['name'].split('_')[:3]))
    model.append(results['recipeArn'].split('/')[-1])
    ndcg_5.append(response['recommender']['modelMetrics']['normalized_discounted_cumulative_gain_at_5'])
    ndcg_10.append(response['recommender']['modelMetrics']['normalized_discounted_cumulative_gain_at_10'])
    ndcg_25.append(response['recommender']['modelMetrics']['normalized_discounted_cumulative_gain_at_25'])
    precision_5.append(response['recommender']['modelMetrics']['precision_at_5'])
    precision_10.append(response['recommender']['modelMetrics']['precision_at_10'])
    precision_25.append(response['recommender']['modelMetrics']['precision_at_25'])
    coverage.append(response['recommender']['modelMetrics']['coverage'])
    
for results in client.list_solution_versions()['solutionVersions']:
    try:
        response = client.get_solution_metrics(solutionVersionArn=results['solutionVersionArn'])
        ndcg_5.append(response['metrics']['normalized_discounted_cumulative_gain_at_5'])
        ndcg_10.append(response['metrics']['normalized_discounted_cumulative_gain_at_10'])
        ndcg_25.append(response['metrics']['normalized_discounted_cumulative_gain_at_25'])
        precision_5.append(response['metrics']['precision_at_5'])
        precision_10.append(response['metrics']['precision_at_10'])
        precision_25.append(response['metrics']['precision_at_25'])
        coverage.append(response['metrics']['coverage'])
        response = client.describe_solution_version(solutionVersionArn=results['solutionVersionArn'])
        # data.append('_'.join(response['solutionVersion']['solutionVersionArn'].split('/')[-2].split('_')[:3]))
        data.append(response['solutionVersion']['solutionVersionArn'].split('/')[-2])
        print(response['solutionVersion']['solutionVersionArn'])
        model.append(response['solutionVersion']['recipeArn'].split('/')[-1])       
    except Exception as e:
        print(results['solutionVersionArn'])
        print(e)
    
# for results in client.list_solution_versions(nextToken = client.list_solution_versions()['nextToken'])['solutionVersions']:
#         response = client.get_solution_metrics(solutionVersionArn=results['solutionVersionArn'])
#         ndcg_5.append(response['metrics']['normalized_discounted_cumulative_gain_at_5'])
#         ndcg_10.append(response['metrics']['normalized_discounted_cumulative_gain_at_10'])
#         ndcg_25.append(response['metrics']['normalized_discounted_cumulative_gain_at_25'])
#         precision_5.append(response['metrics']['precision_at_5'])
#         precision_10.append(response['metrics']['precision_at_10'])
#         precision_25.append(response['metrics']['precision_at_25'])
#         coverage.append(response['metrics']['coverage'])
#         response = client.describe_solution_version(solutionVersionArn=results['solutionVersionArn'])
#         data.append('_'.join(response['solutionVersion']['solutionVersionArn'].split('/')[-2].split('_')[:3]))
#         model.append(response['solutionVersion']['recipeArn'].split('/')[-1])
        
results_df = pd.DataFrame(columns = ['data', 'model','ndcg_5','ndcg_10','ndcg_25','precision_5','precision_10','precision_25','coverage'])

for i in list(results_df):
    results_df[i] = globals()[i]

arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_None_item/4e5e816d
arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_1_None/b91454c3
arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_1_item/aba16f2a
arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_1_item_filter/90c9eaad
arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_1_item_filter_5/c1588fad
arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_60_None/12be5dd9
arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_60_item/2a652e34
arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_60_item_filter/ec2dc212
arn:aws:personalize:ap-northeast-2:217278714774:solution/order_nm_5_users_60_item_filter_5/16b192e2


In [33]:
results_df.to_csv('personalize_results_all.csv')

### 리소스 정리하기

In [20]:
data_lst = client.list_dataset_groups()
data_lst['datasetGroups']

[{'name': 'order_nm_5_None_item',
  'datasetGroupArn': 'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/order_nm_5_None_item',
  'status': 'ACTIVE',
  'creationDateTime': datetime.datetime(2022, 12, 26, 2, 12, 9, 106000, tzinfo=tzlocal()),
  'lastUpdatedDateTime': datetime.datetime(2022, 12, 26, 2, 12, 32, 124000, tzinfo=tzlocal())},
 {'name': 'order_nm_5_users_1_None',
  'datasetGroupArn': 'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/order_nm_5_users_1_None',
  'status': 'ACTIVE',
  'creationDateTime': datetime.datetime(2022, 12, 21, 12, 40, 33, 750000, tzinfo=tzlocal()),
  'lastUpdatedDateTime': datetime.datetime(2022, 12, 21, 12, 40, 36, 186000, tzinfo=tzlocal())},
 {'name': 'order_nm_5_users_1_item',
  'datasetGroupArn': 'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/order_nm_5_users_1_item',
  'status': 'ACTIVE',
  'creationDateTime': datetime.datetime(2022, 12, 26, 2, 12, 29, 351000, tzinfo=tzlocal()),
  'lastUpdatedDateTime': datet

In [24]:
roop = True
while roop == False:
    data_lst = client.list_dataset_groups()

    for data_group in data_lst['datasetGroups']:    
        if 'filter' in data_group['name']:
            print(data_group)
            try:
                recom_lst = client.list_recommenders(datasetGroupArn = data_group['datasetGroupArn'])['recommenders']
                for recom in recom_lst: 
                    try: 
                        client.stop_recommender(recommenderArn=recom['recommenderArn']) 
                    except Exception as e:
                        # print(e)
                        pass

                for recom in recom_lst: client.delete_recommender(recommenderArn=recom['recommenderArn'])     
            except:
                pass
            try:    
                sol_lst = client.list_solutions(datasetGroupArn = data_group['datasetGroupArn'])['solutions']
                for sol in sol_lst: client.delete_solution(solutionArn=sol['solutionArn'])        
            except:
                pass
            try:        
                dataset_lst = client.list_datasets(datasetGroupArn = data_group['datasetGroupArn'])['datasets']                           
                for dataset in dataset_lst: client.delete_dataset(datasetArn=dataset['datasetArn'])                
            except:
                pass
            try:
                client.delete_dataset_group(datasetGroupArn=data_group['datasetGroupArn'])
            except:
                pass

{'name': 'order_nm_5_users_60_item_filter_5', 'datasetGroupArn': 'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/order_nm_5_users_60_item_filter_5', 'status': 'DELETE PENDING', 'creationDateTime': datetime.datetime(2022, 12, 26, 4, 47, 59, 480000, tzinfo=tzlocal()), 'lastUpdatedDateTime': datetime.datetime(2022, 12, 26, 5, 58, 39, 735000, tzinfo=tzlocal())}
{'name': 'order_nm_5_users_60_item_filter', 'datasetGroupArn': 'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/order_nm_5_users_60_item_filter', 'status': 'DELETE IN_PROGRESS', 'creationDateTime': datetime.datetime(2022, 12, 26, 4, 28, 39, 978000, tzinfo=tzlocal()), 'lastUpdatedDateTime': datetime.datetime(2022, 12, 26, 5, 58, 31, 947000, tzinfo=tzlocal())}
{'name': 'order_nm_5_users_60_item_filter_5', 'datasetGroupArn': 'arn:aws:personalize:ap-northeast-2:217278714774:dataset-group/order_nm_5_users_60_item_filter_5', 'status': 'DELETE PENDING', 'creationDateTime': datetime.datetime(2022, 12, 26, 4, 47,

ClientError: An error occurred (ThrottlingException) when calling the ListDatasetGroups operation (reached max retries: 4): Rate exceeded