In [783]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

### VOD8,9 통합 전처리

In [784]:
# Load the August (2308) and September (2309) VOD logs; both files are tab-separated and CP949-encoded.
vod_08 = pd.read_csv('../data/데이터스쿨3차_2308월/데이터스쿨_3차_VOD_2308.csv', encoding = 'cp949', sep = '\t')
vod_09 = pd.read_csv('../data/데이터스쿨3차_2309월/데이터스쿨_3차_VOD_2309.csv', encoding = 'cp949', sep = '\t')

# Stack both months into one frame with a fresh 0..n-1 index.
vod_89 = pd.concat([vod_08, vod_09], ignore_index=True)
print(vod_89.shape)
vod_89.head(2)

(10657, 9)


Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,use_tms,SMRY,ACTR_DISP,disp_rtm,strt_dt
0,65941000,(HD)그것이알고싶다 1361회(23/07/22),TV 시사/교양,기타,4800,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,1:20,20230812163507
1,66873000,(HD)그것이알고싶다 1361회(23/07/22),TV 시사/교양,기타,4800,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,1:20,20230816205227


In [785]:
def preprocessing(data, excluded_subsr=(66056000,)):
  """Clean the raw VOD viewing log.

  Steps, in order:
    1. Convert ``disp_rtm`` from an "H:MM" string to minutes (missing -> 0)
       and drop rows whose runtime is 0.
    2. Remove the "uncut" markers (무삭제판 / 무삭제) from ``asset_nm``.
    3. Drop trailers, pre-order / advance-purchase entries and "00회" rows.
    4. Strip parenthesised and bracketed text, "..." and "…", replace "-"
       with a space, and trim trailing dots. Surrounding whitespace is NOT
       stripped here.
    5. Convert ``use_tms`` from seconds to minutes, rounded to one decimal.
    6. Drop the subscribers listed in ``excluded_subsr``.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain ``subsr``, ``asset_nm``, ``use_tms`` and ``disp_rtm``.
      The input frame is not modified.
  excluded_subsr : iterable, default (66056000,)
      Subscriber ids to remove; the default is the id the original analysis
      flagged as an outlier.

  Returns
  -------
  pandas.DataFrame
      The cleaned rows, keeping their original index labels.
  """
  def convert_runtime(runtime_str):
    # A missing runtime becomes 0 so the row is removed by the filter below.
    if pd.isna(runtime_str):
      return 0
    parts = runtime_str.split(':')
    return int(parts[0]) * 60 + int(parts[1])

  df = data.copy()
  df['disp_rtm'] = df['disp_rtm'].apply(convert_runtime)

  # Explicit copy: the title column is rewritten below, and writing into a
  # filtered slice is a chained assignment.
  df = df[df['disp_rtm'] != 0].copy()

  # Remove the "uncut" markers before any title-based filtering.
  df['asset_nm'] = df['asset_nm'].str.replace(r'무삭제판|무삭제', '', regex=True)

  # Trailers, pre-order / advance-purchase entries and "00회" rows are noise.
  # na=False keeps rows with a missing title instead of failing on `~NaN`.
  titles = df['asset_nm']
  is_noise = (
      titles.str.contains(r'\(예고편\)|\(예고\)', regex=True, na=False)
      | titles.str.contains('예약구매|사전구매', na=False)
      | titles.str.contains(r'\b00회\b', na=False)
  )
  df = df[~is_noise].copy()

  # Strip (...) and [...] groups, then "..." and the ellipsis character.
  cleaned = df['asset_nm']
  for pattern in (r'\([^()]*\)', r'\[[^\[\]]*\]', r'\.\.\.', r'\…'):
    cleaned = cleaned.str.replace(pattern, '', regex=True)
  # The hyphen is a literal here, so say so explicitly.
  cleaned = cleaned.str.replace('-', " ", regex=False)
  df['asset_nm'] = cleaned.str.rstrip('.')

  # Seconds -> minutes.
  df['use_tms'] = (df['use_tms'] / 60).round(1)

  # Remove the subscribers flagged as outliers.
  df = df[~df['subsr'].isin(list(excluded_subsr))]

  return df

In [786]:
# Run the cleaning pipeline on the combined two-month log.
df = preprocessing(vod_89)
df.head()

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,use_tms,SMRY,ACTR_DISP,disp_rtm,strt_dt
0,65941000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230812163507
1,66873000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230816205227
2,66873000,그것이알고싶다 1361회,TV 시사/교양,기타,12.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230829194727
3,61689000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230813130609
4,61619000,꼬리에꼬리를무는그날이야기 37회,TV 시사/교양,기타,69.7,"살인범의 미토콘드리아 - 2006 냉동고 살인사건. 2006년 7월 23일, 서울 ...","장도연,장현성,장성규",73,20230804092737


In [787]:
# Per-episode program metadata: one row per distinct
# (title, content class, genre, cast, runtime) combination.
df1 = df[['asset_nm', 'ct_cl', 'genre_of_ct_cl', 'ACTR_DISP', 'disp_rtm']].drop_duplicates().reset_index(drop = True)
df1.head()

Unnamed: 0,asset_nm,ct_cl,genre_of_ct_cl,ACTR_DISP,disp_rtm
0,그것이알고싶다 1361회,TV 시사/교양,기타,김상중,80
1,꼬리에꼬리를무는그날이야기 37회,TV 시사/교양,기타,"장도연,장현성,장성규",73
2,꼬리에꼬리를무는그날이야기 61회,TV 시사/교양,기타,"장도연,장현성,장성규",73
3,인간극장 3338회,TV 시사/교양,기타,명선 스님,32
4,꼬리에꼬리를무는그날이야기 89회,TV 시사/교양,기타,"장도연,장현성,장성규",78


In [788]:
# Total viewing time (minutes) of each subscriber for each episode.
df2 = pd.DataFrame(df.groupby(['subsr','asset_nm', 'ct_cl', 'genre_of_ct_cl', 'disp_rtm'])['use_tms'].sum()).reset_index()
# Drop rows with a zero runtime or zero accumulated viewing time.
df2 = df2[(df2['disp_rtm'] != 0) & (df2['use_tms'] != 0)]
df2

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,disp_rtm,use_tms
0,59879000,소방서 옆 경찰서 05회,TV드라마,기타,69,2.4
1,59879000,소방서 옆 경찰서 06회,TV드라마,기타,59,22.1
2,59879000,신성한 이혼 01회,TV드라마,기타,64,12.6
3,59879000,신성한 이혼 02회,TV드라마,기타,62,16.8
4,59879000,신성한 이혼 03회,TV드라마,기타,63,4.3
...,...,...,...,...,...,...
5705,67148000,타요의 씽씽극장 동요2 10회,키즈,기타,2,2.0
5706,67148000,타요의 씽씽극장 동요2 11회,키즈,기타,2,0.5
5707,67154000,스트릿 우먼 파이터 2 04회,TV 연예/오락,기타,131,1.2
5708,67161000,스파이 코드명 포춘,영화,액션/어드벤쳐,114,0.2


In [789]:
# Decide whether a subscriber "watched" an episode from the share of its
# runtime they actually viewed (viewing minutes / runtime minutes).
# Results recorded for each candidate threshold, as threshold : score
# (the metric is not named here; presumably precision@k, confirm):
#   0.1 : 0.051
#   0.2 : 0.055
#   0.3 : 0.053
#   0.4 : 0.053
#   0.5 : 0.052
#   0.6 : 0.053
# Recorded conclusion: 0.6 works best for heavy users, 0.2 works best overall.
WATCHED_RATIO_THRESHOLD = 0.6  # the heavy-user setting is the one in effect

watch_ratio = df2['use_tms'] / df2['disp_rtm']
# 1 when the viewed share reaches the threshold, otherwise 0 (NaN ratios give 0).
df2['watched'] = (watch_ratio >= WATCHED_RATIO_THRESHOLD).astype('int64')
df2.head()

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,disp_rtm,use_tms,watched
0,59879000,소방서 옆 경찰서 05회,TV드라마,기타,69,2.4,0
1,59879000,소방서 옆 경찰서 06회,TV드라마,기타,59,22.1,0
2,59879000,신성한 이혼 01회,TV드라마,기타,64,12.6,0
3,59879000,신성한 이혼 02회,TV드라마,기타,62,16.8,0
4,59879000,신성한 이혼 03회,TV드라마,기타,63,4.3,0


In [790]:
# Strip episode markers so that each episode row carries its series title.
df11 = df1.copy()

# Remove a trailing "<digits>회", any "<digits>회." and a trailing "<digits>화".
# NOTE(review): whitespace is stripped only after these replacements, so the
# `$`-anchored patterns do not match titles that end in a space.
df11['asset_nm'] = df11['asset_nm'].str.replace(r'\d+회$', '', regex=True)
df11['asset_nm'] = df11['asset_nm'].str.replace(r'\d+회\.', '', regex=True)
df11['asset_nm'] = df11['asset_nm'].str.replace(r'\d+화$', '', regex=True)
# Trim surrounding whitespace, then any trailing dots.
df11['asset_nm'] = df11['asset_nm'].str.strip()
df11['asset_nm'] = df11['asset_nm'].str.rstrip('.')
df11

Unnamed: 0,asset_nm,ct_cl,genre_of_ct_cl,ACTR_DISP,disp_rtm
0,그것이알고싶다,TV 시사/교양,기타,김상중,80
1,꼬리에꼬리를무는그날이야기,TV 시사/교양,기타,"장도연,장현성,장성규",73
2,꼬리에꼬리를무는그날이야기,TV 시사/교양,기타,"장도연,장현성,장성규",73
3,인간극장,TV 시사/교양,기타,명선 스님,32
4,꼬리에꼬리를무는그날이야기,TV 시사/교양,기타,"장도연,장현성,장성규",78
...,...,...,...,...,...
3850,엄마가 화났다,키즈,학습,-,8
3851,핑크퐁 자동차 동화,키즈,학습,핑크퐁,4
3852,간질간질,키즈,학습,-,6
3853,월간 아기상어,키즈,학습,아기상어,1


In [791]:
# Add a series-title column to the per-subscriber viewing rows by stripping
# episode markers from asset_nm. These are the same five steps applied to
# df11; the resulting names are later used as merge keys against the table
# derived from df11, so the two sequences must stay identical.
# NOTE(review): whitespace is stripped only after the replacements, so the
# `$`-anchored patterns do not match titles that end in a space.
df2['series_nm'] = df2['asset_nm'].str.replace(r'\d+회$', '', regex=True)
df2['series_nm'] = df2['series_nm'].str.replace(r'\d+회\.', '', regex=True)
df2['series_nm'] = df2['series_nm'].str.replace(r'\d+화$', '', regex=True)
df2['series_nm'] = df2['series_nm'].str.strip()
df2['series_nm'] = df2['series_nm'].str.rstrip('.')
df2.head()

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,disp_rtm,use_tms,watched,series_nm
0,59879000,소방서 옆 경찰서 05회,TV드라마,기타,69,2.4,0,소방서 옆 경찰서
1,59879000,소방서 옆 경찰서 06회,TV드라마,기타,59,22.1,0,소방서 옆 경찰서
2,59879000,신성한 이혼 01회,TV드라마,기타,64,12.6,0,신성한 이혼
3,59879000,신성한 이혼 02회,TV드라마,기타,62,16.8,0,신성한 이혼
4,59879000,신성한 이혼 03회,TV드라마,기타,63,4.3,0,신성한 이혼


In [792]:
# Keep only the columns needed to count watched episodes per series.
df3 = df2.loc[:, ['subsr', 'series_nm', 'ct_cl', 'genre_of_ct_cl', 'watched']].copy()
df3.head()

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched
0,59879000,소방서 옆 경찰서,TV드라마,기타,0
1,59879000,소방서 옆 경찰서,TV드라마,기타,0
2,59879000,신성한 이혼,TV드라마,기타,0
3,59879000,신성한 이혼,TV드라마,기타,0
4,59879000,신성한 이혼,TV드라마,기타,0


In [793]:
# Number of watched episodes per subscriber and series.
df4 = df3.groupby(['subsr', 'series_nm', 'ct_cl', 'genre_of_ct_cl'])['watched'].sum().reset_index()
# Keep only the series in which the subscriber watched at least one episode.
df4 = df4[df4['watched']!= 0]
df4

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched
2,59895000,금이야 옥이야,TV드라마,기타,1
3,59900000,2022 역사저널 그날,TV 시사/교양,기타,1
4,59900000,그것이알고싶다,TV 시사/교양,기타,3
8,59900000,범죄도시3,영화,액션/어드벤쳐,1
16,59930000,가면의 여왕,TV드라마,기타,6
...,...,...,...,...,...
2066,67140000,경남 통영 2부,우리동네,연예/오락,1
2067,67140000,밀수,영화,액션/어드벤쳐,1
2068,67140000,잠자는 숲속의 공주,키즈,기타,1
2070,67148000,타요의 씽씽극장 동요2,키즈,기타,10


In [794]:
# Number of rows each (series, content class, genre) has in df11, i.e. how many
# distinct episode-metadata rows of that series appear in the August-September log.
# This is a count of episode rows, not of viewing events.
df5 = pd.DataFrame(df11[['asset_nm', 'ct_cl', 'genre_of_ct_cl']].value_counts().reset_index())
# Rename positionally: the three key columns followed by the count.
df5.columns = ['series_nm', 'ct_cl', 'genre_of_ct_cl', 'watched_all']
df5

Unnamed: 0,series_nm,ct_cl,genre_of_ct_cl,watched_all
0,금이야 옥이야,TV드라마,기타,83
1,연희공략: 건륭황제의여인,TV드라마,외화 시리즈,63
2,TV소설 은희,TV드라마,기타,63
3,인간극장,TV 시사/교양,기타,55
4,런닝맨,TV 연예/오락,기타,54
...,...,...,...,...
1019,봉신연의,영화,액션/어드벤쳐,1
1020,부산,영화,액션/어드벤쳐,1
1021,북 오브 러브,영화,멜로,1
1022,분노의 질주: 라이드 오어 다이,영화,액션/어드벤쳐,1


In [795]:
# Attach each series' row count (watched_all) to the per-subscriber watch counts.
df6 = df4.merge(df5, on = ['series_nm', 'ct_cl', 'genre_of_ct_cl'], how = 'left')
df6

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched,watched_all
0,59895000,금이야 옥이야,TV드라마,기타,1,83
1,59900000,2022 역사저널 그날,TV 시사/교양,기타,1,3
2,59900000,그것이알고싶다,TV 시사/교양,기타,3,21
3,59900000,범죄도시3,영화,액션/어드벤쳐,1,1
4,59930000,가면의 여왕,TV드라마,기타,6,6
...,...,...,...,...,...,...
1195,67140000,경남 통영 2부,우리동네,연예/오락,1,1
1196,67140000,밀수,영화,액션/어드벤쳐,1,2
1197,67140000,잠자는 숲속의 공주,키즈,기타,1,1
1198,67148000,타요의 씽씽극장 동요2,키즈,기타,10,13


In [796]:
# Total number of watched episodes per subscriber, summed over all series.
df7 = df6.groupby(['subsr'])['watched'].sum().reset_index()
df7.columns = ['subsr', 'watched_cnt']
df7

Unnamed: 0,subsr,watched_cnt
0,59895000,1
1,59900000,5
2,59930000,11
3,59933000,9
4,60040000,1
...,...,...
288,67107000,2
289,67117000,10
290,67140000,3
291,67148000,10


In [797]:
# Attach each subscriber's total watched-episode count to every one of their series rows.
df8 = df6.merge(df7, on = 'subsr', how = 'left')
# Drop subscribers whose total is zero. df4 already removed the zero-count rows
# this table is built from, so this filter is expected to remove nothing.
df8 = df8[df8['watched_cnt'] != 0].reset_index(drop = True)
df8

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched,watched_all,watched_cnt
0,59895000,금이야 옥이야,TV드라마,기타,1,83,1
1,59900000,2022 역사저널 그날,TV 시사/교양,기타,1,3,5
2,59900000,그것이알고싶다,TV 시사/교양,기타,3,21,5
3,59900000,범죄도시3,영화,액션/어드벤쳐,1,1,5
4,59930000,가면의 여왕,TV드라마,기타,6,6,11
...,...,...,...,...,...,...,...
1195,67140000,경남 통영 2부,우리동네,연예/오락,1,1,3
1196,67140000,밀수,영화,액션/어드벤쳐,1,2,3
1197,67140000,잠자는 숲속의 공주,키즈,기타,1,1,3
1198,67148000,타요의 씽씽극장 동요2,키즈,기타,10,13,10


In [798]:
# Segment subscribers by how many rows they have in df8 (one row per
# subscriber/series pair with at least one watched episode).
# The segments are cumulative, not disjoint:
#   heavy_users  : 5 or more rows
#   medium_users : 3 or more rows (includes the heavy users)
#   light_users  : 1 or more rows (every subscriber in df8)
user_cnt = df8['subsr'].value_counts()
heavy_users = user_cnt[user_cnt >= 5].index
medium_users = user_cnt[user_cnt >= 3].index
light_users = user_cnt[user_cnt >= 1].index

# Rows of df8 belonging to each segment.
vod_heavy = df8[df8['subsr'].isin(heavy_users)]
vod_medium = df8[df8['subsr'].isin(medium_users)]
vod_light = df8[df8['subsr'].isin(light_users)]

In [799]:
# 시청 여부, 해당 시리즈 총 횟수, 유저의 프로그램 총 시청 횟수를 이용해
# 유저의 프로그램 선호도를 측정
import numpy as np
def scoring(df):
  """Compute a per-row preference score of a subscriber for a series.

  The score is ``(n / N) * w1 * w2`` where

  * ``N`` = ``watched_all``: number of episodes counted for the series,
  * ``L`` = ``watched_cnt``: total number of episodes the subscriber watched,
  * ``n`` = ``watched``: episodes of this series the subscriber watched,
  * ``w1 = 1 - exp(-lam * N)`` with ``lam = ln(2) / 2``,
  * ``w2 = N / L`` when ``L < N``, otherwise 1.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns ``watched_all``, ``watched_cnt`` and ``watched``.

  Returns
  -------
  pandas.Series
      Scores aligned on ``df.index``; an empty Series for an empty frame.
  """
  N = df['watched_all']
  L = df['watched_cnt']
  n = df['watched']

  lam = np.log(2) / 2
  w1 = 1 - np.exp(-1 * lam * N)

  # Vectorized form of the row-wise rule "N / L if L < N else 1". Keeping the
  # frame's index lets the result be assigned straight back as a column, and
  # an empty input yields an empty Series.
  w2 = pd.Series(np.where(L < N, N / L, 1.0), index=df.index)

  score = (n / N) * w1 * w2

  return score

In [800]:
# Attach the preference score to each segment. `assign` returns a new frame,
# so no column is written into the filtered slices of df8.
vod_heavy = vod_heavy.assign(score=scoring(vod_heavy))
vod_medium = vod_medium.assign(score=scoring(vod_medium))
vod_light = vod_light.assign(score=scoring(vod_light))

In [801]:
def add_cat_rename(vod_data):
    """Return the scored rows sorted by subscriber, with two lookup keys added.

    ``category`` keeps ``ct_cl`` when it is '영화' or '키즈' and is 'TV프로그램'
    for everything else. ``rename`` is ``series_nm`` with all spaces removed
    (non-string values pass through unchanged). The input frame is left untouched.
    """
    def to_category(content_class):
        if content_class in ['영화', '키즈']:
            return content_class
        return 'TV프로그램'

    def drop_spaces(title):
        if isinstance(title, str):
            return title.replace(' ', '')
        return title

    wanted = ['subsr', 'series_nm', 'ct_cl', 'genre_of_ct_cl', 'score']
    vod = vod_data[wanted].copy().sort_values(by = 'subsr').reset_index(drop = True)
    vod['category'] = vod['ct_cl'].map(to_category)
    vod['rename'] = vod['series_nm'].map(drop_spaces)

    return vod

# Add the category and space-free title columns to every segment.
vod_heavy = add_cat_rename(vod_heavy)
vod_medium = add_cat_rename(vod_medium)
vod_light = add_cat_rename(vod_light)

In [802]:
# Lookup table mapping a (space-free title, category) pair to a vod_id.
vod_id = pd.read_csv('../data/vod_list_add10_1220.csv', index_col=0)
vod_id = vod_id[['rename', 'Category', 'vod_id']]
# Lower-case 'Category' so the column names match the merge keys used below.
vod_id.columns = ['rename', 'category', 'vod_id']

# Left joins keep every scored row; only (subsr, vod_id, score) is retained.
# NOTE(review): a title missing from the lookup would end up with a NaN vod_id;
# confirm there are none before modelling.
vod_heavy_id = vod_heavy.merge(vod_id, on = ['rename', 'category'], how = 'left')[['subsr', 'vod_id', 'score']]
vod_medium_id = vod_medium.merge(vod_id, on = ['rename', 'category'], how = 'left')[['subsr', 'vod_id', 'score']]
vod_light_id = vod_light.merge(vod_id, on = ['rename', 'category'], how = 'left')[['subsr', 'vod_id', 'score']]

In [803]:
# Write the (subsr, vod_id, score) rows of each segment to disk without the
# index column. `index` is a boolean flag, so pass False rather than 0.
vod_heavy_id.to_csv('../data/vod89_heavy.csv', index=False)
vod_medium_id.to_csv('../data/vod89_medium.csv', index=False)
vod_light_id.to_csv('../data/vod89_light.csv', index=False)

# Reload the segments from the files just written.
vod_heavy_id = pd.read_csv('../data/vod89_heavy.csv')
vod_medium_id = pd.read_csv('../data/vod89_medium.csv')
vod_light_id = pd.read_csv('../data/vod89_light.csv')

### 모델링

In [627]:
# Heavy-user (subsr, vod_id, score) rows used as the training data below.
vod_heavy_id

Unnamed: 0,subsr,vod_id,score
0,59900000,572,0.215482
1,59900000,297,0.374741
2,59900000,149,0.292893
3,59900000,2539,0.292893
4,59900000,3316,0.292893
...,...,...,...
1040,67055000,13,0.132597
1041,67055000,315,0.952381
1042,67055000,16,0.100000
1043,67055000,341,0.292893


In [628]:
# 필요한 Surprise 알고리즘 불러오기
from surprise import SVD, BaselineOnly, SVDpp, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, NMF, KNNWithMeans, KNNBasic
from surprise import accuracy
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# surprise 데이터 형식으로 변환
def convert_traintest_dataframe_forsurprise(training_dataframe):
    """Turn a (subsr, vod_id, score) frame into a Surprise trainset.

    Parameters
    ----------
    training_dataframe : pandas.DataFrame
        Must contain the columns ``subsr``, ``vod_id`` and ``score``; they are
        used as user id, item id and rating, in that order.

    Returns
    -------
    surprise.Trainset
        A trainset built from every row of the frame (no hold-out split).
    """
    # Ratings are declared to lie in [0, 1].
    reader = Reader(rating_scale=(0, 1))
    dataset = Dataset.load_from_df(training_dataframe[['subsr', 'vod_id', 'score']], reader)
    # build_full_trainset() is the documented way to train on all ratings; it
    # builds the trainset from the dataset's raw ratings.
    return dataset.build_full_trainset()

# Train on the heavy-user segment.
trainset = convert_traintest_dataframe_forsurprise(vod_heavy_id)

### KNNBaseline

In [629]:
# from surprise.model_selection import GridSearchCV
# param_grid = {'k' : np.arange(1, 20, 1), 'sim_options' : {'name' : ['pearson_baseline', 'cosine'], 'user_based' : [True, False]}, 'random_state' : [42], 'verbose' : [False]}
            
# gs = GridSearchCV(KNNBaseline, param_grid, measures=['mae'], cv = 3)

# reader = Reader(rating_scale=(0,1))
# train_set = Dataset.load_from_df(vod_score[['subsr', 'vod_id', 'score']], reader)

# gs.fit(train_set)
# gs.best_params

In [630]:
# Item-item neighbourhood model on baseline-centred ratings.
sim_options = {'name': 'pearson_baseline', 'user_based': False} # item-based similarity
# Baselines are estimated with SGD, run for a single epoch.
bsl_options = {'method' : 'sgd', 'n_epochs' : 1}
# NOTE(review): `random_state` is not among KNNBaseline's documented parameters;
# presumably it is ignored. Verify against the installed Surprise version.
knnbaseline = KNNBaseline(k = 40, sim_options=sim_options, random_state = 42, min_k= 1, 
                          bsl_options=bsl_options)

knnbaseline.fit(trainset)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x24a858fbed0>

In [631]:
# Score every (subscriber, VOD) pair that occurs in the heavy-user data.
# NOTE: `vod_id` previously named the title lookup DataFrame; from here on it
# is a sorted list of ids, so the earlier merge cell cannot be re-run after this one.
user_id = sorted(vod_heavy_id.subsr.unique())
vod_id = sorted(vod_heavy_id.vod_id.unique())

# One predict() call per pair, i.e. len(user_id) * len(vod_id) calls in total.
result = []
for user in user_id:
    for vod in vod_id:
        # Keep the first four fields of the prediction; they are named below.
        result.append(knnbaseline.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
# Only the estimate is needed for ranking.
result = result[['subsr', 'vod_id', 'predict']]
result

KeyboardInterrupt: 

In [None]:
# For each subscriber, take the 25 VOD ids with the highest predicted score.
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:25].tolist())

# One row per subscriber: the id in 'subsr', then the ranked ids in columns 0..24.
result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,24
0,59900000,111,3028,256,391,245,289,38,296,432,...,884,398,303,848,858,720,1086,954,726,754
1,59930000,728,456,1156,1234,111,1347,14,1233,888,...,1086,954,726,754,724,991,357,1395,115,2227
2,60067000,954,976,849,992,884,1136,2404,810,1156,...,848,858,720,1086,726,754,724,991,357,1395
3,60224000,410,835,1166,256,508,107,1886,784,842,...,1347,14,1233,888,884,398,303,848,858,720
4,60326000,1086,1639,1523,887,1233,2041,381,908,1347,...,303,1785,2396,162,2045,1156,1234,111,14,888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,66873000,1156,1886,6,835,95,14,111,1166,508,...,1234,1347,1233,884,398,303,848,858,720,1086
71,66875000,357,731,117,1347,1068,2775,3502,107,1233,...,1131,1362,3159,3457,887,550,889,535,2469,1834
72,66900000,848,1156,1234,1347,1233,888,884,398,303,...,724,991,357,1395,115,2227,789,1077,816,498
73,67008000,1156,1234,111,1347,14,1233,888,884,398,...,726,754,724,991,357,1395,115,2227,789,1077


In [None]:
# Collapse the ranked-id columns into a single list per subscriber.
vod_predict = result_1.copy()
vod_predict.index = user_id
# x[1:] skips the leading 'subsr' value and keeps only the ranked ids.
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
# Turn the subscriber index back into a column.
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59900000,"[111, 3028, 256, 391, 245, 289, 38, 296, 432, ..."
1,59930000,"[728, 456, 1156, 1234, 111, 1347, 14, 1233, 88..."
2,60067000,"[954, 976, 849, 992, 884, 1136, 2404, 810, 115..."
3,60224000,"[410, 835, 1166, 256, 508, 107, 1886, 784, 842..."
4,60326000,"[1086, 1639, 1523, 887, 1233, 2041, 381, 908, ..."
...,...,...
70,66873000,"[1156, 1886, 6, 835, 95, 14, 111, 1166, 508, 4..."
71,66875000,"[357, 731, 117, 1347, 1068, 2775, 3502, 107, 1..."
72,66900000,"[848, 1156, 1234, 1347, 1233, 888, 884, 398, 3..."
73,67008000,"[1156, 1234, 111, 1347, 14, 1233, 888, 884, 39..."


In [None]:
# If a recommended VOD is a movie the subscriber has already watched, do not
# recommend it; watched TV programs and kids titles stay in the list.
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

# Set membership instead of scanning a list for every recommended item.
rewatchable = set(TV_kids)

filtered_recs = []
for user in user_id:
    watched = set(vod_heavy_id.loc[vod_heavy_id.subsr == user, 'vod_id'])
    rec_list = vod_predict.loc[vod_predict.subsr == user, 'vod_id'].tolist()[0]
    # Keep an id when it is unseen, or seen but in the TV/kids list; every
    # other seen id (watched movies included) is dropped.
    filtered_recs.append([x for x in rec_list if x not in watched or x in rewatchable])

# Build the frame in one step. Stacking one transposed row per subscriber pads
# shorter lists with NaN (turning the ids into floats) and fails when a
# subscriber's list ends up empty.
vod_predict_1 = pd.DataFrame({'subsr': user_id, 'vod_id': filtered_recs})
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59900000,"[111, 3028, 256, 391, 245, 289, 38, 296, 432, ..."
1,59930000,"[728, 456, 1156, 1234, 111, 1347, 14, 1233, 88..."
2,60067000,"[954, 976, 849, 992, 884, 1136, 2404, 810, 115..."
3,60224000,"[410, 835, 1166, 256, 508, 107, 1886, 784, 842..."
4,60326000,"[1086, 1639, 1523, 887, 1233, 2041, 381, 908, ..."
...,...,...
70,66873000,"[1156, 1886, 6, 835, 95, 14, 111, 1166, 508, 4..."
71,66875000,"[357, 731, 117, 1347, 1068, 2775, 3502, 107, 1..."
72,66900000,"[848, 1156, 1234, 1347, 1233, 888, 884, 398, 3..."
73,67008000,"[1156, 1234, 111, 1347, 14, 1233, 888, 884, 39..."


In [None]:
# Evaluation data: the unique VOD ids each subscriber watched.
# NOTE(review): presumably the following month's viewing log, judging by the
# file name; confirm.
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[2610, 2291, 213, 4880, 2869, 2415, 200, 2546]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [None]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    """Mean precision@K over the test subscribers that have recommendations.

    Parameters
    ----------
    testdata : pandas.DataFrame
        First column ``subsr``; second column an iterable of the VOD ids the
        subscriber actually watched.
    recommended_data : pandas.DataFrame
        First column ``subsr``; second column the ranked list of recommended ids.
    K : int, default 10
        Number of top recommendations scored. The denominator is always ``K``,
        even when a list holds fewer than ``K`` ids.

    Returns
    -------
    float
        Average of ``|actual ∩ top-K recommended| / K`` over the subscribers
        present in both frames; ``0.0`` when there are none.
    """
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    # Test subscribers without a recommendation list are not scored.
    merge_df = merge_df.dropna()
    if merge_df.empty:
        # No overlap between the two frames: avoid dividing by zero.
        return 0.0

    # Positional access: column 1 holds the watched ids, column 2 the recommendations.
    actual_lists = merge_df.iloc[:, 1]
    recommended_lists = merge_df.iloc[:, 2]
    precision_value = sum(
        len(set(actual).intersection(recommended[:K])) / K
        for actual, recommended in zip(actual_lists, recommended_lists)
    )

    return precision_value / merge_df.shape[0]

# Score the filtered KNNBaseline recommendations with the default K.
precision_k(testdata, vod_predict_1)

0.09999999999999999

In [None]:
# knnbaseline 결과를 추천 리스트로 생성
# vod_predict_1.to_csv('../data/result_vod_over3.csv', index = 0)
# vod_predict_1.to_csv('../data/result_vod_all.csv', index = 0)

### SVD, SVDpp

In [None]:
# Earlier configuration and its recorded result (metric not named here;
# presumably precision@k, confirm):
# n_epochs=95, lr_all=0.005, reg_all=0.06, random_state=42 -> 0.045
svd = SVD(n_factors=100, n_epochs=150, lr_all=0.005, reg_all=0.05, random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24d72b6a150>

In [None]:
# Score every (subscriber, VOD) pair found in `vod_score` with the SVD model.
# NOTE(review): `vod_score` is not defined in the cells visible above, while
# `trainset` is built from `vod_heavy_id`; confirm which frame this section
# should use, otherwise this cell fails on a fresh kernel.
user_id = sorted(vod_score.subsr.unique())
vod_id = sorted(vod_score.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        # Keep the first four fields of the prediction; they are named below.
        result.append(svd.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
# Only the estimate is needed for ranking.
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59900000,0,0.312052
1,59900000,3,0.494225
2,59900000,6,0.282163
3,59900000,7,0.331777
4,59900000,8,0.265454
...,...,...,...
103336,67140000,4634,0.281973
103337,67140000,4647,0.424890
103338,67140000,4685,0.386863
103339,67140000,4698,0.368468


In [None]:
# For each subscriber, take the 30 VOD ids with the highest predicted score.
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:30].tolist())

# One row per subscriber: the id in 'subsr', then the ranked ids in columns 0..29.
result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59900000,858,3502,1156,848,114,315,29,1234,706,...,2093,410,2357,360,3383,1236,724,992,1168,123
1,59930000,728,357,495,530,724,848,954,973,1523,...,15,700,858,356,726,14,739,866,112,2457
2,60050000,360,731,1156,3502,866,2457,728,739,976,...,1234,29,858,323,315,955,3979,1927,14,51
3,60067000,954,976,849,414,1068,720,858,315,992,...,835,249,754,789,824,1156,357,687,1395,1077
4,60169000,991,724,1156,29,1168,414,1069,357,848,...,450,1523,496,751,2384,1395,677,356,778,739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,66900000,848,315,29,796,731,360,1506,398,1168,...,1156,1068,728,117,1077,1278,114,862,955,3624
129,67000000,724,1639,739,1233,29,356,1018,1156,357,...,991,1506,2548,1086,1069,1207,2384,390,726,111
130,67008000,858,315,757,1506,33,414,112,29,43,...,3502,1156,685,1278,726,731,450,2775,2632,976
131,67055000,315,848,866,1506,884,357,858,117,849,...,1077,1278,1233,690,414,992,1207,991,724,2457


In [None]:
# Collapse the ranked-id columns into a single list per subscriber.
vod_predict = result_1.copy()
vod_predict.index = user_id
# x[1:] skips the leading 'subsr' value and keeps only the ranked ids.
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
# Turn the subscriber index back into a column.
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59900000,"[858, 3502, 1156, 848, 114, 315, 29, 1234, 706..."
1,59930000,"[728, 357, 495, 530, 724, 848, 954, 973, 1523,..."
2,60050000,"[360, 731, 1156, 3502, 866, 2457, 728, 739, 97..."
3,60067000,"[954, 976, 849, 414, 1068, 720, 858, 315, 992,..."
4,60169000,"[991, 724, 1156, 29, 1168, 414, 1069, 357, 848..."
...,...,...
128,66900000,"[848, 315, 29, 796, 731, 360, 1506, 398, 1168,..."
129,67000000,"[724, 1639, 739, 1233, 29, 356, 1018, 1156, 35..."
130,67008000,"[858, 315, 757, 1506, 33, 414, 112, 29, 43, 10..."
131,67055000,"[315, 848, 866, 1506, 884, 357, 858, 117, 849,..."


In [None]:
# If a recommended VOD is a movie the subscriber has already watched, do not
# recommend it; watched TV programs and kids titles stay in the list.
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

# Set membership instead of scanning a list for every recommended item.
rewatchable = set(TV_kids)

filtered_recs = []
for user in user_id:
    watched = set(vod_score.loc[vod_score.subsr == user, 'vod_id'])
    rec_list = vod_predict.loc[vod_predict.subsr == user, 'vod_id'].tolist()[0]
    # Keep an id when it is unseen, or seen but in the TV/kids list; every
    # other seen id (watched movies included) is dropped.
    filtered_recs.append([x for x in rec_list if x not in watched or x in rewatchable])

# Build the frame in one step. Stacking one transposed row per subscriber pads
# shorter lists with NaN (turning the ids into floats) and fails when a
# subscriber's list ends up empty.
vod_predict_1 = pd.DataFrame({'subsr': user_id, 'vod_id': filtered_recs})
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59900000,"[858, 3502, 1156, 848, 114, 315, 29, 1234, 706..."
1,59930000,"[728, 357, 495, 530, 724, 848, 954, 973, 1523,..."
2,60050000,"[360, 731, 1156, 3502, 866, 2457, 728, 739, 97..."
3,60067000,"[954, 976, 849, 414, 1068, 720, 858, 315, 992,..."
4,60169000,"[991, 724, 1156, 29, 1168, 414, 1069, 357, 848..."
...,...,...
128,66900000,"[848, 315, 29, 796, 731, 360, 1506, 398, 1168,..."
129,67000000,"[724, 1639, 739, 1233, 29, 356, 1018, 1156, 35..."
130,67008000,"[858, 315, 757, 1506, 33, 414, 112, 29, 43, 10..."
131,67055000,"[315, 848, 866, 1506, 884, 357, 858, 117, 849,..."


In [None]:
# Evaluation data: the unique VOD ids each subscriber watched.
# NOTE(review): presumably the following month's viewing log, judging by the
# file name; confirm.
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [None]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    """Mean precision@K over the test subscribers that have recommendations.

    Parameters
    ----------
    testdata : pandas.DataFrame
        First column ``subsr``; second column an iterable of the VOD ids the
        subscriber actually watched.
    recommended_data : pandas.DataFrame
        First column ``subsr``; second column the ranked list of recommended ids.
    K : int, default 10
        Number of top recommendations scored. The denominator is always ``K``,
        even when a list holds fewer than ``K`` ids.

    Returns
    -------
    float
        Average of ``|actual ∩ top-K recommended| / K`` over the subscribers
        present in both frames; ``0.0`` when there are none.
    """
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    # Test subscribers without a recommendation list are not scored.
    merge_df = merge_df.dropna()
    if merge_df.empty:
        # No overlap between the two frames: avoid dividing by zero.
        return 0.0

    # Positional access: column 1 holds the watched ids, column 2 the recommendations.
    actual_lists = merge_df.iloc[:, 1]
    recommended_lists = merge_df.iloc[:, 2]
    precision_value = sum(
        len(set(actual).intersection(recommended[:K])) / K
        for actual, recommended in zip(actual_lists, recommended_lists)
    )

    return precision_value / merge_df.shape[0]

# NOTE(review): this scores the unfiltered `vod_predict`, whereas the
# KNNBaseline section scores the filtered `vod_predict_1`; confirm which one
# is intended.
precision_k(testdata, vod_predict, K = 10)

0.036082474226804134

### BaselineOnly

In [None]:
# Baseline-only model: biases estimated with ALS for 60 epochs.
bsl_options = {'method' : 'als', 'n_epochs' : 60}
baseline = BaselineOnly(bsl_options=bsl_options)
baseline.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x24d7030ae50>

In [None]:
# Score every (subscriber, VOD) pair found in `vod_score` with the baseline model.
# NOTE(review): `vod_score` is not defined in the cells visible above, while
# `trainset` is built from `vod_heavy_id`; confirm which frame this section
# should use, otherwise this cell fails on a fresh kernel.
user_id = sorted(vod_score.subsr.unique())
vod_id = sorted(vod_score.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        # Keep the first four fields of the prediction; they are named below.
        result.append(baseline.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
# Only the estimate is needed for ranking.
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59900000,0,0.339575
1,59900000,3,0.350939
2,59900000,6,0.333870
3,59900000,7,0.334418
4,59900000,8,0.284427
...,...,...,...
103336,67140000,4634,0.350837
103337,67140000,4647,0.348260
103338,67140000,4685,0.346767
103339,67140000,4698,0.348222


In [None]:
# For each subscriber, take the 30 VOD ids with the highest predicted score.
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:30].tolist())

# One row per subscriber: the id in 'subsr', then the ranked ids in columns 0..29.
result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59900000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506
1,59930000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506
2,60050000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506
3,60067000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506
4,60169000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,66900000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506
129,67000000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506
130,67008000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506
131,67055000,1156,848,1234,14,111,1233,1347,858,720,...,398,991,357,726,1166,976,2227,731,958,1506


In [None]:
# Collapse the ranked-id columns into a single list per subscriber.
vod_predict = result_1.copy()
vod_predict.index = user_id
# x[1:] skips the leading 'subsr' value and keeps only the ranked ids.
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
# Turn the subscriber index back into a column.
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59900000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
1,59930000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
2,60050000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
3,60067000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
4,60169000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
...,...,...
128,66900000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
129,67000000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
130,67008000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
131,67055000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."


In [None]:
# If a recommended VOD is a movie the subscriber has already watched, do not
# recommend it; watched TV programs and kids titles stay in the list.
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

# Set membership instead of scanning a list for every recommended item.
rewatchable = set(TV_kids)

filtered_recs = []
for user in user_id:
    watched = set(vod_score.loc[vod_score.subsr == user, 'vod_id'])
    rec_list = vod_predict.loc[vod_predict.subsr == user, 'vod_id'].tolist()[0]
    # Keep an id when it is unseen, or seen but in the TV/kids list; every
    # other seen id (watched movies included) is dropped.
    filtered_recs.append([x for x in rec_list if x not in watched or x in rewatchable])

# Build the frame in one step. Stacking one transposed row per subscriber pads
# shorter lists with NaN (turning the ids into floats) and fails when a
# subscriber's list ends up empty.
vod_predict_1 = pd.DataFrame({'subsr': user_id, 'vod_id': filtered_recs})
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59900000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
1,59930000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
2,60050000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
3,60067000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
4,60169000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
...,...,...
128,66900000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
129,67000000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
130,67008000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."
131,67055000,"[1156, 848, 1234, 14, 111, 1233, 1347, 858, 72..."


In [None]:
# Evaluation data: the unique VOD ids each subscriber watched.
# NOTE(review): presumably the following month's viewing log, judging by the
# file name; confirm.
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [None]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    """Mean precision@K over the subscribers present in both frames.

    Parameters
    ----------
    testdata : DataFrame
        Column 0 is ``subsr``; column 1 holds the iterable of items actually watched.
    recommended_data : DataFrame
        Column 0 is ``subsr``; column 1 holds the ordered list of recommended items.
    K : int, default 10
        Number of leading recommendations scored per subscriber. Must be positive.

    Returns
    -------
    float
        Average over subscribers of ``|watched ∩ top-K recommended| / K``.
        Returns 0.0 when no subscriber survives the merge.

    Raises
    ------
    ValueError
        If ``K`` is not positive.
    """
    if K <= 0:
        raise ValueError("K must be a positive integer")

    # Left merge + dropna keeps only subscribers that have a recommendation row.
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()

    # Nothing to score: avoid dividing by a zero row count.
    if merge_df.empty:
        return 0.0

    precision_value = 0
    # Columns are read by position: 1 = watched items, 2 = recommended items.
    for actual, recommended in zip(merge_df.iloc[:, 1], merge_df.iloc[:, 2]):
        precision_value += len(set(actual).intersection(set(recommended[:K]))) / K

    return precision_value / merge_df.shape[0]

# NOTE(review): this scores the unfiltered `vod_predict`; the watched-movie-filtered
# list built in the previous cells is `vod_predict_1` — confirm which one is intended.
precision_k(testdata, vod_predict, K = 10)

0.017525773195876292

### KNNBasic

In [None]:
# Item-based KNN (user_based=False) using the pearson_baseline similarity;
# bsl_options configures the baseline estimation reported as "Estimating biases using sgd...".
sim_options = dict(name='pearson_baseline', user_based=False)  # item-based similarity
bsl_options = dict(method='sgd', n_epochs=1)
# NOTE(review): random_state does not appear to be a documented KNNBasic argument and is
# presumably swallowed by **kwargs — confirm it has any effect.
knnbasic = KNNBasic(
    k = 40,
    min_k = 1,
    sim_options = sim_options,
    bsl_options = bsl_options,
    random_state = 42,
)
knnbasic.fit(trainset)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x24af8dff810>

In [None]:
# Score every (subscriber, VOD) pair found in vod_score with the fitted KNNBasic model.
user_id = sorted(vod_score.subsr.unique())
vod_id = sorted(vod_score.vod_id.unique())

# The first four fields of each prediction map onto the column names given below;
# only the identifiers and the predicted score are kept.
result = pd.DataFrame(
    [knnbasic.predict(user, vod)[0:4] for user in user_id for vod in vod_id],
    columns = ['subsr', 'vod_id', 'real', 'predict'],
)
result = result.loc[:, ['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59900000,3,0.361827
1,59900000,6,0.361827
2,59900000,7,0.361827
3,59900000,8,0.599586
4,59900000,12,0.361827
...,...,...,...
66224,67140000,4553,0.361827
66225,67140000,4627,0.361827
66226,67140000,4629,0.361827
66227,67140000,4647,0.361827


In [None]:
# For each subscriber, take the 30 vod_ids with the highest predicted score
# and lay them out as one wide row (columns 0..29) next to the subscriber id.
result_1 = pd.DataFrame([
    result[result['subsr'] == user]
    .sort_values(by = 'predict', ascending=False)
    .vod_id.iloc[:30]
    .tolist()
    for user in user_id
])
result_1 = pd.concat([pd.Series(user_id, name = 'subsr'), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59900000,256,3028,245,44,38,289,292,296,432,...,2320,2338,2323,2112,2343,2344,2352,2353,2356,2357
1,59930000,728,456,3,2314,2289,2295,2299,2300,2301,...,2274,2257,2140,2091,2093,2105,2111,2112,2125,2132
2,60050000,360,3,2320,2289,2295,2299,2300,2301,2305,...,2257,2084,2140,2091,2093,2105,2111,2112,2125,2132
3,60067000,954,976,992,849,1136,2404,810,884,1347,...,2352,2289,2314,2245,2274,2140,2091,2093,2105,2111
4,60169000,3,6,2284,2289,2295,2299,2300,2301,2305,...,2257,2245,2132,2084,2089,2091,2093,2111,2112,2125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,66900000,848,3,2300,2245,2257,2274,2284,2289,2295,...,2223,2356,2218,2083,2084,2089,2091,2093,2105,2111
99,67000000,3,2775,2274,2284,2289,2295,2299,2300,2301,...,2245,2244,2125,2084,2089,2091,2093,2105,2111,2112
100,67008000,3,6,2284,2289,2295,2299,2300,2301,2305,...,2257,2245,2132,2089,2091,2093,2105,2111,2112,2125
101,67055000,315,1156,3,2314,2284,2289,2295,2299,2300,...,2274,2245,2364,2132,2089,2091,2093,2105,2111,2112


In [None]:
# Collapse the wide top-30 table into two columns: subsr and the ordered list of vod_ids.
wide_recs = result_1.copy()
wide_recs.index = user_id
# Position 0 of every row is the 'subsr' column, so it is skipped when building the list.
vod_predict = wide_recs.apply(lambda row : row.iloc[1:].tolist(), axis = 1).reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59900000,"[256, 3028, 245, 44, 38, 289, 292, 296, 432, 5..."
1,59930000,"[728, 456, 3, 2314, 2289, 2295, 2299, 2300, 23..."
2,60050000,"[360, 3, 2320, 2289, 2295, 2299, 2300, 2301, 2..."
3,60067000,"[954, 976, 992, 849, 1136, 2404, 810, 884, 134..."
4,60169000,"[3, 6, 2284, 2289, 2295, 2299, 2300, 2301, 230..."
...,...,...
98,66900000,"[848, 3, 2300, 2245, 2257, 2274, 2284, 2289, 2..."
99,67000000,"[3, 2775, 2274, 2284, 2289, 2295, 2299, 2300, ..."
100,67008000,"[3, 6, 2284, 2289, 2295, 2299, 2300, 2301, 230..."
101,67055000,"[315, 1156, 3, 2314, 2284, 2289, 2295, 2299, 2..."


In [None]:
# If a recommended VOD is a movie the user has already watched, do not recommend it.
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
# Kept as a notebook-level name; the filter below only needs the TV/kids set.
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

# Build lookups once so each membership test is a set/dict hit instead of a list scan
# or a per-user DataFrame filter.
rewatchable = set(TV_kids)
watched_by_user = {}
for subsr, watched_vod in zip(vod_score['subsr'], vod_score['vod_id']):
    watched_by_user.setdefault(subsr, set()).add(watched_vod)
recs_by_user = {}
for subsr, recs in zip(vod_predict['subsr'], vod_predict['vod_id']):
    recs_by_user.setdefault(subsr, recs)  # the first row found for a subscriber is used

# Keep a recommendation when the user has not watched it, or when it is TV/kids content.
# Already-watched titles of any other category (movies included) are dropped.
filtered_recs = []
for user in user_id:
    watched = watched_by_user.get(user, ())
    filtered_recs.append([x for x in recs_by_user[user] if x not in watched or x in rewatchable])

# Assemble the frame in one step: a subscriber whose list ends up empty still gets a row,
# and shorter lists are not NaN-padded to the longest one.
vod_predict_1 = pd.DataFrame({'subsr': list(user_id), 'vod_id': filtered_recs})
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59900000,"[256, 3028, 245, 44, 38, 289, 292, 296, 432, 5..."
1,59930000,"[728, 456, 3, 2314, 2289, 2295, 2299, 2300, 23..."
2,60050000,"[360, 3, 2320, 2289, 2295, 2299, 2300, 2301, 2..."
3,60067000,"[954, 976, 992, 849, 1136, 2404, 810, 884, 134..."
4,60169000,"[3, 6, 2284, 2289, 2295, 2299, 2300, 2301, 230..."
...,...,...
98,66900000,"[848, 3, 2300, 2245, 2257, 2274, 2284, 2289, 2..."
99,67000000,"[3, 2775, 2274, 2284, 2289, 2295, 2299, 2300, ..."
100,67008000,"[3, 6, 2284, 2289, 2295, 2299, 2300, 2301, 230..."
101,67055000,"[315, 1156, 3, 2314, 2284, 2289, 2295, 2299, 2..."


In [None]:
# Evaluation ground truth: one row per subsr holding the array of its unique vod_id values.
testdata = (
    pd.read_csv('../data/watched_vod_10.csv', index_col=0)
    .groupby('subsr')['vod_id']
    .unique()
    .reset_index()
)
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [None]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    """Mean precision@K over the subscribers present in both frames.

    Parameters
    ----------
    testdata : DataFrame
        Column 0 is ``subsr``; column 1 holds the iterable of items actually watched.
    recommended_data : DataFrame
        Column 0 is ``subsr``; column 1 holds the ordered list of recommended items.
    K : int, default 10
        Number of leading recommendations scored per subscriber. Must be positive.

    Returns
    -------
    float
        Average over subscribers of ``|watched ∩ top-K recommended| / K``.
        Returns 0.0 when no subscriber survives the merge.

    Raises
    ------
    ValueError
        If ``K`` is not positive.
    """
    if K <= 0:
        raise ValueError("K must be a positive integer")

    # Left merge + dropna keeps only subscribers that have a recommendation row.
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()

    # Nothing to score: avoid dividing by a zero row count.
    if merge_df.empty:
        return 0.0

    precision_value = 0
    # Columns are read by position: 1 = watched items, 2 = recommended items.
    for actual, recommended in zip(merge_df.iloc[:, 1], merge_df.iloc[:, 2]):
        precision_value += len(set(actual).intersection(set(recommended[:K]))) / K

    return precision_value / merge_df.shape[0]

# NOTE(review): this scores the unfiltered `vod_predict`; the watched-movie-filtered
# list built in the previous cells is `vod_predict_1` — confirm which one is intended.
precision_k(testdata, vod_predict, K = 10)

0.08374999999999998

### 앙상블적용

In [None]:
# Score every (subscriber, VOD) pair with both fitted models so the two
# prediction tables share the same rows in the same order.
pred_cols = ['subsr', 'vod_id', 'real', 'predict']
keep_cols = ['subsr', 'vod_id', 'predict']
pairs = [(user, vod) for user in user_id for vod in vod_id]

knnbaseline_predict = pd.DataFrame(
    [knnbaseline.predict(user, vod)[0:4] for user, vod in pairs],
    columns = pred_cols,
)
knnbaseline_predict = knnbaseline_predict[keep_cols]

knnbasic_predict = pd.DataFrame(
    [knnbasic.predict(user, vod)[0:4] for user, vod in pairs],
    columns = pred_cols,
)
knnbasic_predict = knnbasic_predict[keep_cols]

In [None]:
# Ensemble score = plain average of the two models' predictions for the same pair.
# After the merge, predict_x comes from KNNBasic and predict_y from KNNBaseline.
merged = knnbasic_predict.merge(knnbaseline_predict, how = 'left', on = ['subsr', 'vod_id'])
result = merged[['subsr', 'vod_id']].assign(
    predict = merged['predict_x'].add(merged['predict_y']).div(2)
)
result

Unnamed: 0,subsr,vod_id,predict
0,59900000,0,0.361366
1,59900000,3,0.361679
2,59900000,6,0.361278
3,59900000,7,0.360955
4,59900000,8,0.354527
...,...,...,...
103336,67140000,4634,0.361918
103337,67140000,4647,0.361879
103338,67140000,4685,0.361879
103339,67140000,4698,0.361882


In [None]:
# For each subscriber, take the 30 vod_ids with the highest predicted score
# and lay them out as one wide row (columns 0..29) next to the subscriber id.
result_1 = pd.DataFrame([
    result[result['subsr'] == user]
    .sort_values(by = 'predict', ascending=False)
    .vod_id.iloc[:30]
    .tolist()
    for user in user_id
])
result_1 = pd.concat([pd.Series(user_id, name = 'subsr'), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59900000,111,3028,143,256,245,391,289,432,296,...,303,1077,720,247,858,954,123,991,357,726
1,59930000,728,456,1156,1234,848,1347,14,111,1233,...,991,357,726,1395,724,43,115,830,754,739
2,60050000,360,1156,1234,848,1347,14,111,1233,888,...,357,726,1395,724,43,115,830,754,739,32
3,60067000,954,976,849,992,884,1136,2404,810,1347,...,858,123,991,357,726,1395,724,43,115,830
4,60169000,1156,1234,848,1347,14,111,1233,888,1166,...,726,1395,724,43,115,830,754,739,32,2227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,66900000,848,1156,1234,1347,1233,888,1166,884,398,...,724,43,115,830,754,739,32,2227,1086,114
129,67000000,1156,1234,848,1347,14,111,1233,888,1166,...,726,1395,724,43,115,830,754,739,32,2227
130,67008000,1156,1234,848,1347,14,111,1233,888,1166,...,726,1395,724,43,115,830,754,739,32,2227
131,67055000,315,1156,1234,848,1347,1233,888,1166,884,...,1395,724,43,115,830,754,739,32,2227,1086


In [None]:
# Collapse the wide top-30 table into two columns: subsr and the ordered list of vod_ids.
wide_recs = result_1.copy()
wide_recs.index = user_id
# Position 0 of every row is the 'subsr' column, so it is skipped when building the list.
vod_predict = wide_recs.apply(lambda row : row.iloc[1:].tolist(), axis = 1).reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59900000,"[111, 3028, 143, 256, 245, 391, 289, 432, 296,..."
1,59930000,"[728, 456, 1156, 1234, 848, 1347, 14, 111, 123..."
2,60050000,"[360, 1156, 1234, 848, 1347, 14, 111, 1233, 88..."
3,60067000,"[954, 976, 849, 992, 884, 1136, 2404, 810, 134..."
4,60169000,"[1156, 1234, 848, 1347, 14, 111, 1233, 888, 11..."
...,...,...
128,66900000,"[848, 1156, 1234, 1347, 1233, 888, 1166, 884, ..."
129,67000000,"[1156, 1234, 848, 1347, 14, 111, 1233, 888, 11..."
130,67008000,"[1156, 1234, 848, 1347, 14, 111, 1233, 888, 11..."
131,67055000,"[315, 1156, 1234, 848, 1347, 1233, 888, 1166, ..."


In [None]:
# If a recommended VOD is a movie the user has already watched, do not recommend it.
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
# Kept as a notebook-level name; the filter below only needs the TV/kids set.
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

# Build lookups once so each membership test is a set/dict hit instead of a list scan
# or a per-user DataFrame filter.
rewatchable = set(TV_kids)
watched_by_user = {}
for subsr, watched_vod in zip(vod_score['subsr'], vod_score['vod_id']):
    watched_by_user.setdefault(subsr, set()).add(watched_vod)
recs_by_user = {}
for subsr, recs in zip(vod_predict['subsr'], vod_predict['vod_id']):
    recs_by_user.setdefault(subsr, recs)  # the first row found for a subscriber is used

# Keep a recommendation when the user has not watched it, or when it is TV/kids content.
# Already-watched titles of any other category (movies included) are dropped.
filtered_recs = []
for user in user_id:
    watched = watched_by_user.get(user, ())
    filtered_recs.append([x for x in recs_by_user[user] if x not in watched or x in rewatchable])

# Assemble the frame in one step: a subscriber whose list ends up empty still gets a row,
# and shorter lists are not NaN-padded to the longest one.
vod_predict_1 = pd.DataFrame({'subsr': list(user_id), 'vod_id': filtered_recs})
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59900000,"[111, 3028, 143, 256, 245, 391, 289, 432, 296,..."
1,59930000,"[728, 456, 1156, 1234, 848, 1347, 14, 111, 123..."
2,60050000,"[360, 1156, 1234, 848, 1347, 14, 111, 1233, 88..."
3,60067000,"[954, 976, 849, 992, 884, 1136, 2404, 810, 134..."
4,60169000,"[1156, 1234, 848, 1347, 14, 111, 1233, 888, 11..."
...,...,...
128,66900000,"[848, 1156, 1234, 1347, 1233, 888, 1166, 884, ..."
129,67000000,"[1156, 1234, 848, 1347, 14, 111, 1233, 888, 11..."
130,67008000,"[1156, 1234, 848, 1347, 14, 111, 1233, 888, 11..."
131,67055000,"[315, 1156, 1234, 848, 1347, 1233, 888, 1166, ..."


In [None]:
# Evaluation ground truth: one row per subsr holding the array of its unique vod_id values.
testdata = (
    pd.read_csv('../data/watched_vod_10.csv', index_col=0)
    .groupby('subsr')['vod_id']
    .unique()
    .reset_index()
)
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [None]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    """Mean precision@K over the subscribers present in both frames.

    Parameters
    ----------
    testdata : DataFrame
        Column 0 is ``subsr``; column 1 holds the iterable of items actually watched.
    recommended_data : DataFrame
        Column 0 is ``subsr``; column 1 holds the ordered list of recommended items.
    K : int, default 10
        Number of leading recommendations scored per subscriber. Must be positive.

    Returns
    -------
    float
        Average over subscribers of ``|watched ∩ top-K recommended| / K``.
        Returns 0.0 when no subscriber survives the merge.

    Raises
    ------
    ValueError
        If ``K`` is not positive.
    """
    if K <= 0:
        raise ValueError("K must be a positive integer")

    # Left merge + dropna keeps only subscribers that have a recommendation row.
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()

    # Nothing to score: avoid dividing by a zero row count.
    if merge_df.empty:
        return 0.0

    precision_value = 0
    # Columns are read by position: 1 = watched items, 2 = recommended items.
    for actual, recommended in zip(merge_df.iloc[:, 1], merge_df.iloc[:, 2]):
        precision_value += len(set(actual).intersection(set(recommended[:K]))) / K

    return precision_value / merge_df.shape[0]

# Score the watched-movie-filtered recommendations (vod_predict_1) with the default K = 10.
precision_k(testdata, vod_predict_1)

0.08144329896907213

### VOD 3이상 + VOD all + content 합친 결과

In [None]:
# Load the three saved recommendation tables that are merged below.
result_vod_over3 = pd.read_csv('../data/result_vod_over3.csv')
result_vod_all = pd.read_csv('../data/result_vod_all.csv')
result_content = pd.read_csv('../data/semi_con_ensemble.csv')

# Evaluation ground truth: one row per subsr holding the array of its unique vod_id values.
testdata = (
    pd.read_csv('../data/watched_vod_10.csv', index_col=0)
    .groupby('subsr')['vod_id']
    .unique()
    .reset_index()
)

In [None]:
# Subscribers present in result_vod_all but absent from result_vod_over3,
# and their rows from result_vod_all.
vod_user_und3 = set(result_vod_all.subsr.values).difference(result_vod_over3.subsr.values)
in_und3 = result_vod_all['subsr'].isin(vod_user_und3)
result_vod_und3 = result_vod_all.loc[in_und3].reset_index(drop = True)

In [None]:
import math
def to_list(string_data):
    """Parse a stringified list such as ``"[1156, 848.0, nan]"`` into a list of floats.

    Parameters
    ----------
    string_data : str, iterable, scalar or None
        Normally the text form of a list. An already-parsed iterable is accepted
        as well (so re-running the conversion does not fail), and a missing value
        (None / NaN) yields an empty list.

    Returns
    -------
    list of float
        The entries that parse as numbers, in order; NaN and unparsable entries
        are skipped.
    """
    if isinstance(string_data, str):
        items = string_data.strip('[]').split(',')
    elif hasattr(string_data, '__iter__'):
        items = string_data
    else:
        # Scalar or missing cell: treat it as a single candidate entry.
        items = [string_data]

    parsed = []
    for item in items:
        try:
            num = float(item)
        except (TypeError, ValueError):
            # Empty fragments, stray text and None are ignored on purpose.
            continue
        if not math.isnan(num):
            parsed.append(num)

    return parsed

# Turn the text vod_id column of both tables into lists of numbers.
result_vod_over3['vod_id'] = result_vod_over3['vod_id'].apply(to_list)
result_vod_und3['vod_id'] = result_vod_und3['vod_id'].apply(to_list)

In [None]:
# Subscribers that only have content-based recommendations (absent from result_vod_all).
content_user_only = set(result_content.subsr.values).difference(result_vod_all.subsr.values)
content_rows = result_content.loc[result_content['subsr'].isin(content_user_only)]
content_rows.index = content_rows['subsr']
# Everything after the first column of each row becomes that subscriber's list.
result_content_only = content_rows.apply(lambda row : row.iloc[1:].tolist(), axis = 1).reset_index()
result_content_only.columns = ['subsr', 'vod_id']

In [None]:
# Stack the per-segment tables: content-only + under-3 users, and then all three segments.
result_vod_und3_content = pd.concat([result_content_only, result_vod_und3], ignore_index = True)
result_all = pd.concat([result_content_only, result_vod_und3, result_vod_over3], ignore_index = True)

In [None]:
# precision@10 for each segment, printed in this order:
# over3, und3, content-only, und3 + content, all combined.
for segment_recs in (
    result_vod_over3,
    result_vod_und3,
    result_content_only,
    result_vod_und3_content,
    result_all,
):
    print(precision_k(testdata, segment_recs, K = 10))

0.08144329896907213
0.040000000000000015
0.015384615384615384
0.030769230769230785
0.05522388059701487


In [None]:
# Save the combined table (subsr + list column) without the row index.
result_all.to_csv('../data/vod_all.csv', index = False)

In [None]:
# Spread each recommendation list across numbered columns, fill the gaps with 0,
# cast to int and keep the first 10 columns next to the subscriber id.
# NOTE(review): 0 is used as padding but 0 also shows up as a vod_id in earlier
# prediction output — confirm the consumer of this file can tell them apart.
df_expanded = (
    pd.DataFrame(result_all.vod_id.tolist())
    .fillna(0)
    .astype(int)
    .iloc[:, :10]
)
result_all = pd.concat([result_all[['subsr']], df_expanded], axis = 1)

In [None]:
# Save the wide per-subscriber recommendation table without the row index.
result_all.to_csv('../data/recommend_all_user.csv', index = False)