In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

### VOD8,9 통합 전처리

In [15]:
vod_08 = pd.read_csv('../data/데이터스쿨3차_2308월/데이터스쿨_3차_VOD_2308.csv', encoding = 'cp949', sep = '\t')
vod_09 = pd.read_csv('../data/데이터스쿨3차_2309월/데이터스쿨_3차_VOD_2309.csv', encoding = 'cp949', sep = '\t')

# 8,9월 데이터 합치기
vod_89 = pd.concat([vod_08, vod_09], ignore_index=True)
vod_89.head(2)

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,use_tms,SMRY,ACTR_DISP,disp_rtm,strt_dt
0,65941000,(HD)그것이알고싶다 1361회(23/07/22),TV 시사/교양,기타,4800,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,1:20,20230812163507
1,66873000,(HD)그것이알고싶다 1361회(23/07/22),TV 시사/교양,기타,4800,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,1:20,20230816205227


In [16]:
def preprocessing(data):
  df = data.copy()
  
  # disp_rtm 문자열을 분 단위로 변경
  def convert_runtime(runtime_str):
    # 입력값이 NaN이면 0 반환
    if pd.isna(runtime_str):
        return 0
    hours = int(runtime_str.split(':')[0])
    minutes = int(runtime_str.split(':')[1])
    total_minutes = hours * 60 + minutes
    return total_minutes

  df['disp_rtm'] = df['disp_rtm'].apply(convert_runtime)

  # 런타임 0분인 것 제거
  df = df[df['disp_rtm'] != 0]

  # 무삭제 제거
  df['asset_nm'] = df['asset_nm'].str.replace(r'무삭제판|무삭제', '', regex = True)
  
  # 예고편 제거
  df = df[~df['asset_nm'].str.contains(r'\(예고편\)|\(예고\)', regex=True)]
  
  # 예약구매, 사전구매 제거
  df = df[~df['asset_nm'].str.contains('예약구매|사전구매')]

  # 00회가 포함된 행은 런타임이 0또는 1이므로 제거
  df = df[~df['asset_nm'].str.contains(r'\b00회\b')]
  
  ## 괄호와 그 안의 내용 제거 
  df['asset_nm'] = df['asset_nm'].str.replace(r'\([^()]*\)', '', regex=True)
  df['asset_nm'] = df['asset_nm'].str.replace(r'\[[^\[\]]*\]', '', regex=True)
  df['asset_nm'] = df['asset_nm'].str.replace(r'\.\.\.', '', regex=True)
  df['asset_nm'] = df['asset_nm'].str.replace(r'\…', '', regex=True) # … 기호 제거
  df['asset_nm'] = df['asset_nm'].str.replace('-', " ")
  df['asset_nm'] = df['asset_nm'].str.rstrip('.')
  # df['asset_nm'] = df['asset_nm'].str.strip()

  # df3['series_nm'] = df3['asset_nm'].str.replace(r'\d+회$', '', regex=True)
  # df3['series_nm'] = df3['series_nm'].str.replace(r'\d+회\.', '', regex=True) 
  # df3['series_nm'] = df3['series_nm'].str.replace(r'\d+화$', regex = True)

  # use_tms 분 단위로 변경
  df['use_tms'] = round(df['use_tms'] / 60, 1)

  # 셋탑번호가 66056000인 것 삭제 - content 에서 이상치 아이디로 판별
  df = df[df['subsr'] != 66056000]

  return df
df = preprocessing(vod_89)
df.head()

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,use_tms,SMRY,ACTR_DISP,disp_rtm,strt_dt
0,65941000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230812163507
1,66873000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230816205227
2,66873000,그것이알고싶다 1361회,TV 시사/교양,기타,12.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230829194727
3,61689000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230813130609
4,61619000,꼬리에꼬리를무는그날이야기 37회,TV 시사/교양,기타,69.7,"살인범의 미토콘드리아 - 2006 냉동고 살인사건. 2006년 7월 23일, 서울 ...","장도연,장현성,장성규",73,20230804092737


In [17]:
df = preprocessing(vod_89)
df.head()

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,use_tms,SMRY,ACTR_DISP,disp_rtm,strt_dt
0,65941000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230812163507
1,66873000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230816205227
2,66873000,그것이알고싶다 1361회,TV 시사/교양,기타,12.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230829194727
3,61689000,그것이알고싶다 1361회,TV 시사/교양,기타,80.0,살인자의 자백 그리고 아크말의 고백. 방대한 수사기록과 당시 아크말의 진술을 토대로...,김상중,80,20230813130609
4,61619000,꼬리에꼬리를무는그날이야기 37회,TV 시사/교양,기타,69.7,"살인범의 미토콘드리아 - 2006 냉동고 살인사건. 2006년 7월 23일, 서울 ...","장도연,장현성,장성규",73,20230804092737


In [18]:
# 프로그램 회차별 정보
df1 = df[['asset_nm', 'ct_cl', 'genre_of_ct_cl', 'ACTR_DISP', 'disp_rtm']].drop_duplicates().reset_index(drop = True)
df1.head()

Unnamed: 0,asset_nm,ct_cl,genre_of_ct_cl,ACTR_DISP,disp_rtm
0,그것이알고싶다 1361회,TV 시사/교양,기타,김상중,80
1,꼬리에꼬리를무는그날이야기 37회,TV 시사/교양,기타,"장도연,장현성,장성규",73
2,꼬리에꼬리를무는그날이야기 61회,TV 시사/교양,기타,"장도연,장현성,장성규",73
3,인간극장 3338회,TV 시사/교양,기타,명선 스님,32
4,꼬리에꼬리를무는그날이야기 89회,TV 시사/교양,기타,"장도연,장현성,장성규",78


In [19]:
# 유저마다 각 회차를 본 시간을 더함
df2 = pd.DataFrame(df.groupby(['subsr','asset_nm', 'ct_cl', 'genre_of_ct_cl', 'disp_rtm'])['use_tms'].sum()).reset_index()
df2 = df2[(df2['disp_rtm'] != 0) & (df2['use_tms'] != 0)]
df2

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,disp_rtm,use_tms
0,59879000,소방서 옆 경찰서 05회,TV드라마,기타,69,2.4
1,59879000,소방서 옆 경찰서 06회,TV드라마,기타,59,22.1
2,59879000,신성한 이혼 01회,TV드라마,기타,64,12.6
3,59879000,신성한 이혼 02회,TV드라마,기타,62,16.8
4,59879000,신성한 이혼 03회,TV드라마,기타,63,4.3
...,...,...,...,...,...,...
5705,67148000,타요의 씽씽극장 동요2 10회,키즈,기타,2,2.0
5706,67148000,타요의 씽씽극장 동요2 11회,키즈,기타,2,0.5
5707,67154000,스트릿 우먼 파이터 2 04회,TV 연예/오락,기타,131,1.2
5708,67161000,스파이 코드명 포춘,영화,액션/어드벤쳐,114,0.2


In [20]:
# 시청시간으로 유저가 이 회차를 시청했는지 여부를 결정
## 시청시간을 런타임으로 나눠 0.1 이 넘으면 시청한 것으로 간주
df2['watched'] = df2['use_tms'] / df2['disp_rtm']
df2['watched'] = df2['watched'].apply(lambda x : 1 if x >= 0.2 else 0)
df2.head()

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,disp_rtm,use_tms,watched
0,59879000,소방서 옆 경찰서 05회,TV드라마,기타,69,2.4,0
1,59879000,소방서 옆 경찰서 06회,TV드라마,기타,59,22.1,1
2,59879000,신성한 이혼 01회,TV드라마,기타,64,12.6,0
3,59879000,신성한 이혼 02회,TV드라마,기타,62,16.8,1
4,59879000,신성한 이혼 03회,TV드라마,기타,63,4.3,0


In [21]:
# 회차 제거
df11 = df1.copy()

df11['asset_nm'] = df11['asset_nm'].str.replace(r'\d+회$', '', regex=True)
df11['asset_nm'] = df11['asset_nm'].str.replace(r'\d+회\.', '', regex=True)
df11['asset_nm'] = df11['asset_nm'].str.replace(r'\d+화$', '', regex=True)
df11['asset_nm'] = df11['asset_nm'].str.strip()
df11['asset_nm'] = df11['asset_nm'].str.rstrip('.')
df11

Unnamed: 0,asset_nm,ct_cl,genre_of_ct_cl,ACTR_DISP,disp_rtm
0,그것이알고싶다,TV 시사/교양,기타,김상중,80
1,꼬리에꼬리를무는그날이야기,TV 시사/교양,기타,"장도연,장현성,장성규",73
2,꼬리에꼬리를무는그날이야기,TV 시사/교양,기타,"장도연,장현성,장성규",73
3,인간극장,TV 시사/교양,기타,명선 스님,32
4,꼬리에꼬리를무는그날이야기,TV 시사/교양,기타,"장도연,장현성,장성규",78
...,...,...,...,...,...
3850,엄마가 화났다,키즈,학습,-,8
3851,핑크퐁 자동차 동화,키즈,학습,핑크퐁,4
3852,간질간질,키즈,학습,-,6
3853,월간 아기상어,키즈,학습,아기상어,1


In [22]:
# 유저 시청 정보의 회차 제거한 변수 추가
df2['series_nm'] = df2['asset_nm'].str.replace(r'\d+회$', '', regex=True)
df2['series_nm'] = df2['series_nm'].str.replace(r'\d+회\.', '', regex=True)
df2['series_nm'] = df2['series_nm'].str.replace(r'\d+화$', '', regex=True)
df2['series_nm'] = df2['series_nm'].str.strip()
df2['series_nm'] = df2['series_nm'].str.rstrip('.')
df2.head()

Unnamed: 0,subsr,asset_nm,ct_cl,genre_of_ct_cl,disp_rtm,use_tms,watched,series_nm
0,59879000,소방서 옆 경찰서 05회,TV드라마,기타,69,2.4,0,소방서 옆 경찰서
1,59879000,소방서 옆 경찰서 06회,TV드라마,기타,59,22.1,1,소방서 옆 경찰서
2,59879000,신성한 이혼 01회,TV드라마,기타,64,12.6,0,신성한 이혼
3,59879000,신성한 이혼 02회,TV드라마,기타,62,16.8,1,신성한 이혼
4,59879000,신성한 이혼 03회,TV드라마,기타,63,4.3,0,신성한 이혼


In [23]:
df3 = df2[['subsr', 'series_nm', 'ct_cl',	'genre_of_ct_cl', 'watched']].copy()
df3.head()

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched
0,59879000,소방서 옆 경찰서,TV드라마,기타,0
1,59879000,소방서 옆 경찰서,TV드라마,기타,1
2,59879000,신성한 이혼,TV드라마,기타,0
3,59879000,신성한 이혼,TV드라마,기타,1
4,59879000,신성한 이혼,TV드라마,기타,0


In [24]:
# 유저의 시리즈별 시청 횟수
df4 = df3.groupby(['subsr', 'series_nm', 'ct_cl', 'genre_of_ct_cl'])['watched'].sum().reset_index()
df4 = df4[df4['watched']!= 0]
df4

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched
0,59879000,소방서 옆 경찰서,TV드라마,기타,1
1,59879000,신성한 이혼,TV드라마,기타,3
2,59895000,금이야 옥이야,TV드라마,기타,1
3,59900000,2022 역사저널 그날,TV 시사/교양,기타,1
4,59900000,그것이알고싶다,TV 시사/교양,기타,3
...,...,...,...,...,...
2066,67140000,경남 통영 2부,우리동네,연예/오락,1
2067,67140000,밀수,영화,액션/어드벤쳐,1
2068,67140000,잠자는 숲속의 공주,키즈,기타,1
2070,67148000,타요의 씽씽극장 동요2,키즈,기타,11


In [25]:
# 8~9월 시리즈별 시청된 총 횟수
df5 = pd.DataFrame(df11[['asset_nm', 'ct_cl', 'genre_of_ct_cl']].value_counts().reset_index())
df5.columns = ['series_nm', 'ct_cl', 'genre_of_ct_cl', 'watched_all']
df5

Unnamed: 0,series_nm,ct_cl,genre_of_ct_cl,watched_all
0,금이야 옥이야,TV드라마,기타,83
1,연희공략: 건륭황제의여인,TV드라마,외화 시리즈,63
2,TV소설 은희,TV드라마,기타,63
3,인간극장,TV 시사/교양,기타,55
4,런닝맨,TV 연예/오락,기타,54
...,...,...,...,...
1019,봉신연의,영화,액션/어드벤쳐,1
1020,부산,영화,액션/어드벤쳐,1
1021,북 오브 러브,영화,멜로,1
1022,분노의 질주: 라이드 오어 다이,영화,액션/어드벤쳐,1


In [26]:
df6 = df4.merge(df5, on = ['series_nm', 'ct_cl', 'genre_of_ct_cl'], how = 'left')
df6

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched,watched_all
0,59879000,소방서 옆 경찰서,TV드라마,기타,1,11
1,59879000,신성한 이혼,TV드라마,기타,3,12
2,59895000,금이야 옥이야,TV드라마,기타,1,83
3,59900000,2022 역사저널 그날,TV 시사/교양,기타,1,3
4,59900000,그것이알고싶다,TV 시사/교양,기타,3,21
...,...,...,...,...,...,...
1494,67140000,경남 통영 2부,우리동네,연예/오락,1,1
1495,67140000,밀수,영화,액션/어드벤쳐,1,2
1496,67140000,잠자는 숲속의 공주,키즈,기타,1,1
1497,67148000,타요의 씽씽극장 동요2,키즈,기타,11,13


In [27]:
# 유저의 프로그램 총 시청 횟수
df7 = df6.groupby(['subsr'])['watched'].sum().reset_index()
df7.columns = ['subsr', 'watched_cnt']
df7

Unnamed: 0,subsr,watched_cnt
0,59879000,4
1,59895000,1
2,59900000,8
3,59921000,2
4,59930000,12
...,...,...
327,67117000,10
328,67129000,1
329,67140000,3
330,67148000,11


In [459]:
df8 = df6.merge(df7, on = 'subsr', how = 'left')
df8 = df8[df8['watched_cnt'] != 0].reset_index(drop = True)
df8

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched,watched_all,watched_cnt
0,59879000,소방서 옆 경찰서,TV드라마,기타,1,11,4
1,59879000,신성한 이혼,TV드라마,기타,3,12,4
2,59895000,금이야 옥이야,TV드라마,기타,1,83,1
3,59900000,2022 역사저널 그날,TV 시사/교양,기타,1,3,8
4,59900000,그것이알고싶다,TV 시사/교양,기타,3,21,8
...,...,...,...,...,...,...,...
1494,67140000,경남 통영 2부,우리동네,연예/오락,1,1,3
1495,67140000,밀수,영화,액션/어드벤쳐,1,2,3
1496,67140000,잠자는 숲속의 공주,키즈,기타,1,1,3
1497,67148000,타요의 씽씽극장 동요2,키즈,기타,11,13,11


In [29]:
# # 시청횟수가 5개 이상인 유저만 
# user_cnt = df8['subsr'].value_counts()
# filter_users = user_cnt[user_cnt >= 2].index
# df8 = df8[df8['subsr'].isin(filter_users)]
# df8

In [726]:
# 시청 여부, 해당 시리즈 총 횟수, 유저의 프로그램 총 시청 횟수를 이용해
# 유저의 프로그램 선호도를 측정
import numpy as np
def scoring(df):
  N = df['watched_all'] # 해당 프로그램 전체 회차 수
  L = df['watched_cnt'] # 유저의 총 시청 프로그램 회차 수
  n = df['watched'] # 유저의 해당 프로그램 시청 수
  lam = np.log(2) / 2
  w1 = 1 - np.exp(-1 * lam * N)
  data = pd.DataFrame([N, L]).T
  def custom_weight(data):
    if data['watched_cnt'] < data['watched_all']:
      return data['watched_all'] / data['watched_cnt']
    else:
      return 1

  w2 = data.apply(custom_weight, axis = 1)

  score = (n / N) * w1 * w2

  return score

In [727]:
df8['score'] = scoring(df8) 
df8

Unnamed: 0,subsr,series_nm,ct_cl,genre_of_ct_cl,watched,watched_all,watched_cnt,score
0,59879000,소방서 옆 경찰서,TV드라마,기타,1,11,4,0.244476
1,59879000,신성한 이혼,TV드라마,기타,3,12,4,0.738281
2,59895000,금이야 옥이야,TV드라마,기타,1,83,1,1.000000
3,59900000,2022 역사저널 그날,TV 시사/교양,기타,1,3,8,0.215482
4,59900000,그것이알고싶다,TV 시사/교양,기타,3,21,8,0.374741
...,...,...,...,...,...,...,...,...
1494,67140000,경남 통영 2부,우리동네,연예/오락,1,1,3,0.292893
1495,67140000,밀수,영화,액션/어드벤쳐,1,2,3,0.250000
1496,67140000,잠자는 숲속의 공주,키즈,기타,1,1,3,0.292893
1497,67148000,타요의 씽씽극장 동요2,키즈,기타,11,13,11,0.988951


In [728]:
final_df = df8[['subsr', 'series_nm', 'ct_cl', 'genre_of_ct_cl', 'score']]
final_df.columns = ['subsr', 'program', 'main_cat', 'sub_cat', 'score']
final_df = final_df.sort_values(by = 'subsr').reset_index(drop = True)
final_df['category'] = final_df['main_cat'].apply(lambda x : x if x in ['영화', '키즈'] else 'TV프로그램')
final_df['rename'] = final_df['program'].apply(lambda x : x.replace(' ', '') if isinstance(x, str) else x)
final_df

Unnamed: 0,subsr,program,main_cat,sub_cat,score,category,rename
0,59879000,소방서 옆 경찰서,TV드라마,기타,0.244476,TV프로그램,소방서옆경찰서
1,59879000,신성한 이혼,TV드라마,기타,0.738281,TV프로그램,신성한이혼
2,59895000,금이야 옥이야,TV드라마,기타,1.000000,TV프로그램,금이야옥이야
3,59900000,2022 역사저널 그날,TV 시사/교양,기타,0.215482,TV프로그램,2022역사저널그날
4,59900000,그것이알고싶다,TV 시사/교양,기타,0.374741,TV프로그램,그것이알고싶다
...,...,...,...,...,...,...,...
1494,67140000,경남 통영 2부,우리동네,연예/오락,0.292893,TV프로그램,경남통영2부
1495,67140000,밀수,영화,액션/어드벤쳐,0.250000,영화,밀수
1496,67140000,잠자는 숲속의 공주,키즈,기타,0.292893,키즈,잠자는숲속의공주
1497,67148000,타요의 씽씽극장 동요2,키즈,기타,0.988951,키즈,타요의씽씽극장동요2


In [729]:
final_df.to_csv('../data/vod89.csv', index = 0)

### 모델링

In [730]:
vod = pd.read_csv('../data/vod89.csv')
vod

Unnamed: 0,subsr,program,main_cat,sub_cat,score,category,rename
0,59879000,소방서 옆 경찰서,TV드라마,기타,0.244476,TV프로그램,소방서옆경찰서
1,59879000,신성한 이혼,TV드라마,기타,0.738281,TV프로그램,신성한이혼
2,59895000,금이야 옥이야,TV드라마,기타,1.000000,TV프로그램,금이야옥이야
3,59900000,2022 역사저널 그날,TV 시사/교양,기타,0.215482,TV프로그램,2022역사저널그날
4,59900000,그것이알고싶다,TV 시사/교양,기타,0.374741,TV프로그램,그것이알고싶다
...,...,...,...,...,...,...,...
1494,67140000,경남 통영 2부,우리동네,연예/오락,0.292893,TV프로그램,경남통영2부
1495,67140000,밀수,영화,액션/어드벤쳐,0.250000,영화,밀수
1496,67140000,잠자는 숲속의 공주,키즈,기타,0.292893,키즈,잠자는숲속의공주
1497,67148000,타요의 씽씽극장 동요2,키즈,기타,0.988951,키즈,타요의씽씽극장동요2


In [731]:
vod_id = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
vod_id = vod_id[['rename', 'Category', 'vod_id']]
vod_id.columns = ['rename', 'category', 'vod_id']
vod_id

Unnamed: 0,rename,category,vod_id
0,와이낫크루뜻밖의여행,TV프로그램,0
1,그레이트뷰티,영화,1
2,해안선,영화,2
3,아는형님,TV프로그램,3
4,베놈,영화,4
...,...,...,...
5019,일루셔니스트,영화,5019
5020,"파리,13구",영화,5020
5021,미쓰백,영화,5021
5022,그녀는거짓말을너무사랑해,영화,5022


In [732]:
vod1 = vod.merge(vod_id, on = ['rename', 'category'], how = 'left')
vod1

Unnamed: 0,subsr,program,main_cat,sub_cat,score,category,rename,vod_id
0,59879000,소방서 옆 경찰서,TV드라마,기타,0.244476,TV프로그램,소방서옆경찰서,2093
1,59879000,신성한 이혼,TV드라마,기타,0.738281,TV프로그램,신성한이혼,1190
2,59895000,금이야 옥이야,TV드라마,기타,1.000000,TV프로그램,금이야옥이야,784
3,59900000,2022 역사저널 그날,TV 시사/교양,기타,0.215482,TV프로그램,2022역사저널그날,572
4,59900000,그것이알고싶다,TV 시사/교양,기타,0.374741,TV프로그램,그것이알고싶다,297
...,...,...,...,...,...,...,...,...
1494,67140000,경남 통영 2부,우리동네,연예/오락,0.292893,TV프로그램,경남통영2부,4398
1495,67140000,밀수,영화,액션/어드벤쳐,0.250000,영화,밀수,2867
1496,67140000,잠자는 숲속의 공주,키즈,기타,0.292893,키즈,잠자는숲속의공주,689
1497,67148000,타요의 씽씽극장 동요2,키즈,기타,0.988951,키즈,타요의씽씽극장동요2,880


In [733]:
vod1.to_csv('../data/vod_score.csv', index = 0)
vod_score = pd.read_csv('../data/vod_score.csv')
vod_score = vod_score[['subsr', 'vod_id', 'score']]
vod_score

Unnamed: 0,subsr,vod_id,score
0,59879000,2093,0.244476
1,59879000,1190,0.738281
2,59895000,784,1.000000
3,59900000,572,0.215482
4,59900000,297,0.374741
...,...,...,...
1494,67140000,4398,0.292893
1495,67140000,2867,0.250000
1496,67140000,689,0.292893
1497,67148000,880,0.988951


In [734]:
# 필요한 Surprise 알고리즘 불러오기
from surprise import SVD, BaselineOnly, SVDpp, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, NMF, KNNWithMeans, KNNBasic
from surprise import accuracy
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# surprise 데이터 형식으로 변환
def convert_traintest_dataframe_forsurprise(training_dataframe):
    reader = Reader(rating_scale=(0, 1)) # 이 범위를 넘으면 양극값으로 대체
    trainset = Dataset.load_from_df(training_dataframe[['subsr', 'vod_id', 'score']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    return trainset

trainset = convert_traintest_dataframe_forsurprise(vod_score)

### KNNBaseline

In [735]:
# from surprise.model_selection import GridSearchCV
# param_grid = {'k' : np.arange(1, 20, 1), 'sim_options' : {'name' : ['pearson_baseline', 'cosine'], 'user_based' : [True, False]}, 'random_state' : [42], 'verbose' : [False]}
            
# gs = GridSearchCV(KNNBaseline, param_grid, measures=['mae'], cv = 3)

# reader = Reader(rating_scale=(0,1))
# train_set = Dataset.load_from_df(vod_score[['subsr', 'vod_id', 'score']], reader)

# gs.fit(train_set)
# gs.best_params

In [719]:
sim_options = {'name': 'pearson_baseline', 'user_based': False} # itemr-based similarity
bsl_options = {'method' : 'als', 'n_epochs' : 100}
knnbaseline = KNNBaseline(k = 40, sim_options=sim_options, random_state = 42, min_k= 1, bsl_options=bsl_options)
knnbaseline.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x20370bf86d0>

In [720]:
user_id = sorted(vod_score.subsr.unique())
vod_id = sorted(vod_score.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        result.append(knnbaseline.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59879000,0,0.396399
1,59879000,3,0.436839
2,59879000,6,0.383902
3,59879000,7,0.370955
4,59879000,8,0.455422
...,...,...,...
270907,67164000,4634,0.406207
270908,67164000,4647,0.402818
270909,67164000,4685,0.400872
270910,67164000,4698,0.402747


In [721]:
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:30].tolist())

result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59879000,1190,880,1166,296,143,1156,1234,848,888,...,724,112,824,884,498,954,830,357,398,739
1,59895000,747,784,738,880,1166,296,143,1156,1234,...,858,720,724,112,824,884,498,954,830,357
2,59900000,296,143,880,1166,1156,111,1234,14,848,...,720,724,112,824,884,498,954,830,357,398
3,59921000,880,1166,296,143,1156,1234,848,888,401,...,112,824,884,498,954,830,357,398,739,1157
4,59930000,728,880,1166,296,143,1156,1234,848,888,...,724,112,824,884,498,954,830,357,398,739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,67117000,818,880,1166,296,143,1156,1234,848,888,...,112,824,884,498,954,830,357,398,739,1157
328,67129000,880,1166,296,143,1156,1234,848,888,401,...,112,824,884,498,954,830,357,398,739,1157
329,67140000,880,1166,296,143,1156,1234,848,888,401,...,112,824,884,498,954,830,357,398,739,1157
330,67148000,880,1166,296,143,1156,1234,848,888,401,...,112,824,884,498,954,830,357,398,739,1157


In [722]:
vod_predict = result_1.copy()
vod_predict.index = user_id
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59879000,"[1190, 880, 1166, 296, 143, 1156, 1234, 848, 8..."
1,59895000,"[747, 784, 738, 880, 1166, 296, 143, 1156, 123..."
2,59900000,"[296, 143, 880, 1166, 1156, 111, 1234, 14, 848..."
3,59921000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
4,59930000,"[728, 880, 1166, 296, 143, 1156, 1234, 848, 88..."
...,...,...
327,67117000,"[818, 880, 1166, 296, 143, 1156, 1234, 848, 88..."
328,67129000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
329,67140000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
330,67148000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."


In [723]:
# 추천 VOD가 영화인 경우, 본 적이 있다면 추천안함
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

vod_predict_1 = pd.DataFrame()
for user in user_id:
    li = []
    watched_list = vod_score[vod_score.subsr == user].vod_id.tolist()
    rec_list = vod_predict[vod_predict.subsr == user].vod_id.tolist()[0]
    for x in rec_list:
        if x not in watched_list:
            li.append(x)
        elif x in watched_list and x in TV_kids:
            li.append(x)
        elif x in watched_list and x in movie:
            continue
    
    vod_predict_1 = pd.concat([vod_predict_1, pd.DataFrame(li).T], axis= 0)

vod_predict_1.index = user_id
vod_predict_1 = vod_predict_1.apply(lambda x : x.tolist(), axis = 1)
vod_predict_1 = vod_predict_1.reset_index()
vod_predict_1.columns = ['subsr', 'vod_id']
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59879000,"[1190.0, 880.0, 1166.0, 296.0, 143.0, 1156.0, ..."
1,59895000,"[747.0, 784.0, 738.0, 880.0, 1166.0, 296.0, 14..."
2,59900000,"[296.0, 143.0, 880.0, 1166.0, 1156.0, 111.0, 1..."
3,59921000,"[880.0, 1166.0, 296.0, 143.0, 1156.0, 1234.0, ..."
4,59930000,"[728.0, 880.0, 1166.0, 296.0, 143.0, 1156.0, 1..."
...,...,...
327,67117000,"[818.0, 880.0, 1166.0, 296.0, 143.0, 1156.0, 1..."
328,67129000,"[880.0, 1166.0, 296.0, 143.0, 1156.0, 1234.0, ..."
329,67140000,"[880.0, 1166.0, 296.0, 143.0, 1156.0, 1234.0, ..."
330,67148000,"[880.0, 1166.0, 296.0, 143.0, 1156.0, 1234.0, ..."


In [724]:
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [725]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()
    precision_value = 0
    for i in range(merge_df.shape[0]):
        temp = len(set(merge_df.iloc[i,1]).intersection(set(merge_df.iloc[i, 2][:K]))) / K
        precision_value += temp
        
    return precision_value / merge_df.shape[0]

precision_k(testdata, vod_predict)

0.05925925925925921

In [659]:
temp = vod_predict.iloc[100, 1][:10]

In [709]:
for i in temp:
    print(vod_list[['name', 'vod_id']].iloc[i, 0])

소방서 옆 경찰서 그리고 국과수
뽀로로 인기 동요
천원짜리 변호사
신발 벗고 돌싱포맨
경경아심 : 두근두근 내 마음을 들어봐
부군청자중 : 부군님, 자중하시어요
연희공략: 건륭황제의여인
포청천 포공기안
TV소설 은희
뿌리깊은나무


### SVD, SVDpp

In [792]:
# n_epochs=95, lr_all=0.005, reg_all=0.06, random_state=42 -> 0.045
svd = SVD(n_factors=300, n_epochs=150, lr_all=0.005, reg_all=0.05, random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20367d83c10>

In [793]:
user_id = sorted(vod_score.subsr.unique())
vod_id = sorted(vod_score.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        result.append(svd.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59879000,0,0.559416
1,59879000,3,0.185620
2,59879000,6,0.432697
3,59879000,7,0.446245
4,59879000,8,0.556422
...,...,...,...
270907,67164000,4634,0.584106
270908,67164000,4647,0.343946
270909,67164000,4685,0.200436
270910,67164000,4698,0.335769


In [794]:
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:30].tolist())

result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59879000,816,824,1168,830,2034,991,1370,719,1624,...,2119,2775,2039,821,887,401,2148,3463,757,3678
1,59895000,784,2227,917,2124,23,1156,2784,1551,490,...,618,115,1190,1331,888,1374,174,3331,2344,663
2,59900000,1313,1068,880,2300,117,29,550,1130,3673,...,333,533,974,1166,2227,530,303,3,2812,366
3,59921000,731,976,1207,3167,3826,3680,320,1234,2489,...,3913,1625,476,685,51,3459,249,2245,973,884
4,59930000,728,1834,1156,699,516,1086,14,1157,1519,...,333,679,1313,401,684,1516,2044,316,1168,2245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,67117000,866,4484,1063,818,550,4310,991,1619,241,...,1086,1313,1411,1156,14,1068,1347,2429,296,2886
328,67129000,244,816,4383,1429,368,4325,3243,3126,241,...,694,1166,332,2720,1632,957,1168,3740,2775,3450
329,67140000,731,1234,880,117,410,789,1313,1244,728,...,1156,2135,739,72,3,1606,381,4255,553,686
330,67148000,752,880,1202,731,1193,530,1234,2615,1639,...,3764,3936,486,908,1135,958,3840,778,390,4127


In [795]:
vod_predict = result_1.copy()
vod_predict.index = user_id
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59879000,"[816, 824, 1168, 830, 2034, 991, 1370, 719, 16..."
1,59895000,"[784, 2227, 917, 2124, 23, 1156, 2784, 1551, 4..."
2,59900000,"[1313, 1068, 880, 2300, 117, 29, 550, 1130, 36..."
3,59921000,"[731, 976, 1207, 3167, 3826, 3680, 320, 1234, ..."
4,59930000,"[728, 1834, 1156, 699, 516, 1086, 14, 1157, 15..."
...,...,...
327,67117000,"[866, 4484, 1063, 818, 550, 4310, 991, 1619, 2..."
328,67129000,"[244, 816, 4383, 1429, 368, 4325, 3243, 3126, ..."
329,67140000,"[731, 1234, 880, 117, 410, 789, 1313, 1244, 72..."
330,67148000,"[752, 880, 1202, 731, 1193, 530, 1234, 2615, 1..."


In [796]:
# 추천 VOD가 영화인 경우, 본 적이 있다면 추천안함
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

vod_predict_1 = pd.DataFrame()
for user in user_id:
    li = []
    watched_list = vod_score[vod_score.subsr == user].vod_id.tolist()
    rec_list = vod_predict[vod_predict.subsr == user].vod_id.tolist()[0]
    for x in rec_list:
        if x not in watched_list:
            li.append(x)
        elif x in watched_list and x in TV_kids:
            li.append(x)
        elif x in watched_list and x in movie:
            continue
    
    vod_predict_1 = pd.concat([vod_predict_1, pd.DataFrame(li).T], axis= 0)

vod_predict_1.index = user_id
vod_predict_1 = vod_predict_1.apply(lambda x : x.tolist(), axis = 1)
vod_predict_1 = vod_predict_1.reset_index()
vod_predict_1.columns = ['subsr', 'vod_id']
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59879000,"[816, 824, 1168, 830, 2034, 991, 1370, 719, 16..."
1,59895000,"[784, 2227, 917, 2124, 23, 1156, 2784, 1551, 4..."
2,59900000,"[1313, 1068, 880, 2300, 117, 29, 550, 1130, 36..."
3,59921000,"[731, 976, 1207, 3167, 3826, 3680, 320, 1234, ..."
4,59930000,"[728, 1834, 1156, 699, 516, 1086, 14, 1157, 15..."
...,...,...
327,67117000,"[866, 4484, 1063, 818, 550, 4310, 991, 1619, 2..."
328,67129000,"[244, 816, 4383, 1429, 368, 4325, 3243, 3126, ..."
329,67140000,"[731, 1234, 880, 117, 410, 789, 1313, 1244, 72..."
330,67148000,"[752, 880, 1202, 731, 1193, 530, 1234, 2615, 1..."


In [797]:
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [798]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()
    precision_value = 0
    for i in range(merge_df.shape[0]):
        temp = len(set(merge_df.iloc[i,1]).intersection(set(merge_df.iloc[i, 2][:K]))) / K
        precision_value += temp
        
    return precision_value / merge_df.shape[0]

precision_k(testdata, vod_predict, K = 10)

0.036419753086419746

### BaselineOnly

In [623]:
bsl_options = {'method' : 'als', 'n_epochs' : 60}
baseline = BaselineOnly(bsl_options=bsl_options)
baseline.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x230b3451e90>

In [624]:
user_id = sorted(vod_score.subsr.unique())
vod_id = sorted(vod_score.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        result.append(baseline.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59879000,0,0.386975
1,59879000,3,0.426753
2,59879000,6,0.373757
3,59879000,7,0.360225
4,59879000,8,0.452345
...,...,...,...
270907,67164000,4634,0.395563
270908,67164000,4647,0.392065
270909,67164000,4685,0.390043
270910,67164000,4698,0.391973


In [625]:
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:30].tolist())

result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59879000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157
1,59895000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157
2,59900000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157
3,59921000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157
4,59930000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,67117000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157
328,67129000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157
329,67140000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157
330,67148000,880,1166,296,143,1156,1234,848,888,401,...,824,8,112,498,884,954,830,357,398,1157


In [626]:
vod_predict = result_1.copy()
vod_predict.index = user_id
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59879000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
1,59895000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
2,59900000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
3,59921000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
4,59930000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
...,...,...
327,67117000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
328,67129000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
329,67140000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."
330,67148000,"[880, 1166, 296, 143, 1156, 1234, 848, 888, 40..."


In [620]:
# 추천 VOD가 영화인 경우, 본 적이 있다면 추천안함
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

vod_predict_1 = pd.DataFrame()
for user in user_id:
    li = []
    watched_list = vod_score[vod_score.subsr == user].vod_id.tolist()
    rec_list = vod_predict[vod_predict.subsr == user].vod_id.tolist()[0]
    for x in rec_list:
        if x not in watched_list:
            li.append(x)
        elif x in watched_list and x in TV_kids:
            li.append(x)
        elif x in watched_list and x in movie:
            continue
    
    vod_predict_1 = pd.concat([vod_predict_1, pd.DataFrame(li).T], axis= 0)

vod_predict_1.index = user_id
vod_predict_1 = vod_predict_1.apply(lambda x : x.tolist(), axis = 1)
vod_predict_1 = vod_predict_1.reset_index()
vod_predict_1.columns = ['subsr', 'vod_id']
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59879000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."
1,59895000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."
2,59900000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."
3,59921000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."
4,59930000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."
...,...,...
327,67117000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."
328,67129000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."
329,67140000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."
330,67148000,"[880, 1156, 1166, 848, 1234, 296, 143, 888, 40..."


In [621]:
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [622]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()
    precision_value = 0
    for i in range(merge_df.shape[0]):
        temp = len(set(merge_df.iloc[i,1]).intersection(set(merge_df.iloc[i, 2][:K]))) / K
        precision_value += temp
        
    return precision_value / merge_df.shape[0]

precision_k(testdata, vod_predict, K = 10)

0.02037037037037038

### KNNBasic

In [263]:
sim_options = {'name': 'cosine', 'user_based': True} # itemr-based similarity
knnbasic = KNNBasic(k = 40, sim_options=sim_options, random_state = 42)
knnbasic.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2035b721050>

In [264]:
user_id = sorted(vod_score.subsr.unique())
vod_id = sorted(vod_score.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        result.append(knnbasic.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59879000,0,0.364483
1,59879000,3,0.364483
2,59879000,6,0.292893
3,59879000,7,0.364483
4,59879000,8,0.364483
...,...,...,...
149167,67140000,4634,0.364483
149168,67140000,4647,0.364483
149169,67140000,4685,0.364483
149170,67140000,4698,0.364483


In [247]:
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:30].tolist())

result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59879000,1190,0,2444,2382,2384,2387,2390,2396,2404,...,2495,2496,2501,2515,2364,2357,2356,2353,2218,2223
1,59900000,38,13,289,296,143,835,1156,245,8,...,2468,2424,2425,2429,2357,2444,2457,2462,2483,2433
2,59921000,0,2356,2364,2382,2384,2387,2390,2396,2404,...,2495,2496,2501,2357,2353,2518,2352,2148,2170,2218
3,59930000,728,456,0,2429,2382,2384,2387,2390,2396,...,2489,2495,2496,2501,2364,2357,2356,2353,2218,2223
4,60050000,360,0,2444,2382,2384,2387,2390,2396,2404,...,2495,2496,2501,2515,2364,2356,2979,2353,2218,2223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,67000000,0,2444,2382,2384,2387,2390,2396,2404,2424,...,2496,2501,2515,2518,2364,2357,2356,2353,2173,2218
182,67008000,0,2364,2384,2387,2390,2396,2404,2424,2425,...,2501,2515,2518,2382,2357,2539,2356,2173,2218,2223
183,67055000,315,1156,0,2425,2357,2364,2382,2384,2387,...,2469,2483,2489,2495,2356,2352,2141,2344,2170,2173
184,67107000,0,2343,2352,2353,2356,2357,2364,2382,2384,...,2468,2469,2483,2344,2338,2495,2323,2135,2140,2141


In [248]:
vod_predict = result_1.copy()
vod_predict.index = user_id
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59879000,"[1190, 0, 2444, 2382, 2384, 2387, 2390, 2396, ..."
1,59900000,"[38, 13, 289, 296, 143, 835, 1156, 245, 8, 111..."
2,59921000,"[0, 2356, 2364, 2382, 2384, 2387, 2390, 2396, ..."
3,59930000,"[728, 456, 0, 2429, 2382, 2384, 2387, 2390, 23..."
4,60050000,"[360, 0, 2444, 2382, 2384, 2387, 2390, 2396, 2..."
...,...,...
181,67000000,"[0, 2444, 2382, 2384, 2387, 2390, 2396, 2404, ..."
182,67008000,"[0, 2364, 2384, 2387, 2390, 2396, 2404, 2424, ..."
183,67055000,"[315, 1156, 0, 2425, 2357, 2364, 2382, 2384, 2..."
184,67107000,"[0, 2343, 2352, 2353, 2356, 2357, 2364, 2382, ..."


In [249]:
# 추천 VOD가 영화인 경우, 본 적이 있다면 추천안함
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

vod_predict_1 = pd.DataFrame()
for user in user_id:
    li = []
    watched_list = vod_score[vod_score.subsr == user].vod_id.tolist()
    rec_list = vod_predict[vod_predict.subsr == user].vod_id.tolist()[0]
    for x in rec_list:
        if x not in watched_list:
            li.append(x)
        elif x in watched_list and x in TV_kids:
            li.append(x)
        elif x in watched_list and x in movie:
            continue
    
    vod_predict_1 = pd.concat([vod_predict_1, pd.DataFrame(li).T], axis= 0)

vod_predict_1.index = user_id
vod_predict_1 = vod_predict_1.apply(lambda x : x.tolist(), axis = 1)
vod_predict_1 = vod_predict_1.reset_index()
vod_predict_1.columns = ['subsr', 'vod_id']
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59879000,"[1190, 0, 2444, 2382, 2384, 2387, 2390, 2396, ..."
1,59900000,"[38, 13, 289, 296, 143, 835, 1156, 245, 8, 111..."
2,59921000,"[0, 2356, 2364, 2382, 2384, 2387, 2390, 2396, ..."
3,59930000,"[728, 456, 0, 2429, 2382, 2384, 2387, 2390, 23..."
4,60050000,"[360, 0, 2444, 2382, 2384, 2387, 2390, 2396, 2..."
...,...,...
181,67000000,"[0, 2444, 2382, 2384, 2387, 2390, 2396, 2404, ..."
182,67008000,"[0, 2364, 2384, 2387, 2390, 2396, 2404, 2424, ..."
183,67055000,"[315, 1156, 0, 2425, 2357, 2364, 2382, 2384, 2..."
184,67107000,"[0, 2343, 2352, 2353, 2356, 2357, 2364, 2382, ..."


In [250]:
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [251]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()
    precision_value = 0
    for i in range(merge_df.shape[0]):
        temp = len(set(merge_df.iloc[i,1]).intersection(set(merge_df.iloc[i, 2][:K]))) / K
        precision_value += temp
        
    return precision_value / merge_df.shape[0]

precision_k(testdata, vod_predict, K = 10)

0.0704918032786885

### 앙상블적용

In [799]:
knnbaseline_predict = []
svd_predict = []
for user in user_id:
    for vod in vod_id:
        knnbaseline_predict.append(knnbaseline.predict(user, vod)[0:4])
        svd_predict.append(svd.predict(user, vod)[0:4])
        
knnbaseline_predict = pd.DataFrame(knnbaseline_predict, columns = ['subsr', 'vod_id', 'real', 'predict'])
knnbaseline_predict = knnbaseline_predict[['subsr', 'vod_id', 'predict']]

svd_predict = pd.DataFrame(svd_predict, columns = ['subsr', 'vod_id', 'real', 'predict'])
svd_predict = svd_predict[['subsr', 'vod_id', 'predict']]

In [800]:
result = svd_predict.merge(knnbaseline_predict, how = 'left', on = ['subsr', 'vod_id'])
result['predict'] = (result['predict_x'] + result['predict_y']) / 2
result = result[['subsr' , 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59879000,0,0.477907
1,59879000,3,0.311230
2,59879000,6,0.408300
3,59879000,7,0.408600
4,59879000,8,0.505922
...,...,...,...
270907,67164000,4634,0.495156
270908,67164000,4647,0.373382
270909,67164000,4685,0.300654
270910,67164000,4698,0.369258


In [801]:
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:30].tolist())

result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,59879000,1190,816,824,830,1168,991,1166,719,2034,...,888,818,2775,695,1606,441,296,757,984,887
1,59895000,784,747,2227,1156,880,1166,1234,738,1551,...,296,2559,788,3441,2058,858,2462,1523,1374,1190
2,59900000,880,1313,1068,1166,29,117,1130,550,296,...,530,4105,61,303,848,4325,533,143,3,858
3,59921000,731,1166,880,976,1234,1207,3167,320,3826,...,441,249,1927,685,2245,816,1176,973,3913,143
4,59930000,728,1156,1834,14,699,1086,1157,516,1166,...,720,888,700,498,1168,4490,731,244,1516,2245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,67117000,818,866,1063,4484,848,550,1234,296,991,...,1347,1086,3441,1313,1669,1068,3719,992,824,143
328,67129000,816,244,1166,368,4383,1156,1429,241,1234,...,2775,2227,2135,719,1347,448,1632,2720,332,1346
329,67140000,880,1234,731,117,1156,410,789,1313,1244,...,3,4308,4698,1435,2518,381,2135,72,143,1606
330,67148000,880,1234,752,731,530,1202,720,1639,1193,...,3840,908,1135,976,822,163,410,3936,849,3764


In [802]:
vod_predict = result_1.copy()
vod_predict.index = user_id
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59879000,"[1190, 816, 824, 830, 1168, 991, 1166, 719, 20..."
1,59895000,"[784, 747, 2227, 1156, 880, 1166, 1234, 738, 1..."
2,59900000,"[880, 1313, 1068, 1166, 29, 117, 1130, 550, 29..."
3,59921000,"[731, 1166, 880, 976, 1234, 1207, 3167, 320, 3..."
4,59930000,"[728, 1156, 1834, 14, 699, 1086, 1157, 516, 11..."
...,...,...
327,67117000,"[818, 866, 1063, 4484, 848, 550, 1234, 296, 99..."
328,67129000,"[816, 244, 1166, 368, 4383, 1156, 1429, 241, 1..."
329,67140000,"[880, 1234, 731, 117, 1156, 410, 789, 1313, 12..."
330,67148000,"[880, 1234, 752, 731, 530, 1202, 720, 1639, 11..."


In [803]:
# 추천 VOD가 영화인 경우, 본 적이 있다면 추천안함
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

vod_predict_1 = pd.DataFrame()
for user in user_id:
    li = []
    watched_list = vod_score[vod_score.subsr == user].vod_id.tolist()
    rec_list = vod_predict[vod_predict.subsr == user].vod_id.tolist()[0]
    for x in rec_list:
        if x not in watched_list:
            li.append(x)
        elif x in watched_list and x in TV_kids:
            li.append(x)
        elif x in watched_list and x in movie:
            continue
    
    vod_predict_1 = pd.concat([vod_predict_1, pd.DataFrame(li).T], axis= 0)

vod_predict_1.index = user_id
vod_predict_1 = vod_predict_1.apply(lambda x : x.tolist(), axis = 1)
vod_predict_1 = vod_predict_1.reset_index()
vod_predict_1.columns = ['subsr', 'vod_id']
vod_predict_1

Unnamed: 0,subsr,vod_id
0,59879000,"[1190, 816, 824, 830, 1168, 991, 1166, 719, 20..."
1,59895000,"[784, 747, 2227, 1156, 880, 1166, 1234, 738, 1..."
2,59900000,"[880, 1313, 1068, 1166, 29, 117, 1130, 550, 29..."
3,59921000,"[731, 1166, 880, 976, 1234, 1207, 3167, 320, 3..."
4,59930000,"[728, 1156, 1834, 14, 699, 1086, 1157, 516, 11..."
...,...,...
327,67117000,"[818, 866, 1063, 4484, 848, 550, 1234, 296, 99..."
328,67129000,"[816, 244, 1166, 368, 4383, 1156, 1429, 241, 1..."
329,67140000,"[880, 1234, 731, 117, 1156, 410, 789, 1313, 12..."
330,67148000,"[880, 1234, 752, 731, 530, 1202, 720, 1639, 11..."


In [804]:
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[200, 2415, 2869, 2546, 213, 2291, 2610, 4880]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [805]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()
    precision_value = 0
    for i in range(merge_df.shape[0]):
        temp = len(set(merge_df.iloc[i,1]).intersection(set(merge_df.iloc[i, 2][:K]))) / K
        precision_value += temp
        
    return precision_value / merge_df.shape[0]

precision_k(testdata, vod_predict)

0.04753086419753083