In [680]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action= 'ignore')

In [681]:
vod_heavy_id = pd.read_csv('../data/vod89_heavy.csv')
vod_medium_id = pd.read_csv('../data/vod89_medium.csv')
vod_light_id = pd.read_csv('../data/vod89_light.csv')

In [682]:
# 필요한 Surprise 알고리즘 불러오기
from surprise import SVD, BaselineOnly, SVDpp, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, NMF, KNNWithMeans, KNNBasic
from surprise import accuracy
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# surprise 데이터 형식으로 변환
def convert_traintest_dataframe_forsurprise(training_dataframe):
    reader = Reader(rating_scale=(0, 1)) # 이 범위를 넘으면 양극값으로 대체
    trainset = Dataset.load_from_df(training_dataframe[['subsr', 'vod_id', 'score']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    return trainset

trainset = convert_traintest_dataframe_forsurprise(vod_heavy_id)

In [683]:
sim_options = {'name': 'pearson_baseline', 'user_based': False} # item-based similarity
bsl_options = {'method' : 'sgd', 'n_epochs' : 1}
knnbaseline = KNNBaseline(k = 40, sim_options=sim_options, random_state = 42, min_k= 1, 
                          bsl_options=bsl_options)

knnbaseline.fit(trainset)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x24715b47150>

In [684]:
user_id = sorted(vod_heavy_id.subsr.unique())
vod_id = sorted(vod_heavy_id.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        result.append(knnbaseline.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59930000,3,0.363466
1,59930000,6,0.363044
2,59930000,8,0.353509
3,59930000,12,0.360740
4,59930000,13,0.357201
...,...,...,...
33920,67055000,4545,0.353366
33921,67055000,4553,0.353365
33922,67055000,4629,0.353365
33923,67055000,4647,0.353357


In [685]:
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:25].tolist())

result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,24
0,59930000,728,456,1156,1347,111,888,14,398,884,...,754,858,114,816,356,789,1506,954,719,1639
1,60067000,954,976,992,849,1136,2404,810,884,1347,...,720,726,724,1395,1077,754,858,114,816,356
2,60224000,410,835,1166,256,508,1886,784,747,1178,...,398,884,848,720,726,724,1395,1077,754,858
3,60326000,1639,887,1086,381,2041,1347,778,884,2357,...,303,162,1785,2045,1156,111,888,14,398,848
4,60463000,699,690,1156,1347,111,888,14,398,884,...,754,858,114,816,356,789,1506,954,719,1639
5,60506000,1156,1347,111,888,14,398,884,848,720,...,114,816,356,789,1506,954,719,1639,796,1086
6,60510000,1156,1347,111,888,14,398,884,848,720,...,114,816,356,789,1506,954,719,1639,796,1086
7,60593000,1506,14,888,1156,835,30,111,1063,861,...,256,538,13,1278,245,1347,38,72,398,884
8,60939000,1156,1347,111,888,14,398,884,848,720,...,114,816,356,789,1506,954,719,1639,796,1086
9,60955000,738,958,842,542,289,1156,1886,1347,111,...,726,724,1395,1077,754,858,114,816,356,789


In [686]:
vod_predict = result_1.copy()
vod_predict.index = user_id
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59930000,"[728, 456, 1156, 1347, 111, 888, 14, 398, 884,..."
1,60067000,"[954, 976, 992, 849, 1136, 2404, 810, 884, 134..."
2,60224000,"[410, 835, 1166, 256, 508, 1886, 784, 747, 117..."
3,60326000,"[1639, 887, 1086, 381, 2041, 1347, 778, 884, 2..."
4,60463000,"[699, 690, 1156, 1347, 111, 888, 14, 398, 884,..."
5,60506000,"[1156, 1347, 111, 888, 14, 398, 884, 848, 720,..."
6,60510000,"[1156, 1347, 111, 888, 14, 398, 884, 848, 720,..."
7,60593000,"[1506, 14, 888, 1156, 835, 30, 111, 1063, 861,..."
8,60939000,"[1156, 1347, 111, 888, 14, 398, 884, 848, 720,..."
9,60955000,"[738, 958, 842, 542, 289, 1156, 1886, 1347, 11..."


In [687]:
# 추천 VOD가 영화인 경우, 본 적이 있다면 추천안함
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

vod_predict_1 = pd.DataFrame()
for user in user_id:
    li = []
    watched_list = vod_heavy_id[vod_heavy_id.subsr == user].vod_id.tolist()
    rec_list = vod_predict[vod_predict.subsr == user].vod_id.tolist()[0]
    for x in rec_list:
        if x not in watched_list:
            li.append(x)
        elif x in watched_list and x in TV_kids:
            li.append(x)
        elif x in watched_list and x in movie:
            continue
    
    vod_predict_1 = pd.concat([vod_predict_1, pd.DataFrame(li).T], axis= 0)

vod_predict_1.index = user_id
vod_predict_1 = vod_predict_1.apply(lambda x : x.tolist(), axis = 1)
vod_predict_1 = vod_predict_1.reset_index()
vod_predict_1.columns = ['subsr', 'vod_id']
result_vod_heavy = vod_predict_1.copy()

In [688]:
trainset = convert_traintest_dataframe_forsurprise(vod_medium_id)

In [689]:
sim_options = {'name': 'pearson_baseline', 'user_based': False} # item-based similarity
bsl_options = {'method' : 'sgd', 'n_epochs' : 1}
knnbaseline = KNNBaseline(k = 40, sim_options=sim_options, random_state = 42, min_k= 1, 
                          bsl_options=bsl_options)

knnbaseline.fit(trainset)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x24715d07990>

In [690]:
user_id = sorted(vod_medium_id.subsr.unique())
vod_id = sorted(vod_medium_id.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        result.append(knnbaseline.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59900000,3,0.362002
1,59900000,6,0.361591
2,59900000,7,0.361260
3,59900000,8,0.600165
4,59900000,12,0.359262
...,...,...,...
66224,67140000,4553,0.360266
66225,67140000,4627,0.360248
66226,67140000,4629,0.360262
66227,67140000,4647,0.360259


In [691]:
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:25].tolist())

result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,24
0,59900000,292,3028,531,256,16,245,38,289,432,...,848,1347,111,888,14,1077,398,884,1166,720
1,59930000,728,456,1156,848,1347,111,888,14,1077,...,726,1395,830,43,32,858,739,816,356,114
2,60050000,360,1156,848,1347,111,888,14,1077,398,...,1395,830,43,32,858,739,816,356,114,51
3,60067000,954,976,992,849,1136,2404,810,884,1347,...,398,1166,720,754,724,726,1395,830,43,32
4,60169000,1156,848,1347,111,888,14,1077,398,884,...,830,43,32,858,739,816,356,114,51,789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,66900000,848,1156,1347,888,1077,398,884,1166,720,...,32,858,739,816,356,114,51,789,1639,954
99,67000000,1156,848,1347,111,888,14,1077,398,884,...,830,43,32,858,739,816,356,114,51,789
100,67008000,1156,848,1347,111,888,14,1077,398,884,...,830,43,32,858,739,816,356,114,51,789
101,67055000,315,1156,848,1347,888,1077,398,884,1166,...,43,32,858,739,816,356,114,51,789,1639


In [692]:
vod_predict = result_1.copy()
vod_predict.index = user_id
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59900000,"[292, 3028, 531, 256, 16, 245, 38, 289, 432, 2..."
1,59930000,"[728, 456, 1156, 848, 1347, 111, 888, 14, 1077..."
2,60050000,"[360, 1156, 848, 1347, 111, 888, 14, 1077, 398..."
3,60067000,"[954, 976, 992, 849, 1136, 2404, 810, 884, 134..."
4,60169000,"[1156, 848, 1347, 111, 888, 14, 1077, 398, 884..."
...,...,...
98,66900000,"[848, 1156, 1347, 888, 1077, 398, 884, 1166, 7..."
99,67000000,"[1156, 848, 1347, 111, 888, 14, 1077, 398, 884..."
100,67008000,"[1156, 848, 1347, 111, 888, 14, 1077, 398, 884..."
101,67055000,"[315, 1156, 848, 1347, 888, 1077, 398, 884, 11..."


In [693]:
# 추천 VOD가 영화인 경우, 본 적이 있다면 추천안함
vod_list = pd.read_csv('../data/vod_list_add10.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

vod_predict_1 = pd.DataFrame()
for user in user_id:
    li = []
    watched_list = vod_medium_id[vod_medium_id.subsr == user].vod_id.tolist()
    rec_list = vod_predict[vod_predict.subsr == user].vod_id.tolist()[0]
    for x in rec_list:
        if x not in watched_list:
            li.append(x)
        elif x in watched_list and x in TV_kids:
            li.append(x)
        elif x in watched_list and x in movie:
            continue
    
    vod_predict_1 = pd.concat([vod_predict_1, pd.DataFrame(li).T], axis= 0)

vod_predict_1.index = user_id
vod_predict_1 = vod_predict_1.apply(lambda x : x.tolist(), axis = 1)
vod_predict_1 = vod_predict_1.reset_index()
vod_predict_1.columns = ['subsr', 'vod_id']
result_vod_medium = vod_predict_1.copy()

In [694]:
trainset = convert_traintest_dataframe_forsurprise(vod_light_id)

In [695]:
sim_options = {'name': 'pearson_baseline', 'user_based': False} # item-based similarity
bsl_options = {'method' : 'sgd', 'n_epochs' : 1}
knnbaseline = KNNBaseline(k = 40, sim_options=sim_options, random_state = 42, min_k= 1, 
                          bsl_options=bsl_options)

knnbaseline.fit(trainset)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x247174acad0>

In [696]:
user_id = sorted(vod_light_id.subsr.unique())
vod_id = sorted(vod_light_id.vod_id.unique())

result = []
for user in user_id:
    for vod in vod_id:
        result.append(knnbaseline.predict(user, vod)[0:4])

result = pd.DataFrame(result, columns = ['subsr', 'vod_id', 'real', 'predict'])
result = result[['subsr', 'vod_id', 'predict']]
result

Unnamed: 0,subsr,vod_id,predict
0,59895000,3,0.399132
1,59895000,6,0.398205
2,59895000,7,0.395673
3,59895000,8,0.404826
4,59895000,10,0.398719
...,...,...,...
204802,67164000,4627,0.396225
204803,67164000,4629,0.396242
204804,67164000,4647,0.396229
204805,67164000,4685,0.396226


In [697]:
result_1 = []
for user in user_id:
    result_1.append(result[result['subsr'] == user].sort_values(by = 'predict', ascending=False).vod_id[:25].tolist())

result_1 = pd.DataFrame(result_1)
result_1 = pd.concat([pd.DataFrame(user_id, columns = ['subsr']), result_1], axis = 1)
result_1

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,...,15,16,17,18,19,20,21,22,23,24
0,59895000,747,738,784,1886,1166,143,296,880,1156,...,336,111,818,41,554,405,1077,398,884,14
1,59900000,296,8,16,256,531,292,289,38,3028,...,1166,143,880,1156,888,1234,848,401,1347,336
2,59930000,728,456,1166,143,296,880,1156,888,1234,...,818,41,554,405,1077,398,884,14,16,720
3,59933000,143,296,8,256,531,292,36,861,48,...,297,44,1166,880,1156,888,1234,848,401,1347
4,60040000,1166,143,296,880,1156,888,1234,848,8,...,554,405,1077,398,884,14,16,720,754,726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,67107000,1166,143,296,880,1156,888,1234,848,8,...,554,405,1077,398,884,14,16,720,754,726
289,67117000,818,1166,143,296,880,1156,888,1234,848,...,554,405,1077,398,884,14,16,720,754,726
290,67140000,1166,143,296,880,1156,888,1234,848,8,...,554,405,1077,398,884,14,16,720,754,726
291,67148000,880,1166,143,296,1156,888,1234,848,8,...,554,405,1077,398,884,14,16,720,754,726


In [698]:
vod_predict = result_1.copy()
vod_predict.index = user_id
vod_predict = vod_predict.apply(lambda x : x[1:].tolist(), axis = 1)
vod_predict = vod_predict.reset_index()
vod_predict.columns = ['subsr', 'vod_id']
vod_predict

Unnamed: 0,subsr,vod_id
0,59895000,"[747, 738, 784, 1886, 1166, 143, 296, 880, 115..."
1,59900000,"[296, 8, 16, 256, 531, 292, 289, 38, 3028, 542..."
2,59930000,"[728, 456, 1166, 143, 296, 880, 1156, 888, 123..."
3,59933000,"[143, 296, 8, 256, 531, 292, 36, 861, 48, 72, ..."
4,60040000,"[1166, 143, 296, 880, 1156, 888, 1234, 848, 8,..."
...,...,...
288,67107000,"[1166, 143, 296, 880, 1156, 888, 1234, 848, 8,..."
289,67117000,"[818, 1166, 143, 296, 880, 1156, 888, 1234, 84..."
290,67140000,"[1166, 143, 296, 880, 1156, 888, 1234, 848, 8,..."
291,67148000,"[880, 1166, 143, 296, 1156, 888, 1234, 848, 8,..."


In [699]:
# 추천 VOD가 영화인 경우, 본 적이 있다면 추천안함
vod_list = pd.read_csv('../data/vod_list_add10_1220.csv', index_col=0)
TV_kids = vod_list[(vod_list['Category'] == 'TV프로그램') | (vod_list['Category'] == '키즈')].vod_id.unique().tolist()
movie = vod_list[vod_list['Category'] == '영화'].vod_id.unique().tolist()

vod_predict_1 = pd.DataFrame()
for user in user_id:
    li = []
    watched_list = vod_light_id[vod_light_id.subsr == user].vod_id.tolist()
    rec_list = vod_predict[vod_predict.subsr == user].vod_id.tolist()[0]
    for x in rec_list:
        if x not in watched_list:
            li.append(x)
        elif x in watched_list and x in TV_kids:
            li.append(x)
        elif x in watched_list and x in movie:
            continue
    
    vod_predict_1 = pd.concat([vod_predict_1, pd.DataFrame(li).T], axis= 0)

vod_predict_1.index = user_id
vod_predict_1 = vod_predict_1.apply(lambda x : x.tolist(), axis = 1)
vod_predict_1 = vod_predict_1.reset_index()
vod_predict_1.columns = ['subsr', 'vod_id']
result_vod_light = vod_predict_1.copy()

In [700]:
result_content = pd.read_csv('../data/semi_con_ensemble.csv')

In [701]:
result_content

Unnamed: 0,subsr,0,1,2,3,4,5,6,7,8,9
0,59879000,2537,2288,2865,2057,296,2092,2608,1166,538,143
1,59882000,1233,296,3026,538,143,888,2867,1877,581,240
2,59886000,3891,2537,2288,2865,2057,296,2608,1166,538,143
3,59890000,2716,2537,2288,2865,2057,296,2608,1166,538,143
4,59892000,2288,2865,2057,2608,538,143,888,2867,2633,2630
...,...,...,...,...,...,...,...,...,...,...,...
894,67149000,297,2226,610,991,538,143,726,1877,1233,581
895,67154000,476,2537,2288,2865,2057,296,2608,1166,538,143
896,67160000,2537,2288,2865,296,2608,1166,538,888,2867,2633
897,67161000,3824,2537,2288,2865,2057,296,2608,1166,538,143


In [702]:
medium_users = set(result_vod_medium.subsr.values) - set(result_vod_heavy.subsr.values)
predict_vod_medium = result_vod_medium[result_vod_medium.subsr.isin(medium_users)].reset_index(drop = True)
light_users = set(result_vod_light.subsr.values) - set(result_vod_medium.subsr.values)
predict_vod_light = result_vod_light[result_vod_light.subsr.isin(light_users)].reset_index(drop = True)

In [703]:
content_user_only = set(result_content.subsr.values) - set(result_vod_light.subsr.values)
result_content_only = result_content[result_content.subsr.isin(content_user_only)]
result_content_only.index = result_content_only.subsr
result_content_only = result_content_only.apply(lambda x : x[1:].tolist(), axis = 1).reset_index()
result_content_only.columns = ['subsr', 'vod_id']

predict_light = pd.concat([result_content_only, predict_vod_light]).reset_index(drop = True)

In [704]:
predict_all = pd.concat([result_vod_heavy, predict_vod_medium, predict_light])
predict_all

Unnamed: 0,subsr,vod_id
0,59930000,"[728, 456, 1156, 1347, 111, 888, 14, 398, 884,..."
1,60067000,"[954, 976, 992, 849, 1136, 2404, 810, 884, 134..."
2,60224000,"[410, 835, 1166, 256, 508, 1886, 784, 747, 117..."
3,60326000,"[1639, 887, 1086, 381, 2041, 1347, 778, 884, 2..."
4,60463000,"[699, 690, 1156, 1347, 111, 888, 14, 398, 884,..."
...,...,...
808,67106000,"[55.0, 1166.0, 143.0, 296.0, 880.0, 1156.0, 88..."
809,67107000,"[1166.0, 143.0, 296.0, 880.0, 1156.0, 888.0, 1..."
810,67117000,"[818.0, 1166.0, 143.0, 296.0, 880.0, 1156.0, 8..."
811,67148000,"[880.0, 1166.0, 143.0, 296.0, 1156.0, 888.0, 1..."


In [705]:
testdata = pd.read_csv('../data/watched_vod_10.csv', index_col=0)
testdata = testdata.groupby('subsr')['vod_id'].unique().reset_index()
testdata

Unnamed: 0,subsr,vod_id
0,59900000,"[1278, 1885]"
1,59930000,[2098]
2,59933000,[296]
3,60050000,"[2610, 2291, 213, 4880, 2869, 2415, 200, 2546]"
4,60067000,"[1785, 887, 1347, 4773, 4782, 4759, 992, 3806,..."
...,...,...
215,67133000,[296]
216,67164000,"[4956, 1434, 4995, 2058]"
217,67170000,"[3464, 948, 3900, 675]"
218,67202000,[1028]


In [706]:
# precision@k
def precision_k(testdata, recommended_data, K = 10):
    merge_df = pd.merge(testdata, recommended_data, on='subsr', how = 'left', suffixes=('_actual', '_rec'))
    merge_df = merge_df.dropna()
    precision_value = 0
    for i in range(merge_df.shape[0]):
        temp = len(set(merge_df.iloc[i,1]).intersection(set(merge_df.iloc[i, 2][:K]))) / K
        precision_value += temp
        
    return precision_value / merge_df.shape[0]

In [707]:
predict_all = pd.concat([result_vod_heavy, predict_vod_medium, predict_light])
predict_all

Unnamed: 0,subsr,vod_id
0,59930000,"[728, 456, 1156, 1347, 111, 888, 14, 398, 884,..."
1,60067000,"[954, 976, 992, 849, 1136, 2404, 810, 884, 134..."
2,60224000,"[410, 835, 1166, 256, 508, 1886, 784, 747, 117..."
3,60326000,"[1639, 887, 1086, 381, 2041, 1347, 778, 884, 2..."
4,60463000,"[699, 690, 1156, 1347, 111, 888, 14, 398, 884,..."
...,...,...
808,67106000,"[55.0, 1166.0, 143.0, 296.0, 880.0, 1156.0, 88..."
809,67107000,"[1166.0, 143.0, 296.0, 880.0, 1156.0, 888.0, 1..."
810,67117000,"[818.0, 1166.0, 143.0, 296.0, 880.0, 1156.0, 8..."
811,67148000,"[880.0, 1166.0, 143.0, 296.0, 1156.0, 888.0, 1..."


In [708]:
print(precision_k(testdata, result_vod_heavy, K = 10))
print(precision_k(testdata, predict_vod_medium, K = 10))
print(precision_k(testdata, predict_light, K = 10))
print(precision_k(testdata, predict_all, K = 10))
print()
print(result_vod_heavy.shape)
print(predict_vod_medium.shape)
print(predict_light.shape)

0.10408163265306124
0.05161290322580645
0.033057851239669436
0.05323383084577109

(59, 2)
(44, 2)
(813, 2)
