In [128]:
!pip install -q --upgrade tensorflow-recommenders tensorflow-datasets

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

user-item matrix 생성

row : user  
column : movie

rate(시청한 것) : 1, not rate(시청하지 않은 것) : 0

유저가 평점 매긴 것은 본 것으로 간주 -> 잘 안 나옴 / 일정 이상 보지 않은 것은 보지 않은 것으로 간주

In [2]:
ratings = pd.read_csv('ratings.csv', encoding='utf-8')
vod_list = pd.read_csv('vod_89_add_id.csv', encoding='utf-8')

In [3]:
def create_user_item_matrix(ratings) -> pd.DataFrame:
    mat = ratings.pivot(index='subsr', columns='vod_id', values='rating')
    def transform_value(x):
        # 유저가 20% 이하로 시청한 vod는 보지 않은 것, 20% 초과로 본 vod는 본 것으로 간주
        if pd.isna(x) or x <= 0.5:
            return float(0)
        else:
            return float(1)
    # 유저가 평점 매기지 않은 것과 평점 2점 이하는 보지 않은 것으로 간주 - 0
    mat = mat.applymap(transform_value)
    return mat

user_item_mat = create_user_item_matrix(ratings)

In [4]:
user_item_mat.info()  # 희소행렬

<class 'pandas.core.frame.DataFrame'>
Index: 416 entries, 59879000 to 67164000
Columns: 4155 entries, 0 to 4154
dtypes: float64(4155)
memory usage: 13.2 MB


SVD(singular vector decomposition)

In [5]:
def get_svd_prediction(user_item_matrix, k):
    # U, sigma, V 추출
    u, s, vh = scipy.sparse.linalg.svds(user_item_matrix.to_numpy(), k=k)
    # u,s,vh 곱해서 원본 행렬 복원
    preds = np.dot(np.dot(u, np.diag(s)), vh)
    
    # 결과를 DataFrame으로 변환, 정규화
    preds = pd.DataFrame(preds, columns=user_item_matrix.columns, index=user_item_matrix.index)
    preds = (preds - preds.min()) / (preds.max() - preds.min())
    return preds

predictions = get_svd_prediction(user_item_mat, k=64)
predictions

vod_id,0,1,2,3,4,5,6,7,8,9,...,4145,4146,4147,4148,4149,4150,4151,4152,4153,4154
subsr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59879000,0.000001,0.385902,0.963577,0.019149,0.074301,0.395060,0.060972,0.000001,0.001544,0.878132,...,0.001145,0.011129,0.014366,0.665190,0.014366,0.376569,,,,
59895000,0.000001,0.386008,0.963594,0.019119,0.074202,0.395196,0.060939,0.000001,0.001544,0.877813,...,0.001127,0.011098,0.014313,0.665370,0.014313,0.375039,,,,
59900000,0.000001,0.380816,0.963517,0.012862,0.059425,0.372166,0.042348,0.000001,0.001544,0.899353,...,0.002317,0.012291,0.010147,0.429556,0.010147,0.301235,,,,
59921000,0.000001,0.385902,0.963577,0.019149,0.074301,0.395060,0.060972,0.000001,0.001544,0.878132,...,0.001145,0.011129,0.014366,0.665190,0.014366,0.376569,,,,
59930000,0.000001,0.315847,0.963524,0.018785,1.000000,0.427832,0.062099,0.000001,0.001555,0.884174,...,0.001186,0.011693,0.013479,0.638636,0.013479,0.328436,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67140000,0.000001,0.385951,0.963562,0.019149,0.074304,0.387952,0.060971,0.000001,0.158096,0.877261,...,0.001145,0.011129,0.014366,0.665189,0.014366,0.376573,,,,
67148000,0.000001,0.385902,0.382363,0.019149,0.074301,0.394405,0.060972,0.000001,0.001544,0.877781,...,0.001145,0.011129,0.014366,0.665190,0.014366,0.376569,,,,
67154000,0.000001,0.385902,0.963577,0.019149,0.074301,0.395060,0.060972,0.000001,0.001544,0.878132,...,0.001145,0.011129,0.014366,0.665190,0.014366,0.376569,,,,
67161000,0.000001,0.385902,0.963577,0.019149,0.074301,0.395060,0.060972,0.000001,0.001544,0.878132,...,0.001145,0.011129,0.014366,0.665190,0.014366,0.376569,,,,


추천 확인

In [6]:
# 많이 이용한 순으로 유저 목록
ratings.groupby(['subsr']).count().sort_values(by='asset_nm', ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 0,asset_nm,rating,stars,vod_id
subsr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
66875000,442,442,442,442,442
60326000,358,358,358,358,358
64154000,245,245,245,245,245
64385000,220,220,220,220,220
66705000,210,210,210,210,210
60593000,150,150,150,150,150
64130000,150,150,150,150,150
60067000,145,145,145,145,145
64984000,123,123,123,123,123
65557000,105,105,105,105,105


In [7]:
# 임의의 유저 시청 목록
subsr = 61683000
subsr_vod_ids = ratings[ratings.subsr == subsr].vod_id  # vod_id 목록
subsr_vod_ids
subsr_vods = vod_list[vod_list.vod_id.isin(subsr_vod_ids)]
print(subsr, '유저가 본 vod 목록')
print(len(subsr_vods), subsr_vods)
print()
# 복원된 행렬에서 유저 row 가져온 후 내림차순으로 정렬
subsr_predictions = predictions.loc[subsr].sort_values(ascending=False)
# 유저가 이미 본 영화는 제외
subsr_predictions = subsr_predictions[~subsr_predictions.index.isin(subsr_vod_ids)]
# 상위 10개 vod 추출
subsr_predictions = subsr_predictions.head(10)
# 10개 vod 정보 가져옴
subsr_recommendations = vod_list[vod_list.vod_id.isin(subsr_predictions.index)]
subsr_recommendations["recommendation_score"] = subsr_predictions.values
print("유저가 보지 않았고 추천할 영화들")
print(subsr_recommendations)
print(subsr_predictions)

61683000 유저가 본 vod 목록
82       Unnamed: 0  vod_id                       asset_nm     ct_cl  \
23            23      23         놀면 뭐하니? 198회(23/07/29)  TV 연예/오락   
70            70      70         (HD)런닝맨 665회(23/07/30)  TV 연예/오락   
248          248     248    오은영 리포트 결혼 지옥 49회(23/07/31)  TV 시사/교양   
279          279     279    (HD)그것이알고싶다 1354회(23/06/03)  TV 시사/교양   
356          356     356      (HD)나 혼자산다 505회(23/07/28)  TV 연예/오락   
...          ...     ...                            ...       ...   
3733        3733    3733             홍김동전 54회(23/09/21)  TV 연예/오락   
3767        3767    3767     (HD)궁금한이야기Y 655회(23/09/22)  TV 연예/오락   
3940        3940    3940    (HD)그것이알고싶다 1370회(23/09/23)  TV 시사/교양   
3953        3953    3953  동상이몽 2-너는 내 운명 310회(23/09/25)  TV 연예/오락   
3961        3961    3961     (HD)TV동물농장 1138회(23/09/17)  TV 연예/오락   

     genre_of_ct_cl                                               SMRY  \
23               기타  청순함(?) 흩날리는 보디빌더학과 이경. 한국어가 주전공 영문학과 미주. 식품영양학...  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subsr_recommendations["recommendation_score"] = subsr_predictions.values


In [10]:
print(subsr, '유저가 본 vod 목록')
print(len(subsr_vods))
subsr_vods

61683000 유저가 본 vod 목록
82


Unnamed: 0.1,Unnamed: 0,vod_id,asset_nm,ct_cl,genre_of_ct_cl,SMRY,ACTR_DISP,genres
23,23,23,놀면 뭐하니? 198회(23/07/29),TV 연예/오락,기타,청순함(?) 흩날리는 보디빌더학과 이경. 한국어가 주전공 영문학과 미주. 식품영양학...,"유재석,하하,박진주,이이경,미주,주우재",TV연예오락 기타
70,70,70,(HD)런닝맨 665회(23/07/30),TV 연예/오락,기타,여름방학 편 - 런닝맨이 또 떴다. 4주간 진행되는 런떴 여름방학 프로젝트! 패밀리...,"유재석,지석진,김종국,하하,송지효,양세찬,전소민",TV연예오락 기타
248,248,248,오은영 리포트 결혼 지옥 49회(23/07/31),TV 시사/교양,기타,본 회차는 방송사의 사정으로 줄거리를 제공하지 않습니다.,"오은영,소유진,하하,김응수,박지민",TV시사교양 기타
279,279,279,(HD)그것이알고싶다 1354회(23/06/03),TV 시사/교양,기타,두 소녀의 마지막 약속 - 대구 여중생 실종 사건. 친구들의 기억과 증언을 바탕으로...,김상중,TV시사교양 기타
356,356,356,(HD)나 혼자산다 505회(23/07/28),TV 연예/오락,기타,네가 알던 내가 아냐 편. 무지개 회원들의 반전 가득한 하루! 쇼 음악중심 스페셜 ...,"전현무,기안84,박나래,키,코드쿤스트",TV연예오락 기타
...,...,...,...,...,...,...,...,...
3733,3733,3733,홍김동전 54회(23/09/21),TV 연예/오락,기타,불운의 아이콘 홍김동전에도 해가 떴다! 오늘 날씨 맑음 홍콩 촬영 제2탄! 태풍이 ...,"홍진경,김숙,조세호,주우재,장우영",TV연예오락 기타
3767,3767,3767,(HD)궁금한이야기Y 655회(23/09/22),TV 연예/오락,기타,"1. K 방송사 마지막 개그맨, 최 OO입니다. 강아지 학대범 최 씨는 왜 개그맨을...","김석훈,정미선",TV연예오락 기타
3940,3940,3940,(HD)그것이알고싶다 1370회(23/09/23),TV 시사/교양,기타,쌀 포대와 돌멩이 - 범인은 무엇을 감추려 했나? 세부에서 변사체로 발견된 서범석 ...,김상중,TV시사교양 기타
3953,3953,3953,동상이몽 2-너는 내 운명 310회(23/09/25),TV 연예/오락,기타,레이디제인♥임현태. 레이디제인이 6개월간 직접 구상한 유럽 휴양지st 인테리어! 결...,"김구라,서장훈,김숙",TV연예오락 기타


In [8]:
subsr_recommendations

Unnamed: 0.1,Unnamed: 0,vod_id,asset_nm,ct_cl,genre_of_ct_cl,SMRY,ACTR_DISP,genres,recommendation_score
2,2,2,(HD)복면가왕 415회(23/07/30),TV 연예/오락,기타,여름 특집! 무더위를 날려버릴 시원한 가창력의 복면가수 대거 등장! 녹화 도중 사상...,"김성주,김구라",TV연예오락 기타,0.978586
62,62,62,러브 이세벨,영화,멜로,"마약, 섹X, 일탈 등 방황을 일삼던 4명의 고등학생. 어김없이 마약에 취했던 밤,...","요한나 줄리엣,가브리엘 아궤로,마리아 콘치타 알론소",영화 멜로,0.969848
76,76,76,(더빙)극장판 명탐정코난 : 할로윈의 신부,영화,애니메이션,"트리플 페이스 아무로 토오루, 그의 목에 폭탄이 채워졌다?! 극악무도한 폭파범 플라...","타카야마 미나미,야마자키 와카나,야마구치 캇페이,김선혜,강수진",영화 애니메이션,0.966706
687,687,687,극한직업(2023) 31회,다큐,인물,23.07.29 방영. 울릉도 바다에 산다 대형 여객선과 해양 경찰. 울릉크루즈 씨...,-,다큐 인물,0.963955
1135,1135,1135,더웹툰-예고살인,영화,공포/스릴러,대한민국 최초 웹툰을 소재로 한 감각적공포의 탄생. 호러퀸 이시영 그리고 엄기준의 ...,"이시영,엄기준,현우,문가영",영화 공포스릴러,0.938206
1239,1239,1239,모가디슈,영화,액션/어드벤쳐,"대한민국이 UN 가입을 위해 애쓰던 시기, 1991년 소말리아의 수도 모가디슈에서는...","김윤석,조인성,허준호,구교환,김소진,정만식,김재화,박경혜,박명신,한철우,주보비,안세...",영화 액션어드벤쳐,0.911917
1244,1244,1244,(더빙)달의 요정 세일러 문 Super S 01회,TV애니메이션,학원/순정/연애,"어느 날, 꿈속에서 신비한 페가수스를 만난 꼬마 세라. 아름다운 페가수스는 꼬마 세...","세라,대니,턱시도 가면",TV애니메이션 학원순정연애,0.907186
1530,1530,1530,타요의 씽씽극장 시즌2 03회,키즈,애니메이션,"즐거운 여행. 오늘따라 모든 일이 잘 풀리지 않는 로기. 풀이 죽어 있는데, 캠핑카...",타요,키즈 애니메이션,0.902742
3295,3295,3295,(더빙)파뿌리TV Part1 01회,TV애니메이션,기타,24시간 배틀 시리즈: 도시 vs 시골. 24시간 동안 도시 vs 시골! 시골에 가...,겜브링,TV애니메이션 기타,0.887784
3564,3564,3564,귀염뽕짝 원정대 10회(23/08/21),TV 연예/오락,기타,뒷정리 배 [무궁화꽃이 피었습니다] 게임에 나선 뽕짝꿍들! 이 정도는 껌easy! ...,"이수근,수빈,진성,노사연,강진,황민호,서지유,조승원,오지율",TV연예오락 기타,0.887322


In [9]:
subsr_predictions

vod_id
1244    0.978586
62      0.969848
1530    0.966706
2       0.963955
687     0.938206
3295    0.911917
1239    0.907186
1135    0.902742
76      0.887784
3564    0.887322
Name: 61683000, dtype: float64