### LightFM을 사용하기 위한 전처리

In [6]:
import pandas as pd
import warnings
warnings.filterwarnings(action = 'ignore')
from matplotlib import font_manager, rc
import platform

# Choose the matplotlib font family per operating system (presumably so that
# Korean text renders in figures). Platforms other than Windows and macOS
# match neither branch and keep matplotlib's current font settings.
if platform.system() == 'Windows':
    # Read the family name out of the font file instead of hard-coding it.
    font_name = font_manager.FontProperties(
        fname='c:/Windows/Fonts/malgun.ttf'
    ).get_name()
    rc('font', family=font_name)
elif platform.system() == 'Darwin':
    rc('font', family='AppleGothic')

In [7]:
# Load '../data/vod89.csv' into `vod`; the bare name on the last line makes the
# notebook render the frame as the cell output.
vod = pd.read_csv(filepath_or_buffer='../data/vod89.csv')
vod

Unnamed: 0,userid,program,score,main_cat,sub_cat
0,59879000,소방서 옆 경찰서,0.244476,TV드라마,기타
1,59879000,신성한 이혼,0.738281,TV드라마,기타
2,59895000,금이야 옥이야,1.000000,TV드라마,기타
3,59900000,초대: 스와핑 데이,0.292893,영화,멜로
4,59900000,후궁 제왕의첩,0.292893,영화,멜로
...,...,...,...,...,...
1495,67140000,잠자는 숲속의 공주,0.292893,키즈,기타
1496,67140000,밀수,0.250000,영화,액션/어드벤쳐
1497,67140000,경남 통영 2부,0.292893,우리동네,연예/오락
1498,67148000,타요의 씽씽극장 동요2,0.988951,키즈,기타


In [9]:
import numpy as np
def convert_score_to_categorical(score):
    """Map a numeric score onto an integer rating from 1 to 5.

    Buckets are closed on the right: ``score <= 0.2`` -> 1, ``<= 0.4`` -> 2,
    ``<= 0.6`` -> 3, ``<= 0.8`` -> 4, and anything larger -> 5.

    Args:
        score: A real number to bucket.

    Returns:
        int: The rating bucket, 1 through 5.

    Raises:
        ValueError: If ``score`` is NaN. Every ``<=`` comparison against NaN
            is false, so without this guard a missing score would fall
            through to the last branch and silently be labelled 5.
    """
    # NaN is the only value that is not equal to itself; checking it this way
    # needs no extra import.
    if score != score:
        raise ValueError("score is NaN; cannot assign a rating bucket")

    # (inclusive upper bound, rating) pairs in ascending order.
    for upper_bound, rating in ((0.2, 1), (0.4, 2), (0.6, 3), (0.8, 4)):
        if score <= upper_bound:
            return rating
    return 5

# Bucket every value of the 'score' column and store the result as a new
# 'categorical_score' column on `vod`.
vod['categorical_score'] = vod['score'].map(convert_score_to_categorical)

# Preview the first rows with the new column.
vod.head()

Unnamed: 0,userid,program,score,main_cat,sub_cat,program_encoded,categorical_score
0,59879000,소방서 옆 경찰서,0.244476,TV드라마,기타,390,2
1,59879000,신성한 이혼,0.738281,TV드라마,기타,436,4
2,59895000,금이야 옥이야,1.0,TV드라마,기타,92,5
3,59900000,초대: 스와핑 데이,0.292893,영화,멜로,670,2
4,59900000,후궁 제왕의첩,0.292893,영화,멜로,807,2


In [10]:
from sklearn.preprocessing import LabelEncoder

# Create the encoder that will turn program titles into integer labels.
label_encoder = LabelEncoder()

# Fit the encoder on the 'program' column and store the resulting integer
# codes in a new 'program_encoded' column (this adds a column to `vod` in place).
vod['program_encoded'] = label_encoder.fit_transform(vod['program'])

# Preview the first rows, now including the encoded program labels.
vod.head()

Unnamed: 0,userid,program,score,main_cat,sub_cat,program_encoded,categorical_score
0,59879000,소방서 옆 경찰서,0.244476,TV드라마,기타,390,2
1,59879000,신성한 이혼,0.738281,TV드라마,기타,436,4
2,59895000,금이야 옥이야,1.0,TV드라마,기타,92,5
3,59900000,초대: 스와핑 데이,0.292893,영화,멜로,670,2
4,59900000,후궁 제왕의첩,0.292893,영화,멜로,807,2


In [11]:
# Interaction table with the columns userid / programid / rating.
vod_final = vod[['userid', 'program_encoded', 'categorical_score']].rename(
    columns={'program_encoded': 'programid', 'categorical_score': 'rating'}
)

# Program lookup table: duplicate rows are dropped, then the title and id
# columns are renamed to program_nm / programid.
program_info = (
    vod[['program', 'program_encoded', 'main_cat', 'sub_cat']]
    .drop_duplicates()
    .rename(columns={'program': 'program_nm', 'program_encoded': 'programid'})
)

In [12]:
# Write both tables to ../data without the row index, so the files contain
# only the named columns.
vod_final.to_csv(path_or_buf='../data/vod_ratings.csv', index=False)
program_info.to_csv(path_or_buf='../data/program_info.csv', index=False)

### LightFM 모델 적용

#### prepare data

In [None]:
# !pip install lightfm

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

import lightfm
from lightfm import LightFM, cross_validation
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

In [2]:
# Load '../data/vod_ratings.csv' into `vod_ratings` and render it as the cell
# output.
vod_ratings = pd.read_csv(filepath_or_buffer='../data/vod_ratings.csv')
vod_ratings

Unnamed: 0,userid,programid,rating
0,59879000,390,2
1,59879000,436,4
2,59895000,92,5
3,59900000,670,2
4,59900000,807,2
...,...,...,...
1495,67140000,593,2
1496,67140000,281,2
1497,67140000,44,2
1498,67148000,707,5


In [3]:
# Register every user id and program id with a LightFM Dataset so they can be
# mapped to matrix indices.
dataset = Dataset()
dataset.fit(vod_ratings['userid'], vod_ratings['programid'])

# Report the size of the interaction matrix (users x programs).
num_users, num_vods = dataset.interactions_shape()
print('유저 수 :', num_users)
print('프로그램 수 :', num_vods)

유저 수 : 332
프로그램 수 : 817


In [4]:
# Pass every row of vod_ratings (columns: userid, programid, rating) to LightFM,
# which returns the interaction matrix and the matching weight matrix.
interactions, weights = dataset.build_interactions(vod_ratings.to_numpy())

In [5]:
# Split the interaction matrix and then the weight matrix 80/20, using the same
# seed for both calls (presumably so the two splits select matching entries —
# confirm against the LightFM docs).
(train_interactions, test_interactions), (train_weights, test_weights) = (
    cross_validation.random_train_test_split(
        matrix, test_percentage=0.2, random_state=42
    )
    for matrix in (interactions, weights)
)

In [6]:
# Show the shape of the train matrix and then the test matrix, one per line.
print(train_interactions.shape, test_interactions.shape, sep='\n')

(332, 817)
(332, 817)


#### model fitting

In [7]:
# Available options:
#   loss              : warp, bpr, logistic, warp-kos
#   learning_schedule : adagrad, adadelta
model1 = LightFM(
    no_components=20,
    loss='bpr',
    learning_rate=0.1,
    random_state=42,
)

# Train on the training split for 20 epochs, weighting each interaction by the
# matching entry of train_weights; verbose turns on the epoch progress bar.
model1.fit(
    train_interactions,
    sample_weight=train_weights,
    epochs=20,
    verbose=True,
)

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

: 