In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [2]:
path = './kmrd/kmr_dataset/datafile/kmrd-small/'
rating_df = pd.read_csv(path + 'rates.csv')
rating_df.columns = ['userId','movieId','rating','time']
rating_df.head()

Unnamed: 0,userId,movieId,rating,time
0,0,10003,7,1494128040
1,0,10004,7,1467529800
2,0,10018,9,1513344120
3,0,10021,9,1424497980
4,0,10022,7,1427627340


In [3]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140710 entries, 0 to 140709
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   userId   140710 non-null  int64
 1   movieId  140710 non-null  int64
 2   rating   140710 non-null  int64
 3   time     140710 non-null  int64
dtypes: int64(4)
memory usage: 4.3 MB


In [4]:
movies_df = pd.read_csv(path+'movie_info.csv')
movies_df = movies_df.rename(columns={'movie':'movieId'})
movies_df.head()

Unnamed: 0,movieId,title,title_eng,year,grade,genres,people
0,10001,시네마 천국,"Cinema Paradiso , 1988",2013.0,전체 관람가,드라마/멜로/로맨스,[ 4374 178 3241 47952 47953 19538 18991 479...
1,10002,빽 투 더 퓨쳐,"Back To The Future , 1985",2015.0,12세 관람가,SF/코미디,[1076 4603 917 8637 5104 9986 7470 9987]
2,10003,빽 투 더 퓨쳐 2,"Back To The Future Part 2 , 1989",2015.0,12세 관람가,SF/코미디,[1076 4603 917 5104 391 5106 5105 5107 580 ...
3,10004,빽 투 더 퓨쳐 3,"Back To The Future Part III , 1990",1990.0,전체 관람가,서부/SF/판타지/코미디,[ 1076 4603 1031 5104 10001 5984 10002 100...
4,10005,스타워즈 에피소드 4 - 새로운 희망,"Star Wars , 1977",1997.0,PG,판타지/모험/SF/액션,[1007 535 215 1236 35]


In [5]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    999 non-null    int64  
 1   title      992 non-null    object 
 2   title_eng  991 non-null    object 
 3   year       609 non-null    float64
 4   grade      957 non-null    object 
 5   genres     964 non-null    object 
 6   people     988 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 54.8+ KB


In [6]:
# 장르 전처리 (슬래시('/')를 기준으로 장르 분리)
movies_df['genres'] = movies_df['genres'].fillna('')  # 결측치 처리
movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('/'))

# MultiLabelBinarizer를 사용해 장르를 one-hot encoding
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(movies_df['genres'])
genre_labels = mlb.classes_

# 영화 데이터프레임에 장르 원-핫 인코딩된 열 추가
genre_df = pd.DataFrame(genre_encoded, columns=genre_labels)
movies_df = pd.concat([movies_df, genre_df], axis=1)

In [7]:
# rating_df와 movies_df 결합
data = pd.merge(rating_df, movies_df, on="movieId", how="inner")

# 필요 없는 열 제거 (예: 'title', 'title_eng', 'year', 'grade', 'people' 등)
data = data.drop(columns=['title', 'title_eng', 'year', 'grade', 'people', 'genres'])

In [8]:
data.head()

Unnamed: 0,userId,movieId,rating,time,Unnamed: 5,SF,가족,공포,느와르,다큐멘터리,...,범죄,서부,서사,스릴러,애니메이션,액션,에로,전쟁,코미디,판타지
0,0,10003,7,1494128040,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,10004,7,1467529800,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1
2,0,10018,9,1513344120,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,10021,9,1424497980,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,10022,7,1427627340,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [9]:
train_data = pd.DataFrame()
remaining_data = data.copy()

# 각 사용자와 영화가 최소 한 번 이상 등장하도록 데이터 선택
for user_id in data['userId'].unique():
    user_samples = remaining_data[remaining_data['userId'] == user_id]
    if len(user_samples) > 0:
        sample = user_samples.sample(1)
        train_data = pd.concat([train_data, sample])
        remaining_data = remaining_data.drop(sample.index)

for movie_id in data['movieId'].unique():
    movie_samples = remaining_data[remaining_data['movieId'] == movie_id]
    if len(movie_samples) > 0:
        sample = movie_samples.sample(1)
        train_data = pd.concat([train_data, sample])
        remaining_data = remaining_data.drop(sample.index)

# 추가로 train_data와 val_data 분리
additional_train_data, val_data = train_test_split(remaining_data, test_size=0.2, random_state=42)
train_data = pd.concat([train_data, additional_train_data]).reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

In [10]:
# userId와 movieId를 0부터 시작하는 연속된 인덱스로 변환
user_id_map = {id: idx for idx, id in enumerate(data['userId'].unique())}
movie_id_map = {id: idx for idx, id in enumerate(data['movieId'].unique())}

# 데이터를 매핑된 값으로 변환
data['userId'] = data['userId'].map(user_id_map)
data['movieId'] = data['movieId'].map(movie_id_map)

# train_data와 val_data에도 동일하게 매핑 적용
train_data['userId'] = train_data['userId'].map(user_id_map)
train_data['movieId'] = train_data['movieId'].map(movie_id_map)
val_data['userId'] = val_data['userId'].map(user_id_map)
val_data['movieId'] = val_data['movieId'].map(movie_id_map)
movies_df['movieId'] = movies_df['movieId'].map(movie_id_map)
# 매핑 후 결과 확인
print(f"Unique users in data: {len(user_id_map)}, Unique movies in data: {len(movie_id_map)}")

Unique users in data: 52028, Unique movies in data: 600


In [11]:
print(train_data.shape, val_data.shape)

(123092, 26) (17618, 26)


In [12]:
train_data.head()

Unnamed: 0,userId,movieId,rating,time,Unnamed: 5,SF,가족,공포,느와르,다큐멘터리,...,범죄,서부,서사,스릴러,애니메이션,액션,에로,전쟁,코미디,판타지
0,0,35,8,1451450460,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,29,9,1408888020,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,85,10,1323502620,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,123,6,1396264380,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4,23,4,1444441200,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Define Dataset (DataLoader)

In [13]:
class MovieKorDataset(Dataset):
    def __init__(self, data, num_users, num_movies, num_genres):
        self.num_users = num_users
        self.num_movies = num_movies
        self.num_genres = num_genres
        self.features = []
        self.targets = []

        for row in data.itertuples():
            user_id = row.userId
            movie_id = row.movieId
            rating = row.rating

            # 사용자, 영화, 장르를 포함한 feature vector 생성
            feature = torch.zeros(num_users + num_movies + num_genres)
            feature[user_id] = 1
            feature[num_users + movie_id] = 1

            # 장르 인덱스에 원-핫 인코딩 설정
            genre_indices = [idx for idx, genre in enumerate(genre_labels) if row._asdict().get(genre) == 1]
            for idx in genre_indices:
                feature[num_users + num_movies + idx] = 1

            self.features.append(feature)
            self.targets.append(rating)
       
    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx]), torch.tensor(self.targets[idx], dtype=torch.float32)


In [14]:
class FactorizationMachine(nn.Module):
    def __init__(self, num_features, latent_dim):
        super(FactorizationMachine, self).__init__()
        self.w_0 = nn.Parameter(torch.zeros(1)) # bias term
        self.w = nn.Parameter(torch.zeros(num_features)) # linear weights
        self.v = nn.Parameter(torch.randn(num_features, latent_dim)) # interaction weights
    
    def forward(self, x):
        linear_part = self.w_0 + torch.matmul(x, self.w)
        interaction_part1 = torch.pow(torch.matmul(x, self.v), 2).sum(1, keepdim=True)
        interaction_part2 = torch.matmul(torch.pow(x,2), torch.pow(self.v, 2)).sum(1, keepdim=True)
        interaction_part = 0.5 * (interaction_part1 - interaction_part2).squeeze()
        
        # Sigmoid로 [0, 1] 사이로 압축한 후, 10을 곱하여 [0, 10] 범위로 변환
        output = torch.sigmoid(linear_part + interaction_part) * 10
        return output

In [15]:
def train_model(model, data_loader, criterion, optimizer, num_epochs = 10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for features, targets in data_loader:
            features, targets = features.float(), targets.float()
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss : {total_loss / len(data_loader)}')

def evaluate_model(model, criterion, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for features, targets in data_loader:
            features, targets = features.float(), targets.float()
            outputs = model(features)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
    print(f'Validation Loss: {total_loss / len(data_loader)}')

In [16]:
num_users = data['userId'].nunique()
num_movies = data['movieId'].nunique()
num_genres = len(genre_labels)

train_dataset = MovieKorDataset(train_data, num_users, num_movies, num_genres)
val_dataset = MovieKorDataset(val_data, num_users, num_movies, num_genres)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

model = FactorizationMachine(num_users + num_movies + num_genres, latent_dim=16)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

train_model(model, train_loader, criterion, optimizer, num_epochs=10)
evaluate_model(model, criterion, val_loader)

  return torch.tensor(self.features[idx]), torch.tensor(self.targets[idx], dtype=torch.float32)


Epoch 1/10, Loss : 5.700104351078717
Epoch 2/10, Loss : 5.4518586108018
Epoch 3/10, Loss : 5.442873482812464
Epoch 4/10, Loss : 5.436560073002209
Epoch 5/10, Loss : 5.432463251129819
Epoch 6/10, Loss : 5.427860880364047
Epoch 7/10, Loss : 5.4260963093685985
Epoch 8/10, Loss : 5.424204343324176
Epoch 9/10, Loss : 5.42225861736465
Epoch 10/10, Loss : 5.419615753862667
Validation Loss: 6.086238675022298


In [17]:
def add_predictions_to_val_data(val_data, val_dataset, model):
    model.eval()
    predictions = []
    with torch.no_grad():
        for features, _ in DataLoader(val_dataset, batch_size=64, shuffle=False):
            features = features.float()
            preds = model(features)
            predictions.extend(preds.numpy())

    # 예측 결과를 val_data에 추가
    val_data['predicted_rating'] = predictions
    return val_data

# 모델을 사용하여 예측 결과를 추가
val_data_with_predictions = add_predictions_to_val_data(val_data, val_dataset, model)

# 예측 결과 포함된 데이터 예시 출력
val_data_with_predictions.head()

  return torch.tensor(self.features[idx]), torch.tensor(self.targets[idx], dtype=torch.float32)


Unnamed: 0,userId,movieId,rating,time,Unnamed: 5,SF,가족,공포,느와르,다큐멘터리,...,서부,서사,스릴러,애니메이션,액션,에로,전쟁,코미디,판타지,predicted_rating
0,11775,270,8,1378031520,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9.99999
1,4018,387,9,1187377080,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,10.0
2,10948,59,8,1352974740,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,9.999998
3,3023,2,10,1214055240,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,10.0
4,6661,94,9,1450512840,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,10.0


In [18]:
val_data_with_predictions.tail()

Unnamed: 0,userId,movieId,rating,time,Unnamed: 5,SF,가족,공포,느와르,다큐멘터리,...,서부,서사,스릴러,애니메이션,액션,에로,전쟁,코미디,판타지,predicted_rating
17613,11153,99,10,1166824560,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,10.0
17614,22479,352,10,1109652780,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,10.0
17615,5123,64,10,1331807460,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10.0
17616,3252,349,10,1191216960,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,10.0
17617,12501,403,10,1464661320,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9.99487


In [19]:
val_data_with_predictions['predicted_rating'].describe()

count    17618.000000
mean         9.986655
std          0.289748
min          0.000735
25%         10.000000
50%         10.000000
75%         10.000000
max         10.000000
Name: predicted_rating, dtype: float64

In [20]:
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
y_val = val_data_with_predictions['rating']
y_pred = val_data_with_predictions['predicted_rating']
print(mean_absolute_error(y_val,y_pred))
print(mean_squared_log_error(y_val,y_pred))

1.208195113536529
0.15625439061669488


In [21]:
def get_user_top_moives(user_id, train_data, movies_df):
    user_train_data = train_data[train_data['userId']==user_id]
    user_train_data = user_train_data.sort_values(by='rating',ascending = False)

    watched_movies = pd.merge(user_train_data, movies_df[['movieId','title','genres']], on='movieId',how='left')
    watched_movies = watched_movies[['title','genres','rating']]

    print(f"user {user_id} 가 시청한 영화 (평점 순 10개):::")
    print(watched_movies)

    all_movies = set(train_data['movieId'])  
    watched_movie_ids = set(train_data[train_data['userId'] == user_id]['movieId'])
    unwatched_movie_ids = all_movies - watched_movie_ids
    print(len(unwatched_movie_ids))

    predictions = []
    for movie_id in unwatched_movie_ids:
        feature = torch.zeros(num_users + num_movies + num_genres)
        feature[user_id] = 1
        feature[num_users + movie_id] = 1

        movie_genres = movies_df[movies_df['movieId'] == movie_id]['genres'].iloc[0]
        genre_indices = [idx for idx, genre in enumerate(genre_labels) if genre in movie_genres]
        for idx in genre_indices:
            feature[num_users + num_movies + idx] = 1

        with torch.no_grad():
            pred = model(feature.float().unsqueeze(0)).item()
        predictions.append((movie_id,pred))

    top_unwatched_movies = sorted(predictions, key = lambda x:x[1], reverse=True)[:10]
    top_unwatched_movies = pd.DataFrame(top_unwatched_movies, columns=['movieId', 'predicted_rating'])

    top_unwatched_movies = pd.merge(top_unwatched_movies, movies_df[['movieId', 'title', 'genres']], on='movieId', how='left')
    top_unwatched_movies = top_unwatched_movies[['title', 'genres', 'predicted_rating']]

    print(f"\n유저 {user_id}가 시청하지 않은 영화 (예측 평점 높은 순 10개):")
    print(top_unwatched_movies)

In [22]:
train_data[train_data['userId']==user_id]

Unnamed: 0,userId,movieId,rating,time,Unnamed: 5,SF,가족,공포,느와르,다큐멘터리,...,범죄,서부,서사,스릴러,애니메이션,액션,에로,전쟁,코미디,판타지
52027,52027,215,10,1227036960,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [23]:
user_id = 2
get_user_top_moives(user_id, train_data, movies_df)

user 2 가 시청한 영화 (평점 순 10개):::
         title              genres  rating
0       시네마 천국      [드라마, 멜로, 로맨스]      10
1       늑대와 춤을       [드라마, 서부, 모험]      10
2       에이리언 2   [SF, 액션, 스릴러, 공포]      10
3       사관과 신사      [멜로, 로맨스, 드라마]      10
4          가위손      [판타지, 멜로, 로맨스]       9
5          그렘린  [판타지, 액션, 코미디, 공포]       8
6         프레데터   [SF, 액션, 스릴러, 공포]       8
7     13일의 금요일           [스릴러, 공포]       7
8       프레데터 2       [SF, 액션, 스릴러]       6
9         죠스 2       [모험, 스릴러, 공포]       6
10  아웃 오브 아프리카               [드라마]       6
11          탑건           [드라마, 액션]       5
588

유저 2가 시청하지 않은 영화 (예측 평점 높은 순 10개):
        title              genres  predicted_rating
0  빽 투 더 퓨쳐 2           [SF, 코미디]              10.0
1  빽 투 더 퓨쳐 3  [서부, SF, 판타지, 코미디]              10.0
2          이티   [판타지, SF, 모험, 가족]              10.0
3          록키           [드라마, 액션]              10.0
4        록키 2           [드라마, 액션]              10.0
5        록키 3           [드라마, 액션]              10.0
6      