# SVD 알고리즘 개요
- SVD는 잠재요인을 학습하는 행렬분해 기반 협업필터링 알고리즘
- KNN이 유사도 기반 모델이라면 SVD는 잠재요인 기반 모델
- 관측된 평점 행렬을 저차주언 잠재공간으로 분해하여 사용자와 아이템의 숨겨진 특성을 학습함
- NEtflix Prize 기간 동안 Simon Funk가 대중화함

In [18]:
from hossam import *
from pandas import DataFrame, merge

from surprise import Dataset, Reader, BaselineOnly, accuracy, KNNBasic, SVD
from surprise.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [19]:
origin = load_data('ml100k-ratings')
print(f"데이터셋 크기: {origin.shape}")
print(f"열 개수: {origin.shape[1]}")
print(f"행 개수: {origin.shape[0]}")
print(origin.info())
origin.head()

[94m943명의 사용자가 1,682편의 영화에 대해 남긴 100,000개의 평점 기록으로 구성된 명시적 평가 기반 추천 시스템 학습용 데이터셋 (출처: University of Minnesota)[0m

컬럼명     의미
---------  ---------
user_id    사용자 ID
item_id    아이템 ID
rating     평점
timestamp  평가 시각

데이터셋 크기: (100000, 4)
열 개수: 4
행 개수: 100000
<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB
None


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


#### 1. 분석대상 - 평점데이터

In [20]:
origin = load_data('ml100k-ratings')
origin.head()

[94m943명의 사용자가 1,682편의 영화에 대해 남긴 100,000개의 평점 기록으로 구성된 명시적 평가 기반 추천 시스템 학습용 데이터셋 (출처: University of Minnesota)[0m

컬럼명     의미
---------  ---------
user_id    사용자 ID
item_id    아이템 ID
rating     평점
timestamp  평가 시각



Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


#### 2. 분석결과 맵핑 데이터 - 영화 정보

In [21]:
metadata = load_data('ml100k-metadata')
metadata.head()

[94mml100k-ratings에 포함된 영화 제목, 공개시기, 장르 정보를 담고 있는 데이터 (출처: University of Minnesota)[0m


Unnamed: 0,item_id,title,release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


## 2. SVD 모델 적합
### 1. DataFrame을 Dataset 객체로 변환
#### 1. 평점의 범위 확인

In [22]:
rating_min = origin['rating'].min()
rating_max = origin['rating'].max()
print(f'Rating 범위: {rating_min} ~ {rating_max}')

Rating 범위: 1 ~ 5


#### 2. surprise 라이브러리에서 사용할 수 있도록 데이터셋을 변환

In [23]:
# 평점의 범위를 지정하여 Reader 객체 생성
reader = Reader(rating_scale=(rating_min, rating_max))

# Dataset 객체를 생성 - 사용자 식별자, 아이템 식별자, 평점만으로 구성된 데이터 구조가 필요하다.
data = Dataset.load_from_df(origin[['user_id', 'item_id', 'rating']], reader)

data

<surprise.dataset.DatasetAutoFolds at 0x2718dfd1a50>

### 2. SVD 모델의 주요 하이퍼 파라미터

In [24]:
param_grid = {
     'n_factors': [50, 100, 150, 200],
     'n_epochs': [20, 30, 50],
     'lr_all': [0.002, 0.005, 0.01],
     'reg_all': [0.01, 0.02, 0.05, 0.1],
     'biased': [True, False]
}

# GridSearchCV 객체를 생성 -> 메모리 부족 에러 발생
#gs = GridSearchCV(
#    KNNBasic,
#    param_grid,
#    measures = ['rmse', 'mae'],
#    cv = 5,
#    n_jobs = -1
#)

gs = RandomizedSearchCV(
    SVD,
    param_grid,
    measures = ['rmse','mae'],
    cv = 5,
    n_jobs = -1,
    random_state = 52  # 재현성을 위해 시드 설정
)
# GridSearchCV를 사용하여 최적의 하이퍼파라미터 조합을 찾는다,
# -> 원본 데이터 사용
gs.fit(data)

# 최적의 RMSE와 MAE 점수 및 해당 하이퍼파라미터 조합을 출력
print('Best RMSE:', gs.best_score['rmse'])
print('Best Params (RMSE):', gs.best_params['rmse'])

Best RMSE: 0.9085891809396159
Best Params (RMSE): {'n_factors': 200, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1, 'biased': True}


## 3. 성능평가
### 1. 훈련, 검증 데이터 분리

In [25]:
# 데이터를 학습용과 테스트용으로 분할 (80% 학습, 20% 테스트)
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 52)

# 학습용과 테스트용 데이터의 크기를 출력
print(f'Trainset 크기:{train_data.n_ratings}개')
print(f'Testset 크기:{len(test_data)}개')

Trainset 크기:80000개
Testset 크기:20000개


### 2. 최적 모델 재학습

In [26]:
# 최적 파라미터 추출
best_params = gs.best_params['rmse']

# 모델 생성
best_model = SVD(**best_params)

# 전체 데이터 학습
best_model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2718e318e50>

### 3. 예측값 생성

In [27]:
predictions = best_model.test(test_data)
predictions[:5]

[Prediction(uid=303, iid=679, r_ui=2.0, est=3.200874499010544, details={'was_impossible': False}),
 Prediction(uid=308, iid=163, r_ui=4.0, est=3.750041168521009, details={'was_impossible': False}),
 Prediction(uid=327, iid=663, r_ui=4.0, est=3.7857624625819444, details={'was_impossible': False}),
 Prediction(uid=912, iid=479, r_ui=4.0, est=4.144676666668484, details={'was_impossible': False}),
 Prediction(uid=224, iid=329, r_ui=3.0, est=2.600678541071444, details={'was_impossible': False})]

### 4. 성능평가 지표 생성

In [28]:
cv_rmse = gs.best_score['rmse']
# Train 예측 (trainset 전체를 test 형식으로 변환)
train_predictions = best_model.test(train_data.build_testset())

# Test 예측
test_predictions = best_model.test(test_data)

# 성능 계싼
train_rmse = accuracy.rmse(train_predictions, verbose = False)
train_mae = accuracy.mae(train_predictions, verbose = False)

test_rmse = accuracy.rmse(test_predictions, verbose = False)
test_mae = accuracy.rmse(test_predictions, verbose = False)

# 일반화 오차 차이
rmse_gap_train = test_rmse - train_rmse
rmse_gap_cv = test_rmse - cv_rmse
mae_gap = test_mae - train_mae

# 과적합 판정 기준 (RMSE 기준)
# 기준: test RMSE가 train RMSE보다 0.05 이상 크면 과적합 의심
if rmse_gap_train > 0.05:
    overfit_flag = '과적합 의심'
else:
    overfit_flag = '정상'

# 성능평가표 생성
result_df = DataFrame({
    'Model': ['SVD'],
    'CV_RMSE': [cv_rmse],
    'Train_RMSE': [train_rmse],
    'Test_RMSE': [test_rmse],
    'RMSE_Gap(Test-Train)': [rmse_gap_train],
    'RMSE_Gap(Test-CV)': [rmse_gap_cv],
    'Train_MAE': [train_mae],
    'Test_MAE': [test_mae],
    'MAE_Gap': [mae_gap],
    'Overfitting': [overfit_flag]
})

result_df

Unnamed: 0,Model,CV_RMSE,Train_RMSE,Test_RMSE,RMSE_Gap(Test-Train),RMSE_Gap(Test-CV),Train_MAE,Test_MAE,MAE_Gap,Overfitting
0,SVD,0.909,0.638,0.915,0.277,0.006,0.506,0.915,0.409,과적합 의심


## 5. TopN 추천
### 1. 아직 평가하지 않은 아이템에 대한 예측 수행
#### 1. 예측결과 생성

In [29]:
anti_testset = train_data.build_anti_testset()
predictions = best_model.test(anti_testset)
predictions[:5] # 예측 결과의 일부 출력

[Prediction(uid=234, iid=205, r_ui=3.5317375, est=3.541589357184377, details={'was_impossible': False}),
 Prediction(uid=234, iid=504, r_ui=3.5317375, est=3.472220045892584, details={'was_impossible': False}),
 Prediction(uid=234, iid=73, r_ui=3.5317375, est=2.873726564705869, details={'was_impossible': False}),
 Prediction(uid=234, iid=475, r_ui=3.5317375, est=3.1006027365735016, details={'was_impossible': False}),
 Prediction(uid=234, iid=294, r_ui=3.5317375, est=2.672695917780998, details={'was_impossible': False})]

#### 2. 예측결과 데이터프레임 구성


In [30]:
pred_df = DataFrame(predictions,
                    columns = ['user_id', 'item_id', 'true_rating', 'pred_rating', 'details'])

pred_df.head()

Unnamed: 0,user_id,item_id,true_rating,pred_rating,details
0,234,205,3.532,3.542,{'was_impossible': False}
1,234,504,3.532,3.472,{'was_impossible': False}
2,234,73,3.532,2.874,{'was_impossible': False}
3,234,475,3.532,3.101,{'was_impossible': False}
4,234,294,3.532,2.673,{'was_impossible': False}


### 2. 특정 사용자에 대한 상위 10개의 추천 영화 검색
#### 1. 76번 사용자에 대한 Top 10 추천 데이터

In [31]:
N = 10
user_id = 76

topn_df = pred_df[pred_df['user_id'] == user_id]

topn_df = (
    topn_df[['user_id', 'item_id', 'pred_rating']]
    .sort_values(['pred_rating'], ascending = [False])
    .groupby('user_id')
    .head(N)
    .reset_index(drop = True)
)

topn_df

Unnamed: 0,user_id,item_id,pred_rating
0,76,1449,4.455
1,76,647,4.449
2,76,134,4.432
3,76,1512,4.414
4,76,285,4.414
5,76,483,4.407
6,76,654,4.387
7,76,127,4.379
8,76,178,4.268
9,76,408,4.255


#### 2. 메타데이터와 병합하여 영화 정보 생성

In [32]:
movie_df = topn_df.merge(metadata, on = 'item_id', how = 'left')
movie_df

Unnamed: 0,user_id,item_id,pred_rating,title,release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,76,1449,4.455,Pather Panchali (1955),22-Mar-1996,http://us.imdb.com/M/title-exact?Pather%20Panchali%20(1955),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,76,647,4.449,Ran (1985),01-Jan-1985,http://us.imdb.com/M/title-exact?Ran%20(1985),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
2,76,134,4.432,Citizen Kane (1941),01-Jan-1941,http://us.imdb.com/M/title-exact?Citizen%20Kane%20(1941),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,76,1512,4.414,"World of Apu, The (Apur Sansar) (1959)",05-Apr-1996,http://us.imdb.com/M/title-exact?Apur%20Sansar%20(1959),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,76,285,4.414,Secrets & Lies (1996),04-Oct-1996,http://us.imdb.com/M/title-exact?Secrets%20&%20Lies%20(1996),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5,76,483,4.407,Casablanca (1942),01-Jan-1942,http://us.imdb.com/M/title-exact?Casablanca%20(1942),0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0
6,76,654,4.387,Chinatown (1974),01-Jan-1974,http://us.imdb.com/M/title-exact?Chinatown%20(1974),0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0
7,76,127,4.379,"Godfather, The (1972)",01-Jan-1972,"http://us.imdb.com/M/title-exact?Godfather,%20The%20(1972)",0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
8,76,178,4.268,12 Angry Men (1957),01-Jan-1957,http://us.imdb.com/M/title-exact?12%20Angry%20Men%20(1957),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,76,408,4.255,"Close Shave, A (1995)",28-Apr-1996,"http://us.imdb.com/M/title-exact?Close%20Shave,%20A%20(1995)",0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
