In [1]:
from sklearn import linear_model
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from glob import glob
import shutil

In [2]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#01. 데이터 불러오기
drama = sorted(glob(os.path.join(os.getcwd(),'ost_drama_dummied9_cp949.csv')))
data = pd.read_csv(drama[0])

# type(data)
data.columns = ['user_id','drama_id','rate']

In [4]:
n_users = data.user_id.unique().shape[0]
n_users

298

In [5]:
n_items = data.drama_id.unique().shape[0]
n_items

16

In [6]:
ratings = np.zeros((n_users, n_items))
ratings.shape

(298, 16)

In [7]:
for row in data.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]

In [8]:
ratings

array([[ 0.        , -0.1235746 ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.89529605,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  4.75731217, ...,  4.75731217,
         0.        ,  0.        ],
       ...,
       [ 0.        , -0.59366573,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.84984603, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        , -1.01519993, ..., -1.01519993,
         0.        ,  0.        ]])

In [9]:
from sklearn.model_selection import train_test_split

ratings_train, ratings_test = train_test_split(ratings, test_size=0.33, random_state=42)

In [10]:
ratings_train.shape

(199, 16)

In [11]:
# 사용자 간의 유사도 행렬 만들기

from sklearn.metrics.pairwise import cosine_similarity
user_distances = cosine_similarity(ratings_train)
user_distances

array([[ 1.        , -1.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-1.        ,  1.        , -1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        , -1.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        , -0.81649658],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -0.81649658,
         0.        ,  1.        ]])

In [12]:
# user prediction 만들기

user_pred = user_distances.dot(ratings_train) / np.array([np.abs(user_distances).sum(axis=1)]).T

In [13]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [14]:
np.sqrt(get_mse(user_pred, ratings_train))

0.8446533710119778

In [15]:
np.sqrt(get_mse(user_pred, ratings_test))

0.8811544163817294

In [16]:
from sklearn.neighbors import NearestNeighbors

k=5
neigh = NearestNeighbors(n_neighbors=k, metric='cosine')

In [37]:
neigh.fit(ratings_train)

NearestNeighbors(metric='cosine')

In [38]:
top_k_distances, top_k_users = neigh.kneighbors(ratings, return_distance=True)

In [39]:
top_k_users

array([[177, 141, 175,  62,  55],
       [ 85, 124, 111, 114, 179],
       [ 38,  73,  70, 142, 178],
       ...,
       [177, 141, 175,  62,  55],
       [173, 182,  27,  84,  60],
       [ 91, 192, 115, 184,  13]], dtype=int64)

In [40]:
top_k_distances

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [1.11022302e-16, 1.83503419e-01, 1.83503419e-01, 1.83503419e-01,
        3.33333333e-01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.92893219e-01, 2.92893219e-01,
        2.92893219e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00]])

In [42]:
top_k_users = pd.DataFrame(top_k_users)
top_k_users.to_csv('ranked2.csv', index=False, encoding='cp949')

In [21]:
# 루프만들기
user_pred_k = np.zeros(ratings_train.shape)

In [22]:
# for i in range(1):
for i in range(ratings_train.shape[0]):
    user_pred_k[i, :] = top_k_distances[i].T.dot(ratings_train[top_k_users][i]) / np.array([np.abs(top_k_distances[i].T).sum(axis=0)]).T

  user_pred_k[i, :] = top_k_distances[i].T.dot(ratings_train[top_k_users][i]) / np.array([np.abs(top_k_distances[i].T).sum(axis=0)]).T


In [None]:
user_pred_k

In [None]:
np.sqrt(get_mse(user_pred_k, ratings_train))

In [None]:
np.sqrt(get_mse(user_pred_k, ratings_test))