In [1]:
import numpy

def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter'''
    Q = Q.T

    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    # calculate error
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])

                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = numpy.dot(P,Q)

        e = 0

        for i in range(len(R)):

            for j in range(len(R[i])):

                if R[i][j] > 0:

                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)

                    for k in range(K):

                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        # 0.001: local minimum
        if e < 0.001:

            break

    return P, Q.T

In [4]:
R = [

     [5,,,],

     [4,,,1],

     [1,,,],

     [,,,4],

     [,1,,],
    
     [2,,,0],

    ]

R = numpy.array(R)
# N: num of User
N = len(R)
# M: num of Movie
M = len(R[0])
# Num of Features
K = 3

 
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

 

nP, nQ = matrix_factorization(R, P, Q, K)

nR = numpy.dot(nP, nQ.T)

SyntaxError: invalid syntax (2831805224.py, line 3)

In [3]:
nR

array([[5.01515027, 2.90839315, 3.6626842 , 0.99898363],
       [3.97613904, 2.35281884, 3.06613421, 0.99580939],
       [1.08724407, 0.83412789, 5.19842487, 4.96071447],
       [0.98372969, 0.76881276, 4.19891791, 3.97242263],
       [1.94947728, 1.12036158, 4.92494926, 4.03316774],
       [1.85855505, 1.16488659, 3.02399568, 2.17279283]])

In [5]:
import pandas as pd
test_C = pd.read_csv('P_coffees.csv')
test_U = pd.read_csv('P_users.csv')
test_CL = pd.read_csv('P_coffeelens.csv')
test_R = pd.read_csv('P_ratings.csv')

In [6]:
# @title Load the MovieLens data (run this cell).

# Download MovieLens data.
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()
print("Done. Dataset contains:")
print(zip_ref.read('ml-100k/u.info'))

# Load each data set (users, movies, and ratings).
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

Downloading movielens data...
Done. Dataset contains:
b'943 users\n1682 items\n100000 ratings\n'


In [7]:
ratings.head(3)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [8]:
test_R.head()

Unnamed: 0,userid,coffeeid,Stars,created_date
0,0,60,4,2023-11-15 12:45:57.508133
1,0,149,5,2023-11-15 12:45:57.511119
2,0,177,1,2023-11-15 12:45:57.513877
3,0,205,4,2023-11-15 12:45:57.515229
4,0,215,5,2023-11-15 12:45:57.518882


In [9]:
test_U.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userid             46 non-null     int64 
 1   Caffeine           46 non-null     int64 
 2   CoffeeType         46 non-null     int64 
 3   CupNoteCategories  46 non-null     object
 4   Body               46 non-null     int64 
 5   Sourness           46 non-null     int64 
 6   Sweetness          46 non-null     int64 
 7   Bitterness         46 non-null     int64 
dtypes: int64(7), object(1)
memory usage: 3.0+ KB


In [10]:
# movies.info() : 1682개의 영화
ratings.columns

Index(['user_id', 'movie_id', 'rating', 'unix_timestamp'], dtype='object')

In [11]:
from collections import Counter
c = Counter(ratings['movie_id'])

In [19]:
test_R.head(2)

Unnamed: 0,userid,coffeeid,Stars,created_date
0,0,60,4,2023-11-15 12:45:57.508133
1,0,149,5,2023-11-15 12:45:57.511119


In [18]:
new_R = pd.read_csv('survey_review_1121.csv')  # ratings 60개 추가
new_R.drop(columns=['email'], inplace=True)
new_R.head()

Unnamed: 0,id,Stars,created_date,CoffeeID_id
0,1,2,2023-11-19 08:15:56.924843,3061
1,2,4,2023-11-19 08:15:56.928438,3198
2,3,5,2023-11-19 08:15:56.931876,3241
3,4,2,2023-11-19 08:15:56.935511,3369
4,5,4,2023-11-19 08:15:56.939011,3461


In [24]:
# idxs : list of index of new_R
# df.loc[len(df.index)] = [userid, coffeeid, Stars, created_date] 
count = 0
for i in idx_s:
    while count <= 10:
        for k in range(6):
            if new_R['CoffeeID_id'] not in test_R[test_R['userid'] == k]['coffeeid']:
                test_R.loc[len(test_R.index)] = [k, new_R['CoffeeID_id'][i], new_R['Stars'][i], new_R['created_date']]
                idx_s.remove(i)
                count += 1
    count = 0
            

[0, 1, 2]