In [1]:
import numpy

def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter'''
    Q = Q.T

    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    # calculate error
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])

                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = numpy.dot(P,Q)

        e = 0

        for i in range(len(R)):

            for j in range(len(R[i])):

                if R[i][j] > 0:

                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)

                    for k in range(K):

                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        # 0.001: local minimum
        if e < 0.001:

            break

    return P, Q.T

In [4]:
R = [

     [5,,,],

     [4,,,1],

     [1,,,],

     [,,,4],

     [,1,,],
    
     [2,,,0],

    ]

R = numpy.array(R)
# N: num of User
N = len(R)
# M: num of Movie
M = len(R[0])
# Num of Features
K = 3

 
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)

 

nP, nQ = matrix_factorization(R, P, Q, K)

nR = numpy.dot(nP, nQ.T)

SyntaxError: invalid syntax (2831805224.py, line 3)

In [3]:
nR

array([[5.01515027, 2.90839315, 3.6626842 , 0.99898363],
       [3.97613904, 2.35281884, 3.06613421, 0.99580939],
       [1.08724407, 0.83412789, 5.19842487, 4.96071447],
       [0.98372969, 0.76881276, 4.19891791, 3.97242263],
       [1.94947728, 1.12036158, 4.92494926, 4.03316774],
       [1.85855505, 1.16488659, 3.02399568, 2.17279283]])

In [5]:
import pandas as pd
test_C = pd.read_csv('P_coffees.csv')
test_U = pd.read_csv('P_users.csv')
test_CL = pd.read_csv('P_coffeelens.csv')
test_R = pd.read_csv('P_ratings.csv')

In [6]:
# @title Load the MovieLens data (run this cell).

# Download MovieLens data.
print("Downloading movielens data...")
from urllib.request import urlretrieve
import zipfile

urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
zip_ref = zipfile.ZipFile('movielens.zip', "r")
zip_ref.extractall()
print("Done. Dataset contains:")
print(zip_ref.read('ml-100k/u.info'))

# Load each data set (users, movies, and ratings).
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

Downloading movielens data...
Done. Dataset contains:
b'943 users\n1682 items\n100000 ratings\n'


In [7]:
ratings.head(3)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [8]:
test_R.head()

Unnamed: 0,userid,coffeeid,Stars,created_date
0,0,60,4,2023-11-15 12:45:57.508133
1,0,149,5,2023-11-15 12:45:57.511119
2,0,177,1,2023-11-15 12:45:57.513877
3,0,205,4,2023-11-15 12:45:57.515229
4,0,215,5,2023-11-15 12:45:57.518882


In [9]:
test_U.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userid             46 non-null     int64 
 1   Caffeine           46 non-null     int64 
 2   CoffeeType         46 non-null     int64 
 3   CupNoteCategories  46 non-null     object
 4   Body               46 non-null     int64 
 5   Sourness           46 non-null     int64 
 6   Sweetness          46 non-null     int64 
 7   Bitterness         46 non-null     int64 
dtypes: int64(7), object(1)
memory usage: 3.0+ KB


In [10]:
# movies.info() : 1682개의 영화
ratings.columns

Index(['user_id', 'movie_id', 'rating', 'unix_timestamp'], dtype='object')

In [11]:
from collections import Counter
c = Counter(ratings['movie_id'])

In [19]:
test_R.head(2)

Unnamed: 0,userid,coffeeid,Stars,created_date
0,0,60,4,2023-11-15 12:45:57.508133
1,0,149,5,2023-11-15 12:45:57.511119


In [18]:
new_R = pd.read_csv('survey_review_1121.csv')  # ratings 60개 추가
new_R.drop(columns=['email'], inplace=True)
new_R.head()

Unnamed: 0,id,Stars,created_date,CoffeeID_id
0,1,2,2023-11-19 08:15:56.924843,3061
1,2,4,2023-11-19 08:15:56.928438,3198
2,3,5,2023-11-19 08:15:56.931876,3241
3,4,2,2023-11-19 08:15:56.935511,3369
4,5,4,2023-11-19 08:15:56.939011,3461


In [None]:
new_R['Stars']

In [24]:
# idxs : list of index of new_R
# df.loc[len(df.index)] = [userid, coffeeid, Stars, created_date] 
count = 0
for i in idx_s:
    while count <= 10:
        for k in range(6):
            if new_R['CoffeeID_id'] not in test_R[test_R['userid'] == k]['coffeeid']:
                test_R.loc[len(test_R.index)] = [k, new_R['CoffeeID_id'][i], new_R['Stars'][i], new_R['created_date']]
                idx_s.remove(i)
                count += 1
    count = 0
            

[0, 1, 2]

***
***

In [115]:
import pymysql

In [160]:
df = pd.read_csv('mysite/data/features.csv')
features = df.loc[:, df.columns != 'id']
coffees = df.reset_index(drop=False)
coffees = coffees.rename(columns={"index":"coffeeid"})

# MySQL 연결 정보
mysql_host = 'localhost'
mysql_user = 'root'
mysql_password = 'MShw1214!'
mysql_db = 'wondoodoo'

conn = pymysql.connect(host=mysql_host, user=mysql_user, password=mysql_password, database=mysql_db)

try:
    # 쿼리 작성
    review_query = f"SELECT * FROM review;"
    users_query = f"SELECT * FROM main_preference;"

    # MySQL에서 데이터 읽어오기
    users = pd.read_sql(users_query, conn)  # ['id', 'caf', 'blend', 'notes', 'sour', 'sweet', 'bitter', 'body', 'user_id']
    users.reset_index(drop=False, inplace=True)
    
    ratings = pd.read_sql(review_query, conn)   # 'id', 'Stars', 'content', 'created_date', 'Coffee_id', 'Order_id', 'user_id'
    ratings = ratings[['user_id', 'Coffee_id', 'Stars', 'created_date']]


    print(f"데이터({users.shape})를 성공적으로 불러왔습니다.")

except Exception as e:
    print(f"오류 발생: {e}")

finally:
    # 연결 닫기
    conn.close()


데이터((3, 10))를 성공적으로 불러왔습니다.


  users = pd.read_sql(users_query, conn)  # ['id', 'caf', 'blend', 'notes', 'sour', 'sweet', 'bitter', 'body', 'user_id']
  ratings = pd.read_sql(review_query, conn)   # 'id', 'Stars', 'content', 'created_date', 'Coffee_id', 'Order_id', 'user_id'


In [161]:
ratings['userid'] = ratings.merge(users, on='user_id')['index']
ratings['coffeeid'] = ratings.merge(coffees, how='left', left_on='Coffee_id', right_on='id')['coffeeid']

In [162]:
ratings.drop(columns=['user_id', 'Coffee_id'], inplace=True)
coffees.drop(columns='id', inplace=True)
users.drop(columns=['user_id', 'id'], inplace=True)
users.rename(columns={"index":"userid"}, inplace=True)

In [163]:
import json
jsonDec = json.decoder.JSONDecoder()

In [164]:
users

Unnamed: 0,userid,caf,blend,notes,sour,sweet,bitter,body
0,0,1,1,"[""\ucd08\ucf5c\ub9bf"", ""\uace0\uc18c\ud568""]",2,4,3,3
1,1,1,1,"[""\ucd08\ucf5c\ub9bf"", ""\uace0\uc18c\ud568""]",1,4,3,4
2,2,1,1,"[""\uaf43"", ""\ud5c8\ube0c""]",5,2,4,3


In [165]:
ratings

Unnamed: 0,Stars,created_date,userid,coffeeid
0,4,2023-11-23 03:55:00.373289,0,0


In [166]:
coffees.head(2)

Unnamed: 0,coffeeid,body,sour,sweet,bitter,caf,blend,single,지속가능성_0,지속가능성_공정무역,...,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크,꽃,과일,허브,달콤함,고소함,향료,초콜릿
0,0,2,3,5,1,1,1,0,0,1,...,0,1,0,0,1,0,0,0,0,1
1,1,4,2,4,1,1,1,0,0,1,...,0,0,1,0,1,0,0,1,0,1


In [167]:
#@title Solution
DOT = 'dot'
COSINE = 'cosine'
def compute_scores(query_embedding, item_embeddings, measure=DOT):
  """Computes the scores of the candidates given a query.
  Args:
    query_embedding: a vector of shape [k], representing the query embedding.
    item_embeddings: a matrix of shape [N, k], such that row i is the embedding
      of item i.
    measure: a string specifying the similarity measure to be used. Can be
      either DOT or COSINE.
  Returns:
    scores: a vector of shape [N], such that scores[i] is the score of item i.
  """
  u = query_embedding
  V = item_embeddings
  if measure == COSINE:
    V = V / np.linalg.norm(V, axis=1, keepdims=True)
    u = u / np.linalg.norm(u)
  scores = u.dot(V.T)
  return scores

In [168]:
df = pd.read_csv('mysite/data/features.csv')

In [169]:
# df_copy['cosine_similarity'] = cosine_similarity_matrix[-1]
# result_df = df_copy[df_copy['id'].notna()].sort_values(by='cosine_similarity', ascending=False)[:top_n]
# return result_df['id'].tolist()

def collaborative_rec(model, measure=DOT, exclude_rated=True, k=8, userid=0):
        scores = compute_scores(
            model.embeddings["userid"][userid], model.embeddings["coffeeid"], measure)
        score_key = measure + ' score'
        df_copy = df.copy()
        df_copy[score_key] = list(scores)
        if exclude_rated:
            # remove movies that are already rated
            rated_coffees = ratings[ratings.userid == str(userid)]["coffeeid"].values
            df_copy.reset_index(drop=False, inplace=True)
            df_copy.rename(columns={'index':'coffeeid'}, inplace=True)
            df_copy = df_copy[df_copy.coffeeid.apply(lambda coffeeid: coffeeid not in rated_coffees)]
        
        result_df = df_copy.sort_values([score_key], ascending=False).head(k)
        return result_df['id'].tolist()

In [170]:
from keras.models import load_model
new_model = load_model('mysite/model/test_model.hdf5')

In [171]:
new_model = load_model('mysite/model/test_model.hdf5')
new_model.embeddings = {
    'userid': new_model.get_layer('user_embedding').weights[0].numpy(), # U (943, 30)
    'coffeeid': new_model.get_layer('coffee_embedding').weights[0].numpy() # V (1682, 30)
}

In [136]:
import numpy as np
collaborative_rec(new_model, measure=DOT, k=5, userid=3, exclude_rated=True)

[4482, 4333, 3438, 4122, 4790]

In [173]:
collaborative_rec(new_model, userid=2)

[3508, 3151, 1026, 1475, 3740, 3011, 4694, 1573]

In [172]:
users[users['userid'] == 2]

Unnamed: 0,userid,caf,blend,notes,sour,sweet,bitter,body
2,2,1,1,"[""\uaf43"", ""\ud5c8\ube0c""]",5,2,4,3


In [174]:
test_list = collaborative_rec(new_model, userid=2)

In [175]:
df[df['id'].isin(test_list)]

Unnamed: 0,id,body,sour,sweet,bitter,caf,blend,single,지속가능성_0,지속가능성_공정무역,...,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크,꽃,과일,허브,달콤함,고소함,향료,초콜릿
113,1026,4,2,3,1,1,1,0,1,0,...,0,0,1,0,1,0,0,1,0,0
172,1475,4,3,4,2,1,1,0,1,0,...,0,0,1,0,0,0,0,1,0,1
182,1573,3,3,4,3,0,0,1,1,0,...,1,0,0,0,0,1,1,0,0,1
212,3011,3,2,4,2,1,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
262,3151,4,1,3,1,1,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
334,3508,4,1,4,3,0,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
381,3740,3,3,4,1,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
507,4694,5,1,5,4,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


***
dataframes 전처리 후
***

In [176]:
from __future__ import print_function

import numpy as np
import pandas as pd
import collections
# from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import sklearn.manifold

# Add some convenience functions to Pandas DataFrame.
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.3f}'.format
def mask(df, key, function):
  """Returns a filtered dataframe, by applying function to key"""
  return df[function(df[key])]

def flatten_cols(df):
  df.columns = [' '.join(col).strip() for col in df.columns.values]
  return df

pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

# Install spreadsheets and import authentication module.
USER_RATINGS = True

In [177]:
# Utility to split the data into training and test sets.
def split_dataframe(df, holdout_fraction=0.1):
  """Splits a DataFrame into training and test sets.
  Args:
    df: a dataframe.
    holdout_fraction: fraction of dataframe rows to use in the test set.
  Returns:
    train: dataframe for training
    test: dataframe for testing
  """
  test = df.sample(frac=holdout_fraction, replace=False)
  train = df[~df.index.isin(test.index)]
  return train, test

In [178]:
# Tensorflow2로 MF 구현

import tensorflow as tf
from tensorflow import keras
from keras.models import Model

from keras.layers import Input, Embedding, Flatten, dot

In [179]:
users.shape[0], coffees.shape[0]  # user 3명, 커피 559개

(3, 559)

### building models with actual users

In [182]:
n_latent_factors = 10 # user와 movie embedding의 차원수

user_input = Input(shape=[1], name='user')
coffee_input = Input(shape=[1], name='coffee')

user_embedding = Embedding(input_dim=users.shape[0] # 3
                           , output_dim = n_latent_factors # 10
                           , name='user_embedding'
                           )(user_input)

coffee_embedding = Embedding(input_dim=coffees.shape[0] # 559
                           , output_dim = n_latent_factors # 10
                           , name='coffee_embedding'
                           )(coffee_input)

user_vec = Flatten(name='flatten_users')(user_embedding) # 1차원 배열로 변환
coffee_vec = Flatten(name='flatten_coffees')(coffee_embedding) # 1차원 배열로 변환

product = dot([coffee_vec, user_vec], axes=1) # 평점
model = Model(inputs=[user_input, coffee_input], outputs=product) # user와 movie가 주어졌을 때, 평점을 예측하는 모델

In [183]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 coffee (InputLayer)         [(None, 1)]                  0         []                            
                                                                                                  
 user (InputLayer)           [(None, 1)]                  0         []                            
                                                                                                  
 coffee_embedding (Embeddin  (None, 1, 10)                5590      ['coffee[0][0]']              
 g)                                                                                               
                                                                                                  
 user_embedding (Embedding)  (None, 1, 10)                30        ['user[0][0]']          

In [184]:
# compile
model.compile(optimizer='adam', loss='mse')




In [185]:
ratings['userid'] = ratings['userid'].astype(int)
ratings['coffeeid'] = ratings['coffeeid'].astype(int)

train_ratings, test_ratings = split_dataframe(ratings)
train_ratings.shape, test_ratings.shape

((1, 4), (0, 4))

In [187]:
train_ratings.shape[0]

1

In [188]:
history = model.fit(x=[train_ratings['userid'], train_ratings['coffeeid']],
                    y=train_ratings['Stars'], epochs=500,
                    validation_data=([test_ratings['userid'], test_ratings['coffeeid'] ],
                                     test_ratings['Stars']),
                    verbose=1, batch_size=train_ratings.shape[0])

Epoch 1/500



UnimplementedError: Graph execution error:

Detected at node mean_squared_error/Cast defined at (most recent call last):
  File "C:\Users\Playdata\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main

  File "C:\Users\Playdata\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code

  File "c:\venvs\coffee\lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "c:\venvs\coffee\lib\site-packages\traitlets\config\application.py", line 1053, in launch_instance

  File "c:\venvs\coffee\lib\site-packages\ipykernel\kernelapp.py", line 736, in start

  File "c:\venvs\coffee\lib\site-packages\tornado\platform\asyncio.py", line 195, in start

  File "C:\Users\Playdata\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 600, in run_forever

  File "C:\Users\Playdata\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 1896, in _run_once

  File "C:\Users\Playdata\AppData\Local\Programs\Python\Python310\lib\asyncio\events.py", line 80, in _run

  File "c:\venvs\coffee\lib\site-packages\ipykernel\kernelbase.py", line 516, in dispatch_queue

  File "c:\venvs\coffee\lib\site-packages\ipykernel\kernelbase.py", line 505, in process_one

  File "c:\venvs\coffee\lib\site-packages\ipykernel\kernelbase.py", line 412, in dispatch_shell

  File "c:\venvs\coffee\lib\site-packages\ipykernel\kernelbase.py", line 740, in execute_request

  File "c:\venvs\coffee\lib\site-packages\ipykernel\ipkernel.py", line 422, in do_execute

  File "c:\venvs\coffee\lib\site-packages\ipykernel\zmqshell.py", line 546, in run_cell

  File "c:\venvs\coffee\lib\site-packages\IPython\core\interactiveshell.py", line 3024, in run_cell

  File "c:\venvs\coffee\lib\site-packages\IPython\core\interactiveshell.py", line 3079, in _run_cell

  File "c:\venvs\coffee\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\venvs\coffee\lib\site-packages\IPython\core\interactiveshell.py", line 3284, in run_cell_async

  File "c:\venvs\coffee\lib\site-packages\IPython\core\interactiveshell.py", line 3466, in run_ast_nodes

  File "c:\venvs\coffee\lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code

  File "C:\Users\Playdata\AppData\Local\Temp\ipykernel_6048\523606363.py", line 1, in <module>

  File "c:\venvs\coffee\lib\site-packages\keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\venvs\coffee\lib\site-packages\keras\src\engine\training.py", line 1807, in fit

  File "c:\venvs\coffee\lib\site-packages\keras\src\engine\training.py", line 1401, in train_function

  File "c:\venvs\coffee\lib\site-packages\keras\src\engine\training.py", line 1384, in step_function

  File "c:\venvs\coffee\lib\site-packages\keras\src\engine\training.py", line 1373, in run_step

  File "c:\venvs\coffee\lib\site-packages\keras\src\engine\training.py", line 1151, in train_step

  File "c:\venvs\coffee\lib\site-packages\keras\src\engine\training.py", line 1209, in compute_loss

  File "c:\venvs\coffee\lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__

  File "c:\venvs\coffee\lib\site-packages\keras\src\losses.py", line 143, in __call__

  File "c:\venvs\coffee\lib\site-packages\keras\src\losses.py", line 270, in call

  File "c:\venvs\coffee\lib\site-packages\keras\src\losses.py", line 1705, in mean_squared_error

Cast string to float is not supported
	 [[{{node mean_squared_error/Cast}}]] [Op:__inference_train_function_1825]