# 요긴가 조
## 인공신경망을 이용한 음식점 추천
### 목표: 뉴럴 네트워크 모델을 설계한 후 학습하여 각 음식점의 embedding들을 생성하고, 음식점 embedding을 활용하여 각 사용자에게 맞춤형 음식점을 추천

In [2]:
import warnings, random
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import mplcursors
import numpy as np
import torch
import torch.optim as optim
from torch import nn
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from scipy.spatial.distance import cdist
from itertools import permutations # For making pairs

import matplotlib.font_manager as fm
from matplotlib import rc

fm.get_fontconfig_fonts()
# 디렉토리 및 파일 이름에 맞추어 변경
font_location = '/Users/giyeonlee/Downloads/추천시스템/추천시스템_프로젝트/Project_Final/D2Coding-Ver1/D2Coding/D2Coding-Ver1.3.2-20180524.ttf'
font_name = fm.FontProperties(fname=font_location).get_name()
rc('font', family=font_name)
print(font_name)
warnings.filterwarnings('ignore')

D2Coding


In [3]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using PyTorch version:', torch.__version__, ' Device:', device)

Using PyTorch version: 1.11.0  Device: cpu


### 크롤링한 데이터 전처리

In [4]:
df = pd.read_csv(
    '/Users/giyeonlee/Downloads/추천시스템/추천시스템_프로젝트/Project_Final/YogingaDataSet.csv'
)
df.head(2)

Unnamed: 0.1,Unnamed: 0,Restaurant,UserID,Menu,Review,Total,Taste,Quantity,Delivery,Date
0,0,꽃게나라 간장게장,wu**님,（XL）간장게장 대 정식 100%연평도암꽃게/1(메뉴 선택(1미+김치찌개+계란날밥 ...,조카가 먹고 싶다고 해서 시켰는데 잘 먹었습니다. 감사합니다.,5,5.0,5.0,5.0,2022년 4월 21일
1,1,꽃게나라 간장게장,ff**님,,안 짜서 너무 좋았어요 비린맛도 없고 깔끔한 맛이라 다음에 또 시켜먹을 거 같습니다~,5,5.0,5.0,5.0,2022년 4월 16일


In [6]:
# 별점 Total 열을 Rating으로 이름 변경 
df.rename(columns={'Total':'Rating'},inplace = True)
#음식점 이름, UserID, Rating만 남긴다.
df = df.filter(['Restaurant', 'UserID', 'Rating'])

df.head(10)

Unnamed: 0,Restaurant,UserID,Rating
0,꽃게나라 간장게장,wu**님,5
1,꽃게나라 간장게장,ff**님,5
2,꽃게나라 간장게장,zh**님,5
3,꽃게나라 간장게장,kw**님,5
4,꽃게나라 간장게장,dk**님,5
5,꽃게나라 간장게장,jw**님,5
6,키싸-디저트&눈꽃빙수,ej**님,5
7,키싸-디저트&눈꽃빙수,ej**님,5
8,키싸-디저트&눈꽃빙수,ej**님,5
9,키싸-디저트&눈꽃빙수,fo**님,4


In [7]:
# 유저와 음식점 개수 확인
n_users = len(df['UserID'].unique())
n_restaurants = len(df['Restaurant'].unique())

n_users, n_restaurants

(1309, 635)

In [8]:
df['Rating'].describe()

count    358132.000000
mean          4.739440
std           0.731522
min           0.000000
25%           5.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: Rating, dtype: float64

### 중복 리뷰는 평균 aggregation으로 처리

In [9]:
# User x restaurant utility matrix: several reviews by the same user for the
# same restaurant are averaged. The '손님' ("guest") row is removed.
user_restaurant = df.pivot_table(
    'Rating', index='UserID', columns='Restaurant', aggfunc='mean'
)
user_restaurant.drop(index=['손님'], inplace=True)
print(user_restaurant.shape)

# Restaurant x user utility matrix (same pivot with index and columns
# swapped), again without the '손님' column.
restaurant_user = df.pivot_table(
    'Rating', index='Restaurant', columns='UserID', aggfunc='mean'
)
restaurant_user.drop(columns=['손님'], inplace=True)
print(restaurant_user.shape)


(1307, 634)
(634, 1307)


### 음식점의 평균 평점을 구해 상위평점의 음식점 구하기
1. 음식점 2곳 이상 리뷰를 단 user 를 찾는다.
2. 해당 user들에게 리뷰가 달리지 않은 음식점을 Drop 한다.
3. 그렇게 만들어진 Utility Matrix로 음식점들의 ID를 설정하고 해당 정보를 저장한다.
4. Utility Matrix의 row와 column을 숫자로 변경한다.
5. 음식점들의 순열을 구해 pair에 저장한다.

In [10]:
# 상위 리뷰 처리
#sum(df_utility.count(axis=1).loc[df_utility.mean(axis=1)>=4.85])
#df_utility = restaurant_user.loc[restaurant_user.mean(axis=1)>=4.85]

#### 1. 음식점 2곳 이상 리뷰를 단 user 찾기

In [11]:
user_mask = restaurant_user.columns[restaurant_user.count(axis=0)>1]

#### 2. 해당 user들에게 리뷰가 달리지 않은 음식점을 Drop하기

In [12]:
restaurant_user[user_mask].sum(axis=1).sort_values()

Restaurant
더정원보쌈족발                    0.000000
홍대자연보쌈세트                   0.000000
서두산딤섬만두-홍대점                3.000000
마포구회맛집                     5.000000
NAMJAPIZZA&SPAGHETTI       5.000000
                           ...     
또래오래-마포망원점              2288.101669
롯데리아-망원점                2312.610164
후라이드참잘하는집-마포점           2478.504742
치킨플러스-성산점               2731.477306
대족장                     3293.180453
Length: 634, dtype: float64

In [13]:
# Restrict the matrix to the users selected by user_mask, then remove the two
# restaurants whose rating sum over those users was 0 in the listing above.
df_utility = restaurant_user[user_mask].drop(
    index=['더정원보쌈족발', '홍대자연보쌈세트']
)
df_utility

UserID,-_**님,-b**님,-c**님,-e**님,-g**님,-m**님,00**님,01**님,02**님,03**님,...,zq**님,zr**님,zs**님,zt**님,zu**님,zv**님,zw**님,zx**님,zy**님,zz**님
Restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10평파스타&화덕피자,,,,,,,,,,,...,5.0,,,,,,,,,
1인기사식당,,,,,,,,5.000000,,,...,,,5.0,,,,5.0,3.0,,
1인살로만강명수아구찜앤탕-마포본점,,,,,,,,,,,...,,,,,,,,,,
1인용마라탕-서울수색점,,,,,,,,,,,...,,,,,,,,,,
1인용묵은지김치찜-서울수색점,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
회대장산지직송숙성활어전문점,,5.0,,,,,,5.000000,,,...,,,,,,,,,,
회뜨는총각-연남점,,,,,,,5.0,,,5.0,...,,,,,,,,,,
후라이드참잘하는집-마포점,5.0,,,5.0,,,,4.285714,5.0,5.0,...,,,5.0,,5.0,,,5.0,,4.777778
희야네반찬,,,,,,,,5.000000,,,...,,,,,,,,,,


#### 3. 음식점과 유저 id설정

In [14]:
restaurant_id = list(df_utility.index)
user_id = list(df_utility.columns)

In [15]:
restaurant_dict = {string:i for i,string in enumerate(restaurant_id)}
user_dict = {string:i for i,string in enumerate(user_id)}

In [16]:
restaurant_dict

{'10평파스타&화덕피자': 0,
 '1인기사식당': 1,
 '1인살로만강명수아구찜앤탕-마포본점': 2,
 '1인용마라탕-서울수색점': 3,
 '1인용묵은지김치찜-서울수색점': 4,
 '1인찌개와탕&냉면': 5,
 '1인한솥명가갈비찜-월곡점': 6,
 '1인할매보쌈족발-월곡점': 7,
 '24시 진짜루': 8,
 '5959양꼬치&마라탕': 9,
 '5인삼겹살곱창': 10,
 '60계-망원점': 11,
 '7번가피자-서교망원점': 12,
 '88전복죽': 13,
 'ASAPPIZZA-홍대점': 14,
 'BBQ-망원점': 15,
 'BHC-망원점': 16,
 'BHC-합정역점': 17,
 'JH집밥김치찌개': 18,
 'JohnHouse': 19,
 'KFC-홍익대점': 20,
 'KFC-홍익대점(심야)': 21,
 'NAMJAPIZZA&SPAGHETTI': 22,
 'NAPALPIZZA&WINE': 23,
 'OK피자시즌2': 24,
 'The진한나주곰탕1인용-홍대점': 25,
 'The청춘키친-은평점': 26,
 'USA존슨부대찌개-수색점': 27,
 '가마로강정-광흥창점': 28,
 '가마로강정-망원점': 29,
 '가마로강정-상암점': 30,
 '가마치통닭-서울망원역점': 31,
 '가분뼈다귀해장국&김치찜-강북점': 32,
 '가이오돈까스&국수': 33,
 '가장맛있는족발-상암디지털역점': 34,
 '갈비마마-응암점': 35,
 '감미분식-연남점': 36,
 '감성카츠': 37,
 '갓피맥-상암점': 38,
 '강릉어시장회': 39,
 '강릉초당골짬뽕순두부-마포점': 40,
 '강명수묵은지김치찜-마포본점': 41,
 '건강한이불덮밥&장어규동': 42,
 '걸작떡볶이치킨-서울광흥창점': 43,
 '걸작떡볶이치킨-서울상암점': 44,
 '걸작떡볶이치킨-서울신촌점': 45,
 '걸작떡볶이치킨-서울합정점': 46,
 '고기왕창자이언트비빔밥-홍대점': 47,
 '고돼지-신촌점': 48,
 '고릴라요정': 49,
 '고봉이네1인분생고기김치찌개': 50,
 '고유식탁

In [17]:
user_dict

{'-_**님': 0,
 '-b**님': 1,
 '-c**님': 2,
 '-e**님': 3,
 '-g**님': 4,
 '-m**님': 5,
 '00**님': 6,
 '01**님': 7,
 '02**님': 8,
 '03**님': 9,
 '04**님': 10,
 '05**님': 11,
 '06**님': 12,
 '07**님': 13,
 '08**님': 14,
 '09**님': 15,
 '0_**님': 16,
 '0a**님': 17,
 '0b**님': 18,
 '0d**님': 19,
 '0h**님': 20,
 '0k**님': 21,
 '0l**님': 22,
 '0m**님': 23,
 '0n**님': 24,
 '0o**님': 25,
 '0s**님': 26,
 '0w**님': 27,
 '0y**님': 28,
 '0z**님': 29,
 '10**님': 30,
 '11**님': 31,
 '12**님': 32,
 '13**님': 33,
 '14**님': 34,
 '15**님': 35,
 '16**님': 36,
 '17**님': 37,
 '18**님': 38,
 '19**님': 39,
 '1_**님': 40,
 '1a**님': 41,
 '1b**님': 42,
 '1d**님': 43,
 '1e**님': 44,
 '1g**님': 45,
 '1i**님': 46,
 '1j**님': 47,
 '1m**님': 48,
 '1n**님': 49,
 '1o**님': 50,
 '1q**님': 51,
 '1s**님': 52,
 '1w**님': 53,
 '20**님': 54,
 '21**님': 55,
 '22**님': 56,
 '23**님': 57,
 '24**님': 58,
 '25**님': 59,
 '26**님': 60,
 '27**님': 61,
 '28**님': 62,
 '29**님': 63,
 '2_**님': 64,
 '2b**님': 65,
 '2d**님': 66,
 '2e**님': 67,
 '2g**님': 68,
 '2h**님': 69,
 '2j**님': 70,
 '2k**님': 71,
 '

#### 4. Utility Matrix의 행과 열을 숫자로 변경

In [18]:
df_utility = df_utility.reset_index()

In [19]:
df_utility.index.name = 'RestaurantID'

In [20]:
df_utility = df_utility.rename(columns=user_dict)

In [21]:
df_utility = df_utility.drop(['Restaurant'],axis=1)

In [22]:
df_utility

UserID,0,1,2,3,4,5,6,7,8,9,...,1161,1162,1163,1164,1165,1166,1167,1168,1169,1170
RestaurantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,5.0,,,,,,,,,
1,,,,,,,,5.000000,,,...,,,5.0,,,,5.0,3.0,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,,5.0,,,,,,5.000000,,,...,,,,,,,,,,
628,,,,,,,5.0,,,5.0,...,,,,,,,,,,
629,5.0,,,5.0,,,,4.285714,5.0,5.0,...,,,5.0,,5.0,,,5.0,,4.777778
630,,,,,,,,5.000000,,,...,,,,,,,,,,


#### 5. 음식점들의 순열을 구해 pair에 저장

In [23]:
df_utility[3].loc[df_utility[3].notna()].index

Int64Index([ 19, 116, 156, 217, 232, 319, 321, 367, 391, 453, 481, 503, 554,
            555, 583, 592, 612, 629],
           dtype='int64', name='RestaurantID')

In [52]:
# Build (input restaurant, target restaurant) index pairs per user.
# Every ordered pair of restaurants rated by the same user becomes one sample.
train_pairs = []
test_pairs = []
# Iterate over every user column instead of hard-coding the number of users.
for i in range(df_utility.shape[1]):
    ratings = df_utility[i]
    # Row labels (restaurant ids) of the restaurants this user has rated.
    row_ate = list(ratings[ratings.notna()].index)
    random.shuffle(row_ate)
    if len(row_ate) >= 12:
        # Users with at least 12 rated restaurants: the first 90% of the
        # shuffled list feeds the training pairs, the remainder the test pairs.
        bound = len(row_ate) * 9 // 10
        # extend() appends in place; `lst = lst + ...` would copy the whole
        # (multi-million element) list on every iteration.
        train_pairs.extend(permutations(row_ate[:bound], 2))
        test_pairs.extend(permutations(row_ate[bound:], 2))
    else:
        train_pairs.extend(permutations(row_ate, 2))
    if i % 110 == 0:
        # Periodic progress report: pairs accumulated so far.
        print(len(train_pairs), len(test_pairs))

272 2
136268 1712
173568 2162
861508 10818
2151424 26792
3141240 39058
4515794 56088
5615900 69738
6344152 78662
7886002 97596
8339898 103186


In [53]:
random.shuffle(train_pairs)
random.shuffle(test_pairs)
train_pairs = torch.tensor(train_pairs)
test_pairs = torch.tensor(test_pairs)
print(len(train_pairs))
print(len(test_pairs))

8741950
108190


### 모델 설계 및 학습 시키기
1. 모델 설계하기
2. batch 함수
3. Train epoch 함수
4. Test epoch 함수
5. Training 함수
6. 모델 생성 및 학습

#### 1. 모델 설계

In [54]:
class Model(nn.Module):
    """Two-layer, bias-free network: item one-hot -> hidden -> item scores.

    Args:
        n_items: Width of the input and output layers. Defaults to 632,
            the value previously hard-coded here.
        embed_dim: Width of the hidden layer. Defaults to 50.

    Attributes are named ``il`` (input -> hidden) and ``ho``
    (hidden -> output) so previously saved state dicts still load.
    """

    def __init__(self, n_items=632, embed_dim=50):
        super().__init__()
        self.il = nn.Linear(n_items, embed_dim, bias=False)
        self.ho = nn.Linear(embed_dim, n_items, bias=False)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return unnormalised scores of shape ``(..., n_items)`` for ``x``."""
        hidden = self.relu(self.il(x))
        return self.ho(hidden)

In [55]:
def init_weights(m):
    """Xavier-uniform initialise the weight of ``m`` if it is an ``nn.Linear``.

    Intended for ``Module.apply``; modules of any other type are left as is.
    Uses the in-place ``xavier_uniform_`` rather than the deprecated
    ``xavier_uniform`` alias.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

#### 2. batch 크기만큼 data를 잘라서 주는 함수

In [56]:
def batch(iterable, n=1):
    """Yield ``(column 0, column 1)`` of consecutive row chunks of ``iterable``.

    ``iterable`` must support ``len()``, row slicing and ``[:, k]`` column
    indexing. Each chunk holds at most ``n`` rows; the last may be shorter.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        if stop > total:
            stop = total
        chunk = iterable[start:stop]
        yield chunk[:, 0], chunk[:, 1]

#### 3. Train epoch 함수

In [57]:
def training_epoch(train_loader, network, loss_func, optimizer, epoch,
                   num_classes=632, batch_size=2048, log_interval=425):
    """Run one optimisation pass over ``train_loader``.

    Args:
        train_loader: Two-column integer tensor of pairs; it is chunked with
            ``batch()``, column 0 being the network input index and column 1
            the target index.
        network: Module mapping a float one-hot batch to class scores.
        loss_func: Callable ``loss_func(outputs, label)`` returning a scalar.
        optimizer: Optimiser updating ``network``'s parameters.
        epoch: Epoch number, used only in the progress message.
        num_classes: One-hot width (defaults to the previous constant 632).
        batch_size: Rows per mini-batch (defaults to the previous 2048).
        log_interval: Print progress every this many batches (default 425).

    Returns:
        ``(train_losses, train_correct)``: the per-batch loss values and the
        number of samples whose arg-max prediction equals the label (a
        0-dim tensor once at least one batch has been processed).
    """
    train_losses = []
    train_correct = 0
    total = len(train_loader)
    for batch_idx, (rest, label) in enumerate(batch(train_loader, batch_size)):
        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()
        one_rest = F.one_hot(rest, num_classes=num_classes)

        # Forward pass; calling the module (not .forward) keeps hooks working.
        outputs = network(one_rest.float())

        # Loss for this batch.
        loss = loss_func(outputs, label)
        train_losses.append(loss.item())

        # Count correct arg-max predictions for the training accuracy.
        pred = torch.argmax(outputs, dim=1)
        train_correct += pred.eq(label).sum()

        # Back-propagate and update the weights.
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            # Samples consumed before this batch; every earlier batch is full
            # sized, so this stays correct even when the current one is short.
            seen = batch_idx * batch_size
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                  .format(epoch, seen, total, 100. * seen / total,
                          loss.item()))

    return train_losses, train_correct

#### 4. Test epoch 함수

In [58]:
def test_epoch(test_loader, network, loss_func, num_classes=632,
               batch_size=2048):
    """Evaluate ``network`` on ``test_loader`` without tracking gradients.

    Args:
        test_loader: Two-column integer tensor of pairs, chunked with
            ``batch()`` (column 0 = input index, column 1 = target index).
        network: Module mapping a float one-hot batch to class scores.
        loss_func: Callable ``loss_func(outputs, label)`` returning a scalar.
        num_classes: One-hot width (defaults to the previous constant 632).
        batch_size: Rows per mini-batch (defaults to the previous 2048).

    Returns:
        ``(test_losses, test_accuracy)``: per-batch loss values and the
        accuracy in percent. For an empty ``test_loader`` the accuracy is
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    correct = 0
    test_losses = []
    total = len(test_loader)

    with torch.no_grad():
        for rest, label in batch(test_loader, batch_size):
            one_rest = F.one_hot(rest, num_classes=num_classes)

            # Forward pass.
            outputs = network(one_rest.float())

            # Loss for this batch.
            loss = loss_func(outputs, label)
            test_losses.append(loss.item())

            # Correct arg-max predictions in this batch.
            pred = torch.argmax(outputs, dim=1)
            correct += pred.eq(label).sum()

        # Overall accuracy in percent; guard against an empty test set.
        test_accuracy = 100. * correct / total if total else 0.0

        # Report the result for this evaluation pass.
        print('Test set: Accuracy: {}/{} ({:.0f}%)\n'
              .format(correct, total, test_accuracy))
    return test_losses, test_accuracy


#### 5. Training 함수

In [59]:
def training(network, learning_rate=0.003, epoches=20):
    """Train ``network`` and evaluate it after every epoch.

    Reads the module-level ``train_pairs`` and ``test_pairs`` tensors and
    uses cross-entropy loss with the Adam optimiser.

    Args:
        network: The model to train (updated in place).
        learning_rate: Adam learning rate (defaults to the previous 0.003).
        epoches: Number of epochs (defaults to the previous 20).

    Returns:
        ``(train_losses_per_epoch, test_losses_per_epoch, train_accuracies,
        test_accuracies)`` - one entry per epoch; losses are batch means and
        accuracies are percentages.
    """
    cls_loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(network.parameters(), lr=learning_rate)

    train_losses_per_epoch = []
    test_losses_per_epoch = []

    train_accuracies = []
    test_accuracies = []

    for epoch in range(epoches):
        # Switch to training mode.
        network.train()

        train_losses, train_correct = training_epoch(
            train_pairs, network, cls_loss, optimizer, epoch)

        # Mean batch loss and accuracy for this epoch.
        train_losses_per_epoch.append(np.mean(train_losses))

        train_accuracy = train_correct / len(train_pairs) * 100
        train_accuracies.append(train_accuracy)

        print('\nTraining set: Accuracy: {}/{} ({:.0f}%)'
              .format(train_correct, len(train_pairs),
                      100. * train_correct / len(train_pairs)))

        # Switch to evaluation mode; test_epoch disables gradient tracking
        # itself, so no extra no_grad() context is needed here.
        network.eval()
        test_losses, test_accuracy = test_epoch(test_pairs, network, cls_loss)

        test_losses_per_epoch.append(np.mean(test_losses))
        test_accuracies.append(test_accuracy)

    return train_losses_per_epoch, test_losses_per_epoch, train_accuracies, test_accuracies

#### 6. 모델 생성 및 학습

In [60]:
network = Model()
network.apply(init_weights)

Model(
  (il): Linear(in_features=632, out_features=50, bias=False)
  (ho): Linear(in_features=50, out_features=632, bias=False)
  (relu): ReLU()
)

In [45]:
training(network)


Training set: Accuracy: 46967/8741950 (1%)
Test set: Accuracy: 584/108190 (1%)


Training set: Accuracy: 46933/8741950 (1%)
Test set: Accuracy: 566/108190 (1%)


Training set: Accuracy: 47057/8741950 (1%)
Test set: Accuracy: 573/108190 (1%)


Training set: Accuracy: 47134/8741950 (1%)
Test set: Accuracy: 546/108190 (1%)


Training set: Accuracy: 47130/8741950 (1%)
Test set: Accuracy: 541/108190 (1%)


Training set: Accuracy: 47320/8741950 (1%)
Test set: Accuracy: 562/108190 (1%)


Training set: Accuracy: 47309/8741950 (1%)
Test set: Accuracy: 562/108190 (1%)


Training set: Accuracy: 47744/8741950 (1%)
Test set: Accuracy: 571/108190 (1%)


Training set: Accuracy: 47834/8741950 (1%)
Test set: Accuracy: 584/108190 (1%)


Training set: Accuracy: 47881/8741950 (1%)
Test set: Accuracy: 568/108190 (1%)


Training set: Accuracy: 47796/8741950 (1%)
Test set: Accuracy: 575/108190 (1%)


Training set: Accuracy: 47862/8741950 (1%)
Test set: Accuracy: 571/108190 (1%)




Training set: Accuracy: 47979/8741950 (1%)
Test set: Accuracy: 574/108190 (1%)


Training set: Accuracy: 48034/8741950 (1%)
Test set: Accuracy: 582/108190 (1%)


Training set: Accuracy: 47997/8741950 (1%)
Test set: Accuracy: 580/108190 (1%)


Training set: Accuracy: 47993/8741950 (1%)
Test set: Accuracy: 571/108190 (1%)


Training set: Accuracy: 48107/8741950 (1%)
Test set: Accuracy: 580/108190 (1%)


Training set: Accuracy: 48022/8741950 (1%)
Test set: Accuracy: 586/108190 (1%)


Training set: Accuracy: 48039/8741950 (1%)
Test set: Accuracy: 580/108190 (1%)


Training set: Accuracy: 47980/8741950 (1%)
Test set: Accuracy: 593/108190 (1%)



([6.023145108346946,
  6.01743998475029,
  6.0170598978403165,
  6.016719524908971,
  6.016424781015646,
  6.01618072528084,
  6.0159653132848145,
  6.015772756633281,
  6.015595697803725,
  6.015422729564568,
  6.01526832692264,
  6.01512172222696,
  6.01499407663924,
  6.014889146997376,
  6.014795350257729,
  6.014709711912562,
  6.014634288315137,
  6.0145643268432405,
  6.014497159934206,
  6.014434952287993],
 [6.057094591968465,
  6.059483285220164,
  6.061497598324182,
  6.062968272083211,
  6.063788494973813,
  6.0641789616278885,
  6.064442067776087,
  6.064659685458777,
  6.064866857708625,
  6.065043008552407,
  6.0652268247784304,
  6.065358746726558,
  6.06553478960721,
  6.065650652039726,
  6.0658427724298445,
  6.06597723151153,
  6.066203153358315,
  6.066348984556378,
  6.066524343670539,
  6.066614276957962],
 [tensor(0.5373),
  tensor(0.5369),
  tensor(0.5383),
  tensor(0.5392),
  tensor(0.5391),
  tensor(0.5413),
  tensor(0.5412),
  tensor(0.5461),
  tensor(0.5472

### (선택) weights 저장

In [61]:
import pickle
# Serialise the whole ``network`` object (not only its parameters) to
# ./weights.p with pickle.
# NOTE(review): the loading cell further down reads a ``.pt`` file through
# ``load_state_dict``; that is a different format from this pickle, so this
# file cannot be loaded by that cell as is - confirm which format is intended.
with open('./weights.p', 'wb') as f:
    pickle.dump(network, f)

# 여기까지 돌려주십쇼
_________________________________________________

## 학습된 모델 파일 불러오기

In [62]:
network = Model()
network.load_state_dict(torch.load('./NeuralNetwork/best_weights/50/0.003_epoches80weights.pt',map_location=device))

<All keys matched successfully>

In [63]:
network.eval()

Model(
  (il): Linear(in_features=632, out_features=50, bias=False)
  (ho): Linear(in_features=50, out_features=632, bias=False)
  (relu): ReLU()
)

### 임베딩 가져오기

In [64]:
# The input layer is bias-free, so its weight matrix is its only parameter.
# detach().clone() is the recommended way to copy a parameter
# (torch.tensor(param) emits a copy-construct warning). nn.Linear stores the
# weight as (out_features, in_features); transposing gives one row per input
# index, i.e. one embedding row per restaurant id.
embed = network.il.weight.detach().clone().cpu().numpy().T
print(embed.shape)
print(embed)

(632, 50)
[[ 0.48390585 -0.01152907 -0.03111128 ... -0.01846463 -0.05292297
  -0.06639238]
 [-0.05638285 -0.0036038  -0.09035727 ... -0.07565743 -0.04619574
  -0.09322172]
 [-0.01872059 -0.0409628  -0.02051431 ... -0.08848263 -0.00742061
   0.25167188]
 ...
 [ 0.3587027  -0.03001981 -0.00709418 ... -0.02143769 -0.01964534
  -0.01642147]
 [ 0.70644665 -0.00759177 -0.00520285 ... -0.01561237 -0.06334361
  -0.06381025]
 [-0.01442345 -0.06538141 -0.01303202 ... -0.01283355 -0.00814238
  -0.08479259]]


In [65]:
df_utility

UserID,0,1,2,3,4,5,6,7,8,9,...,1161,1162,1163,1164,1165,1166,1167,1168,1169,1170
RestaurantID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,5.0,,,,,,,,,
1,,,,,,,,5.000000,,,...,,,5.0,,,,5.0,3.0,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,,5.0,,,,,,5.000000,,,...,,,,,,,,,,
628,,,,,,,5.0,,,5.0,...,,,,,,,,,,
629,5.0,,,5.0,,,,4.285714,5.0,5.0,...,,,5.0,,5.0,,,5.0,,4.777778
630,,,,,,,,5.000000,,,...,,,,,,,,,,


### 유저의 임베딩 구하기

### 유저가 먹었던 음식점의 임베딩을 저장하는 방식

In [66]:
# Boolean (restaurant x user) mask of rated cells, computed once instead of
# once per user.
rated_mask = df_utility.notnull()
users_ate = []
for i in range(rated_mask.shape[1]):
    # Embedding rows of the restaurants rated by user i.
    userl = embed[rated_mask[i].to_numpy()]
    users_ate.append(userl)
print(len(users_ate),len(users_ate[0][0]))
# Users rate different numbers of restaurants, so the per-user arrays are
# ragged; store them in an explicit 1-D object array (np.array on a ragged
# list raises on recent NumPy versions).
integrated_ate = np.empty(len(users_ate), dtype=object)
for idx, rows in enumerate(users_ate):
    integrated_ate[idx] = rows

1171 50


### 유저가 먹었던 음식점 임베딩들의 평균을 저장하는 방식

In [67]:
# Boolean (restaurant x user) mask of rated cells, computed once instead of
# once per user.
rated_mask = df_utility.notnull()
users_embed = []
for i in range(rated_mask.shape[1]):
    visited = embed[rated_mask[i].to_numpy()]
    if len(visited) == 0:
        # No rated restaurant: keep a NaN vector of the embedding width so
        # every entry has the same length. (The previous check on the length
        # of the mean could never be true, since the mean always has one
        # value per embedding dimension.)
        useri = np.full(embed.shape[1], np.nan, dtype=embed.dtype)
    else:
        # User embedding = mean of the embeddings of the rated restaurants.
        useri = visited.mean(axis=0)
    users_embed.append(useri)
print(f"({len(users_embed)},{len(users_embed[0])})")
print(users_embed)

(1171,50)
[array([ 0.13471694, -0.0220233 , -0.01997666, -0.02201019, -0.02639022,
        0.04477799, -0.05034122, -0.03068692, -0.02091275, -0.0297979 ,
        0.22649164, -0.03398198, -0.03301131, -0.02794442, -0.02882438,
       -0.02789955, -0.04017292, -0.02586437, -0.02849276,  0.02165649,
       -0.02174208, -0.03828077, -0.03794393, -0.01872646, -0.03015079,
       -0.03618566, -0.02090836, -0.02165811, -0.03201923, -0.02030943,
       -0.02180818, -0.01936628, -0.02535523, -0.03003129, -0.0305809 ,
       -0.00710186, -0.02548291,  0.24644983, -0.02728512, -0.03423497,
       -0.0240576 ,  0.08944957,  0.17673603, -0.03116143, -0.02309114,
       -0.02809334, -0.01662134, -0.02664121, -0.03159468, -0.02799756],
      dtype=float32), array([ 0.17331013, -0.01944017, -0.01719372, -0.02061192, -0.02515122,
        0.06111706, -0.01496974, -0.01890809, -0.02052883, -0.03284468,
        0.04615933, -0.02534069, -0.0257596 , -0.04156857, -0.02243052,
       -0.02576258, -0.0379358

In [68]:
integrated_embed = np.array(users_embed)
print(integrated_embed.shape)
print(embed.shape)

(1171, 50)
(632, 50)


In [69]:
# Per-restaurant mean of restaurant_user across users (axis=1), stored in a
# one-column frame named 'Mean_Rating'.
# NOTE(review): the values displayed for this cell are all far below the
# overall rating mean reported by describe() earlier, which suggests
# restaurant_user already had its NaNs replaced by 0 (see the fillna cell
# further down) when this ran - confirm the intended execution order.
df_restaurant=pd.DataFrame(restaurant_user.mean(axis=1))
df_restaurant.columns = ['Mean_Rating']
df_restaurant

Unnamed: 0_level_0,Mean_Rating
Restaurant,Unnamed: 1_level_1
10평파스타&화덕피자,0.558885
1인기사식당,0.727316
1인살로만강명수아구찜앤탕-마포본점,0.026014
1인용마라탕-서울수색점,0.082632
1인용묵은지김치찜-서울수색점,0.154680
...,...
회대장산지직송숙성활어전문점,0.206070
회뜨는총각-연남점,0.717973
후라이드참잘하는집-마포점,1.908573
희야네반찬,0.153787


### 먹었던 음식점들의 임베딩마다 유사도를 구한 뒤 평균하여 추천하는 방식

In [70]:
## user = 0~1170 (index into users_ate)
classes = 632
def recommend_n(user, n):
    """Recommend ``n`` restaurants from per-restaurant similarities.

    For each restaurant the user has rated (``users_ate[user]``), the cosine
    similarity to every row of ``embed`` is computed; the similarities are
    then averaged over the rated restaurants and the ``n`` highest-scoring
    restaurants are returned.

    Args:
        user: Integer index into ``users_ate``.
        n: Number of recommendations to return.

    Returns:
        ``(recommendations, recom_sim)``: restaurant names taken from
        ``restaurant_id`` and their averaged similarities, best first.

    Raises:
        ValueError: If the user has no rated restaurants (previously this
            surfaced as an obscure IndexError).
    """
    visited = users_ate[user]
    if len(visited) == 0:
        raise ValueError(f"user {user} has no rated restaurants")
    # One cdist call yields the (n_visited, n_restaurants) distance matrix;
    # 1 - distance is the cosine similarity, averaged per restaurant.
    similarities = (1 - cdist(visited, embed, metric='cosine')).mean(axis=0)
    sorted_arg = np.argsort(similarities)[::-1][:n]
    recommendations = [restaurant_id[i] for i in sorted_arg]
    recom_sim = [similarities[i] for i in sorted_arg]
    return recommendations, recom_sim
def restaurantInfo_n(rests,sims):
    """Return the ``df_restaurant`` rows for ``rests`` plus a ``similarity`` column.

    ``sims`` supplies the values of the new column, in the same order as
    ``rests``.
    """
    return df_restaurant.loc[rests].assign(similarity=sims)

In [71]:
## user: row index into integrated_embed
def recommend(user,n):
    """Recommend ``n`` restaurants from the user's averaged embedding.

    Computes the cosine similarity between ``integrated_embed[user]`` and
    every row of ``embed`` and returns the ``n`` most similar restaurants.

    Returns:
        ``(recommendations, recom_sim)``: restaurant names taken from
        ``restaurant_id`` and their similarities, best first.
    """
    query = integrated_embed[user].reshape((1,-1))
    # cdist returns a (1, n_restaurants) matrix; keep its single row.
    scores = (1 - cdist(query, embed, metric='cosine'))[0]
    ranked = np.argsort(scores)[::-1][:n]
    recommendations = [restaurant_id[idx] for idx in ranked]
    recom_sim = [scores[idx] for idx in ranked]
    return recommendations, recom_sim
def restaurantInfo(rests,sims):
    """Look up ``rests`` in ``df_restaurant`` and attach ``sims`` as ``similarity``."""
    info = df_restaurant.loc[rests].copy()
    info['similarity'] = sims
    return info

In [72]:
## user = [1,1170]
recom_restaurantId,recom_similarity = recommend_n(104,10)
restaurantInfo_n(recom_restaurantId,recom_similarity)

Unnamed: 0_level_0,Mean_Rating,similarity
Restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1
청진동해장국&감자탕-신촌점,0.999164,0.579126
정성이가득찬집밥-이대점,1.047826,0.571093
신미경홍대닭갈비3,0.85365,0.564003
밥도둑들,1.322749,0.55669
정가네석쇠불고기,0.638791,0.556178
치킨플러스-신수점,1.455463,0.555494
24시 진짜루,1.064909,0.547739
호세야오리바베큐-은평점,1.213574,0.542309
청년치킨-서교점,1.457313,0.541899
참치의명가&야식포차,0.779448,0.541436


In [73]:
## user = [1,1170]
recom_restaurantId,recom_similarity = recommend(0,10)
restaurantInfo(recom_restaurantId,recom_similarity)

Unnamed: 0_level_0,Mean_Rating,similarity
Restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1
피자파는집-홍대점,1.074402,0.875406
처갓집양념치킨-신촌점,1.47036,0.871965
김밥천국-망원점,1.335886,0.865848
한끼한상-홍대점,0.297628,0.861926
손오공마라탕-서교점,1.3662,0.856168
신미경홍대닭갈비3,0.85365,0.843614
굽네치킨&피자-북가좌1호점,1.704254,0.837528
처갓집양념치킨-서교점,1.10572,0.829281
7번가피자-서교망원점,1.080657,0.823512
청년치킨-서교점,1.457313,0.820213


In [38]:
restaurant_id[71]

'굽네치킨&피자-북가좌1호점'

In [39]:
df_utility[0].loc[df_utility[0].notna()]

RestaurantID
71     5.0
74     4.0
176    5.0
208    5.0
277    5.0
309    5.0
343    5.0
376    5.0
398    5.0
425    5.0
429    4.0
453    5.0
523    5.0
527    5.0
567    5.0
592    5.0
615    5.0
619    5.0
629    5.0
Name: 0, dtype: float64

In [40]:
[restaurant_id[id] for id in list(df_utility[0].loc[df_utility[0].notna()].index)]

['굽네치킨&피자-북가좌1호점',
 '굽네치킨&피자-연남점',
 '롯데리아-망원점',
 '매운국물떡볶이밀방떡-홍대입구점',
 '불타는형제들연탄불고기',
 '섬이자카야',
 '신미불닭발-서울본점',
 '야한곱창-홍대점',
 '옥이네김치찌개와직화삼겹-신촌점',
 '육회바른연어-홍대점',
 '이가네간장&양념게장',
 '정성이가득찬집밥-이대점',
 '키싸-디저트&눈꽃빙수',
 '타이반쩜',
 '피자파는집-홍대점',
 '행복왕갈비',
 '홍대미남보쌈족발-서교점',
 '홍리마라탕',
 '후라이드참잘하는집-마포점']

In [41]:
#결측치 0으로 대체
restaurant_user.fillna(0,inplace = True)
user_restaurant.fillna(0,inplace = True)
df_um = df_utility.fillna(0)

In [42]:
plt.style.use('ggplot')
%matplotlib notebook

In [43]:
# your code here

# Project the restaurant embeddings (`embed`) down to two dimensions with t-SNE.
model_tsne = TSNE(n_components=2)# ,init='random')
W_embedded = model_tsne.fit(embed)# fitted estimator; its embedding_ holds the 2-D coordinates of the restaurant embeddings

# Check that the data really was reduced to two dimensions:
# the result is expected to have shape (n_restaurants, 2).
print(W_embedded.embedding_.shape)

NameError: name 'embed' is not defined

In [44]:
df_um.rename(index={ID:restaurant for ID,restaurant in enumerate(restaurant_id)},inplace=True)

In [45]:
df_um.mean(axis=1)

RestaurantID
10평파스타&화덕피자           0.623794
1인기사식당                0.811787
1인살로만강명수아구찜앤탕-마포본점    0.029035
1인용마라탕-서울수색점          0.092229
1인용묵은지김치찜-서울수색점       0.172644
                        ...   
회대장산지직송숙성활어전문점        0.230003
회뜨는총각-연남점             0.797089
후라이드참잘하는집-마포점         2.116571
희야네반찬                 0.171648
힐링스시-본점               0.087105
Length: 632, dtype: float64

In [46]:
# your code here

# A2 is another name for the zero-filled utility matrix df_um (no conversion to a numpy array).
A2 = df_um

# allList holds A2's index values converted to a plain list.
allList = A2.index.values.tolist()

# movie_with_embedding holds one row per row of df_um: the row-wise mean rating
# plus the two t-SNE coordinates ('tsne1' = first column, 'tsne2' = second column of embedding_).
# NOTE(review): the "movie" in the name appears to be left over from a movie-recommendation exercise; the rows here come from the restaurant matrix.
movie_with_embedding = pd.DataFrame(df_um.mean(axis=1))
movie_with_embedding.columns = ['MeanRating']
movie_with_embedding['tsne1'] = W_embedded.embedding_[:,0]
movie_with_embedding['tsne2'] = W_embedded.embedding_[:,1]

NameError: name 'W_embedded' is not defined

In [47]:
movie_with_embedding

Unnamed: 0_level_0,MeanRating
RestaurantID,Unnamed: 1_level_1
10평파스타&화덕피자,0.623794
1인기사식당,0.811787
1인살로만강명수아구찜앤탕-마포본점,0.029035
1인용마라탕-서울수색점,0.092229
1인용묵은지김치찜-서울수색점,0.172644
...,...
회대장산지직송숙성활어전문점,0.230003
회뜨는총각-연남점,0.797089
후라이드참잘하는집-마포점,2.116571
희야네반찬,0.171648


In [48]:
movie_with_embedding.loc['힐링스시-본점'].name

'힐링스시-본점'

In [49]:
plt.rcParams['figure.figsize'] = [10, 10] # figure size; adjust to taste
plt.xlim(movie_with_embedding['tsne1'].min(), movie_with_embedding['tsne1'].max()) # limit the x axis to the data range
plt.ylim(movie_with_embedding['tsne2'].min(), movie_with_embedding['tsne2'].max()) # limit the y axis to the data range

# Requires the 'tsne1'/'tsne2' columns added to movie_with_embedding beforehand.

# Scatter plot of the two t-SNE coordinates, one point per row.
plt.scatter(movie_with_embedding['tsne1'],movie_with_embedding['tsne2'])

# Interactive labels via mplcursors: selecting a point sets the annotation
# text to the index label (.name) of the corresponding row.
# NOTE(review): the title below still says "movie embeddings" although the rows come from the restaurant matrix - consider updating it.
mplcursors.cursor(multiple = True).connect(
    "add", lambda sel: sel.annotation.set_text(
          movie_with_embedding.iloc[sel.target.index].name
))
plt.title('t-sne result (visualization of movie embeddings)')
plt.show()

KeyError: 'tsne1'

In [74]:
user_id[0]

'-_**님'