In [7]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import mplcursors

from sklearn.decomposition import NMF
from sklearn.utils.extmath import randomized_svd
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


from itertools import permutations # For making pairs

# Use the 'ggplot' style sheet for every matplotlib figure in this notebook.
plt.style.use('ggplot')

# IPython magic: select matplotlib's interactive "notebook" backend.
%matplotlib notebook

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides usage/deprecation warnings emitted by pandas
# and torch — confirm a blanket 'ignore' is really intended.
warnings.filterwarnings('ignore')

In [8]:
# Load the raw review dump; df_reviews_raw is kept untouched so later cells
# can always go back to the unmodified data.
df_reviews_raw = pd.read_csv('beer_reviews.csv')

# drop() already returns a new frame, so an explicit .copy() of the raw data
# (a second full copy in memory) is unnecessary. The two name columns are
# discarded; the id columns stay.
df_reviews = df_reviews_raw.drop(columns=["brewery_name", "beer_name"])

df_reviews.head()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_abv,beer_beerid
0,10325,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,5.0,47986
1,10325,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,6.2,48213
2,10325,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,6.5,48215
3,10325,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,5.0,47969
4,1075,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,7.7,64883


In [9]:
# Re-encode every object-dtype column as a pandas categorical.
object_cols = df_reviews.select_dtypes(['object']).columns
df_reviews[object_cols] = df_reviews[object_cols].astype('category')

# Clean-up pipeline:
#   1. discard rows containing any missing value,
#   2. keep only the first row for each (user, beer) combination,
#   3. discard reviews whose overall score is below 1.
df_reviews = (
    df_reviews
    .dropna()
    .drop_duplicates(subset=['review_profilename', 'beer_beerid'], keep='first')
    .loc[lambda frame: frame['review_overall'] >= 1]
)

In [10]:
# Narrow the reviews to the (user, beer, overall score) triple and switch to
# the shorter column names used by the rest of the notebook.
column_names = {
    'review_profilename': 'userNm',
    'beer_beerid': 'beerId',
    'review_overall': 'rating',
}
df_ratings = df_reviews[list(column_names)].rename(columns=column_names)

# Keep each row's index label from df_reviews as an explicit column.
df_ratings['reviewIdx'] = df_ratings.index

df_ratings['rating'].describe()

count    1.504045e+06
mean     3.822294e+00
std      7.173639e-01
min      1.000000e+00
25%      3.500000e+00
50%      4.000000e+00
75%      4.500000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [11]:
# Number of reviews per user, one row per user.
# NOTE(review): userNm was converted to a categorical in an earlier cell, so
# depending on the pandas default for `observed` this groupby may also emit
# zero-count rows for users whose reviews were all filtered out — confirm.
df_users = df_ratings.groupby('userNm').size().reset_index(name="review_num")

# Integer surrogate key for each user: its row position in df_users.
df_users = df_users.assign(
    userIdx=df_users.index,
    review_num=pd.to_numeric(df_users['review_num']),
)[["userNm", "review_num", "userIdx"]]

# Attach review_num / userIdx to every rating row.
df_ratings_500 = df_ratings.merge(df_users, on='userNm', how='left')

In [12]:
# Number of reviews each beer received, one row per beerId.
df_beers = df_ratings.groupby('beerId').size().reset_index(name="br_review_num")
df_beers = df_beers.assign(br_review_num=pd.to_numeric(df_beers['br_review_num']))

# Attach the beer's review count to every rating row.
df_ratings_500 = df_ratings_500.merge(df_beers, on='beerId', how='left')

Unnamed: 0,userNm,beerId,rating,reviewIdx,review_num,userIdx,br_review_num
375907,tigerpaws,5,3.0,393636,106,31583,420
375685,EStreet20,5,4.5,393410,408,4198,420
375686,brianj555,5,4.0,393411,19,16171,420
375687,williamherbert,5,4.0,393412,624,32796,420
375688,theghost3,5,4.5,393413,376,31379,420
...,...,...,...,...,...,...,...
504274,Radome,77312,4.0,529418,193,10055,1
652032,Radome,77313,3.0,684958,193,10055,1
504273,Radome,77314,3.5,529417,193,10055,1
943589,thepeter,77315,3.5,992958,231,31443,1


In [13]:
# Activity thresholds, named once instead of being repeated as literals.
MIN_USER_REVIEWS = 2000   # "train" users have strictly more reviews than this
MIN_BEER_REVIEWS = 500    # "train" beers have strictly more reviews than this

# Test rows: low-activity users rating low-popularity beers.
df_cf_test = df_ratings_500[
    (df_ratings_500["review_num"] <= MIN_USER_REVIEWS)
    & (df_ratings_500["br_review_num"] <= MIN_BEER_REVIEWS)
]

# Train rows: high-activity users rating high-popularity beers.
# Rows that mix the two (active user / rare beer and vice versa) belong to
# neither frame, exactly as before.
df_cf_train = df_ratings_500[
    (df_ratings_500["review_num"] > MIN_USER_REVIEWS)
    & (df_ratings_500["br_review_num"] > MIN_BEER_REVIEWS)
]

# The same partition applied to the per-beer and per-user lookup tables.
df_cf_train_beers = df_beers[df_beers["br_review_num"] > MIN_BEER_REVIEWS]
df_cf_train_users = df_users[df_users["review_num"] > MIN_USER_REVIEWS]

df_cf_test_beers = df_beers[df_beers["br_review_num"] <= MIN_BEER_REVIEWS]
df_cf_test_users = df_users[df_users["review_num"] <= MIN_USER_REVIEWS]

# From here on df_ratings_500 means "train rows only" (same rows as
# df_cf_train, as an independent frame).
# NOTE: this rebinding makes the cell non-idempotent — re-running it without
# first re-running the merge cells that build df_ratings_500 leaves
# df_cf_test empty.
df_ratings_500 = df_cf_train.copy()

In [20]:
onehot_encoder = OneHotEncoder()

# Work on copies so df_cf_train_beers itself is never modified.
one_hot_beers = df_cf_train_beers.copy()
one_hot_sorted_beers = one_hot_beers.sort_index()

# Column 0 is beerId. reshape(-1, 1) produces the (n_samples, 1) layout the
# encoder expects and lets numpy infer the row count, so the cell keeps
# working when the data or the popularity threshold changes (a hard-coded
# row count raises as soon as the number of train beers differs).
one_hot_sorted_beerId = one_hot_sorted_beers.values[:, 0].reshape(-1, 1)
one_hot_beerId = one_hot_beers.values[:, 0].reshape(-1, 1)

# The encoder is fitted twice; after this cell it holds the fit on
# one_hot_beerId.
onehot_sorted_encoded = onehot_encoder.fit_transform(one_hot_sorted_beerId)
onehot_encoded = onehot_encoder.fit_transform(one_hot_beerId)

# Store each beer's dense 0/1 vector as a single object cell of the frame.
# NOTE: the vector width equals the number of train beers; the network
# defined later in this notebook must use the same width.
one_hot_sorted_beers["oneHot"] = tuple(onehot_sorted_encoded.toarray().astype(int))
one_hot_beers["oneHot"] = tuple(onehot_encoded.toarray().astype(int))

# Attach the one-hot vector to every train rating via beerId.
one_hot_sorted = one_hot_sorted_beers.drop(columns=["br_review_num"])
df_ratings_encoded = df_ratings_500.merge(one_hot_sorted, on='beerId', how='left')

df_encoded = df_ratings_encoded.drop(columns=["br_review_num", "reviewIdx", "review_num", "userNm"])

# Average only the numeric rating: the object-dtype oneHot column cannot be
# averaged, and recent pandas versions raise on it instead of silently
# dropping it.
df_ratings_user_group = df_encoded.groupby(['userIdx', 'beerId'])[['rating']].mean()

In [21]:
# Hold out 20% of the encoded ratings. A fixed random_state keeps the split,
# and everything derived from it, reproducible across kernel restarts.
nn_train_data, nn_test_data = train_test_split(df_encoded, test_size=0.2, random_state=42)

# Order both partitions by beerId.
nn_train_data = nn_train_data.sort_values(["beerId"])
nn_test_data = nn_test_data.sort_values(["beerId"])

In [22]:
def pair_oneHot(data):
    """Build every ordered pair of 'oneHot' entries that share a user.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'userIdx' column (grouping key) and a 'oneHot' column
        (one item per row).

    Returns
    -------
    list of tuple
        For each user, in order of first appearance in ``data``, all
        2-permutations of that user's 'oneHot' entries in row order. Users
        with fewer than two rows contribute nothing; an empty frame yields
        an empty list.

    Notes
    -----
    A user with k rows contributes k * (k - 1) pairs and the whole result is
    materialised in memory, so the list grows quadratically with per-user
    activity.
    """
    pair_list = []

    # A single groupby pass replaces re-scanning the whole frame with a
    # boolean mask once per user; sort=False keeps first-appearance order,
    # so the output is identical to the per-user filtering approach.
    for _, item_list in data.groupby('userIdx', sort=False)['oneHot']:
        pair_list.extend(permutations(item_list, 2))

    return pair_list

# Materialise the per-user item pairs for the training and the held-out
# partitions.
train_pair = pair_oneHot(nn_train_data)
test_pair = pair_oneHot(nn_test_data)

In [24]:
# Sanity check: number of pairs in the train and test pair lists.
len(train_pair), len(test_pair)

(653963638, 40969790)

In [27]:
# Mini-batch size shared by both loaders.
BATCH_SIZE = 1000

# train_pair is built user by user, so without shuffling consecutive
# mini-batches would all come from the same user; shuffle the training
# stream so each SGD step sees a mix of users.
train_loader = torch.utils.data.DataLoader(dataset=train_pair, batch_size=BATCH_SIZE, shuffle=True)

# Order is irrelevant for evaluation, so the test stream stays deterministic.
test_loader = torch.utils.data.DataLoader(dataset=test_pair, batch_size=BATCH_SIZE, shuffle=False)

In [28]:
class Model(nn.Module):
    """Two-layer fully connected network: item vector -> hidden -> item scores.

    Parameters
    ----------
    n_items : int, default 584
        Width of both the input and the output layer. It must equal the
        length of the one-hot vectors fed to the network.
    hidden_dim : int, default 50
        Width of the hidden layer.
    """

    def __init__(self, n_items=584, hidden_dim=50):
        super().__init__()
        self.fc1 = nn.Linear(n_items, hidden_dim)   # input vector -> hidden
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, n_items)   # hidden -> one score per item

    def forward(self, x):
        """Return raw (un-normalised) scores; the last dimension is n_items."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [29]:
def _as_float_tensor(batch):
    """Return ``batch`` as a float32 tensor.

    Tensors are cast directly. Anything else (for example a list of numpy
    arrays) is first stacked by numpy, so the first axis indexes the items
    of the sequence.
    """
    if isinstance(batch, torch.Tensor):
        return batch.to(torch.float32)
    return torch.as_tensor(np.asarray(batch), dtype=torch.float32)


def training_epoch(train_loader, network, loss_func, optimizer, epoch):
    """Run one optimisation pass over ``train_loader``.

    Parameters
    ----------
    train_loader : iterable with ``__len__`` and a ``dataset`` attribute
        Yields ``(input, target)`` batches.
    network : torch.nn.Module
        Model being optimised; called as ``network(input)``.
    loss_func : callable
        Called as ``loss_func(outputs, target)`` with both tensors float32.
    optimizer : torch.optim.Optimizer
        Optimiser already bound to ``network``'s parameters.
    epoch : int
        Epoch number, used only in the progress message.

    Returns
    -------
    list of float
        The loss of every batch, in iteration order.
    """
    train_losses = []
    log_interval = 500

    for batch_idx, (image, label) in enumerate(train_loader):
        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Cast explicitly rather than re-wrapping with torch.tensor(), which
        # copy-constructs tensors and is fragile on non-tensor sequences.
        image = _as_float_tensor(image)
        label = _as_float_tensor(label)

        outputs = network(image)

        loss = loss_func(outputs, label)
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                  .format(
                      epoch, batch_idx * len(label),
                      len(train_loader.dataset),
                      100. * batch_idx / len(train_loader),
                      loss.item()
                  )
            )

    return train_losses

In [30]:
def training(network, learning_rate, epochs=10, loader=None, weights_path='weights_path.pth'):
    """Train ``network`` with SGD and cross-entropy, then save its weights.

    Parameters
    ----------
    network : torch.nn.Module
        Model to optimise in place. It must expose a layer named ``fc1``.
    learning_rate : float
        Step size passed to ``optim.SGD``.
    epochs : int, default 10
        Number of full passes over the loader.
    loader : iterable of (input, target) batches, optional
        Defaults to the notebook-level ``train_loader``.
    weights_path : str, default 'weights_path.pth'
        File the final ``state_dict`` is written to.

    Returns
    -------
    w : torch.Tensor
        Independent copy of the trained ``fc1.weight``.
    train_losses_per_epoch : list
        Mean batch loss of every epoch.

    Notes
    -----
    NOTE(review): the targets reach ``nn.CrossEntropyLoss`` as float vectors
    (see training_epoch). Whether the installed PyTorch accepts that target
    format is not checked here — confirm.
    """
    if loader is None:
        # Fall back to the loader defined at notebook level.
        loader = train_loader

    cls_loss = nn.CrossEntropyLoss()
    optimizer = optim.SGD(network.parameters(), learning_rate, weight_decay=0.01)

    train_losses_per_epoch = []

    for epoch in range(epochs):
        network.train()

        train_losses = training_epoch(loader, network, cls_loss, optimizer, epoch)

        train_losses_per_epoch.append(np.mean(train_losses))

    # Persist the trained parameters for later reuse.
    torch.save(network.state_dict(), weights_path)

    # Take the first-layer weights straight from the model instead of reading
    # the file back; clone() keeps the returned tensor independent of the
    # live parameters, as a freshly loaded checkpoint would be.
    w = network.state_dict()['fc1.weight'].detach().clone()

    return w, train_losses_per_epoch

In [31]:
# Instantiate the network.
nn_model = Model()

# Train with a learning rate of 0.005. `weight` receives the fc1 weight
# tensor returned by training() and `train_losses_per_epoch` the mean loss of
# each epoch.
weight, train_losses_per_epoch = training(nn_model, 0.005)

TypeError: only size-1 arrays can be converted to Python scalars