## Importing libraries

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.preprocessing import minmax_scale

# Reading datas and preprocessing

Reading Datasets using Pandas and displaying its head, tail and shapes
<br>
Then inserting a column call `List Index` in Movies dataset using datas indices

In [5]:
movies_df = pd.read_csv('./Dataset/movies.csv')
ratings_df = pd.read_csv('./Dataset/ratings.csv')

In [6]:
def df_disp(df : pd.DataFrame , name : str , num = 5) -> None:
    print(f'\n{name} Data Frame :')
    print('Head :')
    display(df.head(num))
    print('Tail :')
    display(df.tail(num))
    print(f'Shape : {df.shape}')

In [7]:
df_disp(movies_df , 'Movies')
df_disp(ratings_df , 'Ratings')


Movies Data Frame :
Head :


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Tail :


Unnamed: 0,movieId,title,genres
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


Shape : (9742, 3)

Ratings Data Frame :
Head :


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Tail :


Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


Shape : (100836, 4)


In [8]:
movies_df.insert(0 , 'List Index' , movies_df.index)

Merging 2 datasets based on `movieId`

In [9]:
df = pd.merge(movies_df , ratings_df , on='movieId')

In [10]:
df.head()

Unnamed: 0,List Index,movieId,title,genres,userId,rating,timestamp
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


Removing Extra columns

Store titles in a dictionary and map it to List Indices - then we can delete `title` column and map names anytime we want
<br>
"List Index" and "movieId" are the same with diffrent number so `movieId` can be removed too
<br>
We won't use `genres` and `timestamp` so we delete them too


In [12]:
idx_name = dict(zip(df['List Index'],df['title']))
df = df.drop(['movieId' , 'title' , 'genres' , 'timestamp'] , axis=1)

In [13]:
df

Unnamed: 0,List Index,userId,rating
0,0,1,4.0
1,0,5,4.0
2,0,7,4.5
3,0,15,2.5
4,0,17,4.5
...,...,...,...
100831,9737,184,4.0
100832,9738,184,3.5
100833,9739,184,3.5
100834,9740,184,3.5


Group datas by userId

In [14]:
grouped_df = df.groupby('userId')
Ids = df['userId'].unique()
display(grouped_df.get_group(Ids[15]))

Unnamed: 0,List Index,userId,rating
15,0,45,4.0
387,4,45,3.0
442,5,45,4.0
543,6,45,3.0
752,10,45,3.0
...,...,...,...
85097,6497,45,5.0
85287,6520,45,4.5
85302,6521,45,4.5
85467,6530,45,5.0


Make a X `matrix (n×m)` whith n users and m ratings - put missing data as 0
<br>
Normalize X and save it as a train X
<br>
Then make a torch custom dataset for `torch DataLoader`

In [15]:
idx = df['List Index'].unique()
m = idx.max() + 1
n = len(grouped_df)

In [16]:
X = np.zeros((n,m))
for i , d in enumerate(grouped_df):
    dd = d[1]
    idx_dd = dd['List Index'].to_numpy()
    rtg_dd = dd['rating'].to_numpy()
    X[i][idx_dd] = rtg_dd

In [17]:
train_X = minmax_scale(X , (0,1))

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class CustomDataset(Dataset):
    def __init__(self , data):
        self.data = torch.tensor(data , dtype=torch.float32 , device=device)

    def __len__(self):
        return(len(self.data))

    def __getitem__(self,idx):
        data = self.data[idx]
        return data

In [19]:
dataset = CustomDataset(train_X)

In [20]:
dataset.data

tensor([[0.8000, 0.0000, 0.8000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.5000, 0.4000, 0.4000,  ..., 0.0000, 0.0000, 0.0000],
        [0.6000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')

Now to make the model and train it
<br>
<br>
<img src='./RBM.png'>
<br>
As shown in image we have only 1 visible and 1 hidden layer
<br>
So we must give data to model and give it back - then we can use SGD to minimize error
<br>

In [21]:
class RBM(nn.Module):
    def __init__(self , n_visible = 100 , n_hidden = 20):
        super(RBM , self).__init__()
        self.W = nn.Parameter(torch.randn(n_hidden , n_visible).to(device)*1e-2)
        self.v_bias = nn.Parameter(torch.zeros(n_visible).to(device))
        self.h_bias = nn.Parameter(torch.zeros(n_hidden).to(device))
    
    def v_to_h(self , v : torch.Tensor):
        return torch.sigmoid( F.linear(v , self.W , self.h_bias))
    
    def h_to_v(self , h : torch.Tensor):
        return torch.sigmoid( F.linear(h , self.W.T , self.v_bias))

    def forward(self , v):
        h_ = self.v_to_h(v)
        v_ = self.h_to_v(h_)
        return v , v_


In [22]:
rbm = RBM(m , 20)

In [23]:
loss_function = torch.nn.MSELoss()
optimizer = torch.optim.SGD(rbm.parameters() , 0.1)

Load 61 datas to train mini batch base

In [25]:
dataloader = DataLoader(dataset , 61 , shuffle=True)

Train the model for 20 epoch

In [26]:
loss_arr = np.array([])
vloss_arr = np.array([])
for epoch in range(20):
    running_loss = 0.0
    for i , data in enumerate(dataloader):
        xdata = data
        optimizer.zero_grad()
        v , v_ = rbm(xdata)
        loss = loss_function(v_ , v)
        loss.backward()
        optimizer.step()        
        running_loss = loss.item() + running_loss
    print(f'epoch={epoch} , running loss={running_loss : .4f}')
    loss_arr = np.append(loss_arr , running_loss/i)

epoch=0 , running loss= 2.4748
epoch=1 , running loss= 2.4745
epoch=2 , running loss= 2.4741
epoch=3 , running loss= 2.4737
epoch=4 , running loss= 2.4734
epoch=5 , running loss= 2.4730
epoch=6 , running loss= 2.4726
epoch=7 , running loss= 2.4723
epoch=8 , running loss= 2.4719
epoch=9 , running loss= 2.4716
epoch=10 , running loss= 2.4712
epoch=11 , running loss= 2.4708
epoch=12 , running loss= 2.4705
epoch=13 , running loss= 2.4701
epoch=14 , running loss= 2.4698
epoch=15 , running loss= 2.4694
epoch=16 , running loss= 2.4690
epoch=17 , running loss= 2.4687
epoch=18 , running loss= 2.4683
epoch=19 , running loss= 2.4680


Allow model to suggest 15 movies for a user

In [27]:
v , v_ = rbm(dataset.data[75])

In [28]:
def show_15_max(t : torch.Tensor):
    vv = t.clone()
    for i in range(15):
        arg = torch.argmax(vv).item()
        vv[arg] = vv[arg] - 10
        print(idx_name[arg])

In [29]:
print('15 max rated by user:')
show_15_max(v)


15 max rated by user:
Godfather, The (1972)
Monty Python and the Holy Grail (1975)
Goodfellas (1990)
Saw (2004)
Serenity (2005)
Saw II (2005)
Saw IV (2007)
Saw V (2008)
Saw VI (2009)
Saw VII 3D - The Final Chapter (2010)
Pulp Fiction (1994)
Trainspotting (1996)
Monty Python's Life of Brian (1979)
Reservoir Dogs (1992)
Clockwork Orange, A (1971)


In [30]:
print('15 suggested movies by Model:')
show_15_max(v_)

15 suggested movies by Model:
From Hell (2001)
Candidate, The (1972)
After the Fox (Caccia alla volpe) (1966)
Maleficent (2014)
Sliding Doors (1998)
Sugarland Express, The (1974)
Cloudy with a Chance of Meatballs (2009)
Comedy Central Roast of James Franco (2013)
Wolf Children (Okami kodomo no ame to yuki) (2012)
Triangle (2009)
Charlie's Angels (2000)
Return of the Musketeers, The (1989)
Georgy Girl (1966)
Metroland (1997)
Sky Captain and the World of Tomorrow (2004)
