In [29]:
import pandas as pd
import torch
from random import randint

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data

In [3]:
raw_users = pd.DataFrame([
  {"age": 33, "country": "USA", "name": "John"},
  {"age": 60, "country": "UK", "name": "Ben"},
  {"age": 22, "country": "Ukraine", "name": "Mike"},
  {"age": 27, "country": "USA", "name": "Adam"},
  {"age": 54, "country": "France", "name": "Macron"}
])

raw_items = pd.read_csv('movies.csv')


In [4]:
raw_items.sample(5)

Unnamed: 0,Movie,LeadStudio,RottenTomatoes,AudienceScore,Story,Genre,TheatersOpenWeek,OpeningWeekend,BOAvgOpenWeekend,DomesticGross,ForeignGross,WorldGross,Budget,Profitability,OpenProfit,Year
758,Silver Linings Playbook,Weinstein,92.0,86.0,,,16.0,0.44,27688.0,132.09,104.32,236.41,21.0,1125.77,2.1,2012
7,National Treasure: Book of Secrets,Disney,31.0,72.0,The Riddle,Thriller,3832.0,44.8,11686.0,219.96,237.4,457.36,130.0,351.82,34.46,2007
550,Gnomeo and Juliet,Disney,56.0,52.0,Love,Animation,2994.0,25.36,8469.0,99.97,94.0,193.97,36.0,538.8,70.44,2011
167,Tyler Perry's The Family That Preys,Independent,51.0,36.0,Discovery,Drama,2070.0,17.38,8397.0,37.11,,37.11,10.0,371.05,173.8,2008
548,Friends With Benefits,Warner Bros,71.0,68.0,Temptation,Romance,2926.0,18.62,6364.0,55.8,93.74,149.54,35.0,427.26,53.2,2011


## Preprocessing

In [5]:
# choose relevant fields
items = raw_items[["LeadStudio", "AudienceScore", "Genre", "Year"]].copy()
# items['metadata'] = json.dumps({})
items

Unnamed: 0,LeadStudio,AudienceScore,Genre,Year
0,Sony,54.0,Action,2007
1,Paramount,57.0,Animation,2007
2,Paramount,89.0,Action,2007
3,Disney,74.0,Action,2007
4,Warner Bros,82.0,Adventure,2007
...,...,...,...,...
965,IFC,,,2013
966,TriStar,66.0,,2013
967,Cinedigm Entertainment,,,2013
968,Magnolia,53.0,,2013


In [6]:
users = raw_users[["age", "country"]].copy()
# users['metadata'] = json.dumps({})
users

Unnamed: 0,age,country
0,33,USA
1,60,UK
2,22,Ukraine
3,27,USA
4,54,France


In [7]:
# fill na
avg_score = int(items['AudienceScore'].mean())
items['AudienceScore'].fillna(avg_score, inplace=True) 
items.dropna(inplace=True) # we can't infer Genre
items.sample(5)

Unnamed: 0,LeadStudio,AudienceScore,Genre,Year
384,Paramount,66.0,Drama,2010
174,Paramount,38.0,Comedy,2008
287,Warner Bros,52.0,Horror,2009
130,Fox,71.0,Comedy,2008
430,Warner Bros,65.0,Animation,2010


### encoding

In [8]:
# items
genre_dummies = pd.get_dummies(items['Genre']).add_prefix('Genre_')
studio_dummies = pd.get_dummies(items['LeadStudio']).add_prefix('LeadStudio_')
# metadata = items['metadata']
items = pd.concat([items[[ 'AudienceScore', 'Year']], genre_dummies], axis=1)
items = pd.concat([items[[ 'AudienceScore', 'Year']], studio_dummies], axis=1)
# items = pd.concat([items, metadata], axis=1)
items.sample(5)

Unnamed: 0,AudienceScore,Year,LeadStudio_ARC Entertainment,LeadStudio_Aardman Animations,LeadStudio_Atlas Distribution,LeadStudio_Buena Vista,LeadStudio_CBS,LeadStudio_Cohen Media,LeadStudio_Columbia,LeadStudio_Crest,...,LeadStudio_Spyglass Entertainment,LeadStudio_Summit,LeadStudio_UTV,LeadStudio_Universal,LeadStudio_Vertigo,LeadStudio_Village Roadshow,LeadStudio_Virgin,LeadStudio_Warner Bros,LeadStudio_Weinstein,LeadStudio_Yash Raj
159,36.0,2008,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
161,44.0,2008,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,85.0,2007,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
295,47.0,2009,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
523,38.0,2011,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
country_dummies = pd.get_dummies(users['country']).add_prefix('country_')
# metadata=users['metadata']
users = pd.concat([users[['age']], country_dummies], axis=1)
# users = pd.concat([users, metadata], axis=1)
users

Unnamed: 0,age,country_France,country_UK,country_USA,country_Ukraine
0,33,0,0,1,0
1,60,0,1,0,0
2,22,0,0,0,1
3,27,0,0,1,0
4,54,1,0,0,0


### normalization/scaling

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
scaler = StandardScaler()
items['AudienceScore'] = scaler.fit(items[['AudienceScore']]).transform(items[['AudienceScore']])
items['Year'] = scaler.fit(items[['Year']]).transform(items[['Year']])
items.sample(5)

Unnamed: 0,AudienceScore,Year,LeadStudio_ARC Entertainment,LeadStudio_Aardman Animations,LeadStudio_Atlas Distribution,LeadStudio_Buena Vista,LeadStudio_CBS,LeadStudio_Cohen Media,LeadStudio_Columbia,LeadStudio_Crest,...,LeadStudio_Spyglass Entertainment,LeadStudio_Summit,LeadStudio_UTV,LeadStudio_Universal,LeadStudio_Vertigo,LeadStudio_Village Roadshow,LeadStudio_Virgin,LeadStudio_Warner Bros,LeadStudio_Weinstein,LeadStudio_Yash Raj
325,-0.7699,-0.235223,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
369,-1.074312,-0.235223,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
134,0.386867,-0.896364,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
546,-0.28284,1.087059,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,1.421868,-1.557505,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
scaler = StandardScaler()
users['age'] = scaler.fit(users[['age']]).transform(users[['age']])
users.sample(5)

Unnamed: 0,age,country_France,country_UK,country_USA,country_Ukraine
3,-0.809814,0,0,1,0
4,0.982397,1,0,0,0
0,-0.411545,0,0,1,0
1,1.380666,0,1,0,0
2,-1.141705,0,0,0,1


In [64]:
n_ratings = 100
ratings = pd.DataFrame([ {
  "user": randint(0, len(users) - 1), 
  "item": randint(0, len(items) - 1), 
  "rating": randint(1, 5)
  } for x in range(n_ratings) ])

In [65]:
ratings.sample(5)

Unnamed: 0,user,item,rating
9,0,147,2
75,3,498,1
43,4,360,3
90,4,667,5
31,0,162,2


In [66]:
users_df = users
items_df = items
ratings_df = ratings

In [67]:
from sklearn.preprocessing import LabelEncoder
# https://www.youtube.com/watch?v=Wj-nkk7dFS8
lbl_item = LabelEncoder()
lbl_user = LabelEncoder()
print(ratings_df['item'].values)
print(ratings_df['user'].values)
print()
ratings_df['item'] = lbl_item.fit_transform(ratings_df['item'].values)
ratings_df['user'] = lbl_user.fit_transform(ratings_df['user'].values)

print(ratings_df['item'].values)
print(ratings_df['user'].values)

[163 130  63 136  30 472  91 236 321 147 650   6 279 556 262 673 349 680
 192 643  34 339 124  18 328 175 175 368  42 274 422 162  78 548 132  77
 183 352 392 357 675 380 172 360 567 570 556 618 177 255 471 131 416 563
 298 494  63 647 440  99 301 125  26 197 628 481 637 672 286 606  57 315
 618 639 552 498 472 589 598 315  46  67 398 164 208 281 114  65 458 630
 667 419 497 513 306 218 619 178 313 577]
[4 4 1 1 0 3 2 4 0 0 0 2 0 2 1 2 0 0 4 1 3 3 2 2 0 2 1 1 2 1 1 0 2 0 4 0 0
 2 1 0 4 2 3 4 4 1 0 0 3 2 0 4 4 2 1 2 3 1 2 4 2 4 4 0 0 0 3 3 1 3 2 4 0 1
 3 3 1 3 0 3 2 0 0 4 3 0 0 4 1 0 4 0 1 3 0 2 0 1 0 1]

[24 18  8 21  3 64 13 35 47 22 88  0 39 72 37 91 50 93 31 86  4 49 16  1
 48 27 27 54  5 38 60 23 12 70 20 11 30 51 56 52 92 55 26 53 74 75 72 80
 28 36 63 19 58 73 42 66  8 87 61 14 43 17  2 32 82 65 84 90 41 79  7 46
 80 85 71 68 64 77 78 46  6 10 57 25 33 40 15  9 62 83 89 59 67 69 44 34
 81 29 45 76]
[4 4 1 1 0 3 2 4 0 0 0 2 0 2 1 2 0 0 4 1 3 3 2 2 0 2 1 1 2 1 1 0 2 0 4 0 0
 2 1 0 

## EDA

In [16]:
# Budget,	Profitability correlation

In [17]:
DEBUG = True

## Model

In [30]:
class TwoTowerRecSysModel(torch.nn.Module):
  def __init__(self, users_n, items_n) -> None:
    super().__init__()

    items_embed_dim = 32
    users_embed_dim = 32
    self.user_embed = torch.nn.Embedding(users_n, items_embed_dim)
    self.item_embed = torch.nn.Embedding(items_n, users_embed_dim)
    print("user embed: ", self.user_embed)
    print("item embed: ", self.item_embed)

    self.out = torch.nn.Linear(items_embed_dim + users_embed_dim, 1)

  def forward(self, users_batch, items_batch):
    user_embeds = self.user_embed(users_batch)
    item_embeds = self.item_embed(items_batch)
    output = self.out(torch.cat([user_embeds, item_embeds], dim=1))
    return output


## Dataset

In [74]:
class RecommendationDataset(torch.utils.data.Dataset):
  users: pd.DataFrame
  items: pd.DataFrame
  targets: pd.DataFrame

  def __init__(self, users, items, targets):
    self.users = users
    self.items = items
    self.targets = targets

  def __len__(self):
    return len(self.targets)

  def __getitem__(self, idx):
    users = self.users[idx]
    items = self.items[idx]
    targets = self.targets[idx]
    return {
      "users": torch.tensor(users, dtype=torch.long),
      "items": torch.tensor(items, dtype=torch.long),
      "targets": torch.tensor(targets, dtype=torch.float)
    }


In [75]:
from sklearn import model_selection

train_df, test_df = model_selection.train_test_split(ratings_df, test_size=0.2, stratify=ratings['rating'].values)

train_dataset = RecommendationDataset(
  users=torch.tensor(train_df['user'].values, dtype=torch.long), 
  items=torch.tensor(train_df['item'].values, dtype=torch.long),
  targets=torch.tensor(train_df['rating'].values, dtype=torch.long)
)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)


## Training

In [100]:
model = TwoTowerRecSysModel(users_n=len(lbl_user.classes_), items_n=len(lbl_item.classes_)).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3)
loss_func = torch.nn.MSELoss()

epochs_num = 100

user embed:  Embedding(5, 32)
item embed:  Embedding(94, 32)


In [101]:
for epoch in range(epochs_num):
  for batch in train_dataloader:
    optimizer.zero_grad()
    output = model(users_batch=batch['users'], items_batch=batch['items'])
    loss = loss_func(output, batch['targets'])
    loss.backward()
    optimizer.step()
  print(f"epoch: {epoch}  loss: {loss}")

  "users": torch.tensor(users, dtype=torch.long),
  "items": torch.tensor(items, dtype=torch.long),
  "targets": torch.tensor(targets, dtype=torch.float)


epoch: 0  loss: 9.892841339111328
epoch: 1  loss: 6.254524230957031
epoch: 2  loss: 9.420291900634766
epoch: 3  loss: 5.532872676849365
epoch: 4  loss: 1.4645683765411377
epoch: 5  loss: 4.663896083831787
epoch: 6  loss: 1.6198128461837769
epoch: 7  loss: 3.2459969520568848
epoch: 8  loss: 3.7395009994506836
epoch: 9  loss: 2.428483009338379
epoch: 10  loss: 7.076725006103516
epoch: 11  loss: 8.186853408813477
epoch: 12  loss: 0.9460499286651611
epoch: 13  loss: 1.326372504234314
epoch: 14  loss: 3.3269948959350586
epoch: 15  loss: 1.962905764579773
epoch: 16  loss: 1.2906997203826904
epoch: 17  loss: 1.033776879310608
epoch: 18  loss: 0.9898613095283508
epoch: 19  loss: 2.908263683319092
epoch: 20  loss: 2.535918951034546
epoch: 21  loss: 1.5349359512329102
epoch: 22  loss: 0.8333605527877808
epoch: 23  loss: 2.206411123275757
epoch: 24  loss: 3.330582618713379
epoch: 25  loss: 2.0853724479675293
epoch: 26  loss: 1.8528900146484375
epoch: 27  loss: 2.6134185791015625
epoch: 28  loss: 

## Manual forward pass

In [107]:
data_iter = iter(train_dataloader)
loader_data = next(data_iter)
loader_data

print(loader_data['users'])
print(loader_data['users'].size())
print()
print(loader_data['items'])
print(loader_data['items'].size())

print(len(train_df['user'].values), len(train_df['item'].values))

embed_dim = 32
user_embed = torch.nn.Embedding(len(lbl_user.classes_), embed_dim)
item_embed = torch.nn.Embedding(len(lbl_item.classes_), embed_dim)

out = torch.nn.Linear(embed_dim + embed_dim, 1)

user_embeds = user_embed(loader_data[ 'users'])
item_embeds = item_embed(loader_data[ 'items'])

print (f"user_embeds {user_embeds.size()}")
print(f"user_embeds {user_embeds}")
print()
print (f"item_embeds {item_embeds.size()}")
print(f"item_embeds {item_embeds}")


tensor([2, 3, 2, 1])
torch.Size([4])

tensor([13, 33, 55, 86])
torch.Size([4])
80 80
user_embeds torch.Size([4, 32])
user_embeds tensor([[ 8.9080e-01,  1.2105e+00,  2.3975e+00, -1.5065e+00, -2.2937e-02,
         -4.6114e-01, -1.1790e-01, -6.7916e-01,  9.1804e-01,  2.1451e-02,
          1.2172e-01, -2.3557e+00, -1.5572e+00,  2.5960e-01,  1.2634e+00,
         -1.4336e+00,  5.6231e-02,  2.2444e-01, -8.0931e-01, -6.9277e-02,
         -1.7217e+00,  1.3526e+00,  1.0294e+00,  1.0750e+00, -5.3172e-01,
         -5.2157e-01,  5.6282e-01, -6.2714e-01,  2.7370e+00,  5.5667e-01,
          2.0436e+00,  8.6552e-01],
        [-1.3301e+00,  6.4462e-01, -5.7701e-01, -5.4537e-02,  1.7986e+00,
         -1.0555e+00, -7.5855e-01,  2.2082e+00, -1.6567e+00, -6.0480e-01,
         -5.0195e-01, -1.4076e+00,  1.7699e+00,  1.0843e+00,  2.3624e+00,
          1.0786e+00, -2.1642e-01,  9.4298e-02,  1.0511e+00,  7.4449e-01,
         -1.4502e-01, -3.5270e-01, -1.2690e+00,  4.2999e-01, -1.0792e+00,
         -1.5340e-01,

  "users": torch.tensor(users, dtype=torch.long),
  "items": torch.tensor(items, dtype=torch.long),
  "targets": torch.tensor(targets, dtype=torch.float)


## Evaluation

## Prediction

In [122]:
print("Labels")
print(lbl_item.classes_)
print(lbl_user.classes_)


print()
user = 2
item = 498
scores = model(users_batch=torch.tensor(lbl_user.transform([user])), items_batch=torch.tensor(lbl_item.transform([item])))
print(f"Prediction: user {user} item {item} score: {scores.detach().numpy()[0][0]}")

Labels
[  6  18  26  30  34  42  46  57  63  65  67  77  78  91  99 114 124 125
 130 131 132 136 147 162 163 164 172 175 177 178 183 192 197 208 218 236
 255 262 274 279 281 286 298 301 306 313 315 321 328 339 349 352 357 360
 368 380 392 398 416 419 422 440 458 471 472 481 494 497 498 513 548 552
 556 563 567 570 577 589 598 606 618 619 628 630 637 639 643 647 650 667
 672 673 675 680]
[0 1 2 3 4]

Prediction: user 2 item 498 score: 3.199071168899536
