In [1]:
import json
import pandas as pd
import numpy as np

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from dataset import ProgramDataset
from architectures import NonLinearModel, CossineSimilarityBlock, MetaModel
from utils import RMSELoss, train, predict

In [2]:
# Ordered topic taxonomy. A topic's position in this list is the index it
# occupies in every persona/program vector built by the loading cells.
TOPICS = [
    'Books and Literature',
    'Environment',
    'Careers',
    'Healthy Living',
    'Shopping',
    'Automotive',
    'Sensitive Topics',
    'Politics',
    'Events & Attractions',
    'Maps & Navigation',
    'Personal Celebrations & Life Events',
    'Style & Fashion',
    'Sports',
    'Business and Finance',
    'War and Conflicts',
    'Hobbies & Interests',
    'Education',
    'Food & Drink',
    'Real Estate',
    'Crime, Law & Justice',
    'Communication',
    'Family and Relationships',
    'Disasters',
    'Home & Garden',
    'Video Gaming',
    'Pets',
    'Medical Health',
    'Entertainment',
    'Fine Art',
    'Religion & Spirituality',
    'Travel',
]

# Raw records keyed by persona name / program title (filled by the loading cells).
personas_info = {}
programs_info = {}

# Lists of {"name": ..., "vector": ...} records (filled by the loading cells).
personas_vectors = []
programs_vectors = []

In [3]:
# Load each persona's interest rate per topic.
#
# The shared containers are emptied first so that re-running this cell
# rebuilds them instead of appending every persona a second time (duplicates
# would inflate the user count and produce repeated index labels later on).
personas_vectors.clear()
personas_info.clear()

with open('../data/personas/personas.json', 'rb') as f:
    personas = json.load(f)

# The JSON is fully parsed at this point, so the file no longer needs to be open.
for p in personas:
    # Dense vector aligned with TOPICS; topics the persona does not list stay at 0.
    persona_vector = [0] * len(TOPICS)

    for topic, rate in p["interests"].items():
        # TOPICS.index raises ValueError if the file names a topic outside the taxonomy.
        persona_vector[TOPICS.index(topic)] = rate

    personas_vectors.append({"name": p["name"], "vector": persona_vector})

    # Keep the original topic -> rate mapping for lookups by persona name.
    personas_info[p["name"]] = p["interests"]

In [4]:
# Load each program's topic percentages.
#
# The shared containers are emptied first so that re-running this cell
# rebuilds them instead of appending every program a second time.
programs_vectors.clear()
programs_info.clear()

with open('../data/programs/programs_info.json', 'rb') as f:
    programs = json.load(f)

# The JSON is fully parsed at this point, so the file no longer needs to be open.
for p in programs:
    # Dense vector aligned with TOPICS; topics the program does not list stay at 0.
    program_vector = [0] * len(TOPICS)

    for topic in p["Topics"]:
        # TOPICS.index raises ValueError if the file names a topic outside the taxonomy.
        program_vector[TOPICS.index(topic["description"])] = topic["percentage"]

    programs_vectors.append({"name": p["Title"], "vector": program_vector})

    # Keep the full program record for lookups by title.
    programs_info[p["Title"]] = p

In [5]:
# Compute the cosine similarity between each persona (rows) and each program
# (columns).
#
# Every vector's norm is computed once up front instead of once per pair.
persona_norms = [np.linalg.norm(p["vector"]) for p in personas_vectors]
program_norms = [np.linalg.norm(pr["vector"]) for pr in programs_vectors]

similarity_matrix = []

for p, p_norm in zip(personas_vectors, persona_norms):
    row = {}

    for pr, pr_norm in zip(programs_vectors, program_norms):
        denominator = p_norm * pr_norm

        if denominator == 0:
            # An all-zero vector has no direction, so the cosine is undefined
            # (0/0 -> NaN). Score it as 0.0 so no NaN reaches the DataFrame.
            similarity = 0.0
        else:
            similarity = np.dot(p["vector"], pr["vector"]) / denominator # cosine similarity

        row[pr["name"]] = similarity

    similarity_matrix.append(row)

# One row per persona (indexed by name), one column per program title.
df = pd.DataFrame(similarity_matrix, index=[p["name"] for p in personas_vectors])

In [6]:
# constants for training
seed = 42 # fixed so that shuffling and any random weight initialisation repeat across runs
batch_size = 1 # equivalent to stochastic gradient descent
num_users = len(personas_vectors)
num_items = len(programs_vectors)
embedding_dim = len(TOPICS) # reduce to create a latent space
learning_rate = 1e-3
num_epochs = 10

# Seed PyTorch's global RNG before anything random is created. The DataLoader
# below shuffles using this RNG (no explicit generator is passed), and model
# construction may draw from it too.
torch.manual_seed(seed)

# load dataset with similarities
dataset = ProgramDataset(df)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cosine-similarity block built from the persona/program vectors, wrapped in
# the meta model as its only block.
cos_block = CossineSimilarityBlock(device, num_users, num_items, embedding_dim, personas_vectors, programs_vectors).to(device)
model = MetaModel(device, [cos_block]).to(device)

# No optimizer is created for cos_block here: the notebook trains it with
# has_grad=False. The meta model's optimizer is created next to its train() call.
criterion = nn.MSELoss()
metric = RMSELoss()

  return torch._C._cuda_getDeviceCount() > 0


In [7]:
# Run train() on the cosine-similarity block by itself, without an optimizer.
# has_grad=False presumably disables gradient updates -- confirm in utils.train.
cos_block_result = train(cos_block, loader, criterion, device, has_grad=False)

# Show whatever train() returned as the cell output.
cos_block_result

Training: 100%|██████████| 84/84 [00:00<00:00, 4345.20it/s]


0.0

In [8]:
# Rank every program for the persona named "David" using cos_block.
david_id = df.index.get_loc("David")

# Lazily build (program title, predicted similarity) pairs in column order,
# then order them from the highest to the lowest prediction.
predictions = sorted(
    (
        (program, predict(cos_block, david_id, df.columns.get_loc(program), device))
        for program in df.columns
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
predictions

[('Jornal da Tarde', 0.9264534115791321),
 ('Telejornal', 0.8481944799423218),
 ('Terra Europa T1 - Ep. 48', 0.8219571113586426),
 ('Bom Dia Portugal', 0.7570829391479492),
 ('Portugal em Direto', 0.2942794859409332),
 ('Joker T8 - Ep. 70', 0.059705331921577454),
 ('Outras Histórias T6 - Ep. 12', 0.009785204194486141),
 ('Praça da Alegria', 0.0048783915117383),
 ('Amor Sem Igual - Ep. 33', 0.004161442164331675),
 ('Cá Por Casa com Herman José T11 - Ep. 2', 0.0),
 ('O Preço Certo', 0.0),
 ('A Nossa Tarde', 0.0),
 ('Amor Sem Igual - Ep. 34', 0.0),
 ('Televendas', 0.0)]

In [9]:
# Train the meta model with Adam over all of its parameters.
meta_parameters = model.parameters()
optimizer = optim.Adam(meta_parameters, lr=learning_rate)

meta_model_result = train(model, loader, criterion, device, optimizer=optimizer, has_grad=True)

# Show whatever train() returned as the cell output.
meta_model_result

  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 84/84 [00:00<00:00, 639.20it/s]


0.045673059627727663

In [10]:
# Rank every program for David again, this time scored by the meta model.
predictions = []
for program in df.columns:
    item_id = df.columns.get_loc(program)
    predictions.append((program, predict(model, david_id, item_id, device)))

# Highest predicted similarity first.
predictions.sort(key=lambda pair: pair[1], reverse=True)
predictions

[('Jornal da Tarde', 0.5943422317504883),
 ('Telejornal', 0.5620748400688171),
 ('Terra Europa T1 - Ep. 48', 0.5510947108268738),
 ('Bom Dia Portugal', 0.5218514800071716),
 ('Portugal em Direto', 0.22486676275730133),
 ('Joker T8 - Ep. 70', 0.05861091613769531),
 ('Outras Histórias T6 - Ep. 12', 0.02604398876428604),
 ('Praça da Alegria', 0.02291850745677948),
 ('Amor Sem Igual - Ep. 33', 0.02246183156967163),
 ('Cá Por Casa com Herman José T11 - Ep. 2', 0.019862361252307892),
 ('O Preço Certo', 0.019862361252307892),
 ('A Nossa Tarde', 0.019862361252307892),
 ('Amor Sem Igual - Ep. 34', 0.019862361252307892),
 ('Televendas', 0.019862361252307892)]