In [1]:
import json
import pandas as pd
import numpy as np

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from dataset import ProgramDataset
from architectures import NonLinearModel, CossineSimilarityBlock, MetaModel
from utils import RMSELoss, train, predict

In [2]:
TOPICS = ['Entertainment', 'Pop Culture', 'Personal Finance', 'Business and Finance', 'Education', 'Sensitive Topics', 'Video Gaming', 'Shopping', 'Pets', 'Automotive', 'Personal Celebrations & Life Events', 'Environment', 'War and Conflicts', 'Style & Fashion', 'Events & Attractions', 'Communication', 'Religion & Spirituality', 'Politics', 'Movies & Music', 'Food & Drink', 'Healthy Living', 'Real Estate', 'Home & Garden', 'Hobbies & Interests', 'Family and Relationships', 'Productivity', 'Careers', 'Books and Literature', 'Maps & Navigation', 'Travel', 'Crime, Law & Justice', 'Sports', 'Disasters', 'Fine Art', 'Medical Health']
personas_vectors = []
programs_vectors = []

personas_info = {}
programs_info = {}

In [3]:
# load personas interest rates by topic
with open('../data/personas/personas.json', 'rb') as f:
    personas = json.load(f)

    for p in personas:
        persona_vector = [0] * len(TOPICS)
        
        for topic in p["interests"]:
            persona_vector[TOPICS.index(topic)] = p["interests"][topic]
        
        personas_vectors.append({"name": p["name"], "vector": persona_vector})

        personas_info[p["name"]] = p["interests"]

In [4]:
# load programs interest rates by topic
with open('../data/programs/programs_info.json', 'rb') as f:
    programs = json.load(f)

    for p in programs:
        program_vector = [0] * len(TOPICS)
        
        for topic in p["Topics"]:
            program_vector[TOPICS.index(topic["description"])] = topic["percentage"]

        programs_vectors.append({"name": p["Title"], "vector": program_vector})

        programs_info[p["Title"]] = p

In [5]:
# compute the similarity between each persona and each program
similarity_matrix = []

for p in personas_vectors:
    row = {}

    for pr in programs_vectors:
        similarity = np.dot(p["vector"], pr["vector"]) / (np.linalg.norm(p["vector"]) * np.linalg.norm(pr["vector"])) # cosine similarity

        row[pr["name"]] = similarity

    similarity_matrix.append(row)

df = pd.DataFrame(similarity_matrix, index=[p["name"] for p in personas_vectors])

In [6]:
# constants for training
batch_size = 1 # equivalent to stochastic gradient descent
num_users = len(personas_vectors)
num_items = len(programs_vectors)
embedding_dim = len(TOPICS) # reduce to create a latent space
learning_rate = 1e-3
num_epochs = 10

# load dataset with similarities
dataset = ProgramDataset(df)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cos_block = CossineSimilarityBlock(device, num_users, num_items, embedding_dim, personas_vectors, programs_vectors).to(device)
model = MetaModel(device, [cos_block]).to(device)

# optimizer = optim.Adam(cos_block.parameters(), lr=learning_rate)
criterion = nn.MSELoss()
metric = RMSELoss()

In [7]:
# train cos_block
train(cos_block, loader, criterion, device, has_grad=False)

Training: 100%|██████████| 282/282 [00:00<00:00, 10748.96it/s]


0.0

In [11]:
# predict for "David"
user_id = df.index.get_loc("Liam")

# (item_id, predicted_similarity)
predictions = [(program, predict(cos_block, user_id, df.columns.get_loc(program), device)) for program in df.columns]
predictions.sort(key=lambda x: x[1], reverse=True)

programs_obj = [programs_info[program] for program, _ in predictions]

data_obj = {
    "preferences": personas_info["Liam"],
    "programs": programs_obj
}

with open('../data/liam_example.json', 'w') as f:
    json.dump(data_obj, f, indent=4)

predictions

[('Joker T8 - Ep. 70', 0.45547938346862793),
 ('A Nossa Tarde', 0.3947761654853821),
 ('Eucaristia Dominical', 0.39351749420166016),
 ('Hora Da Sorte - Lotaria Popular - Ep. 40', 0.3868962824344635),
 ('Cá Por Casa com Herman José T11 - Ep. 2', 0.3803417682647705),
 ('Joker T8 - Ep. 71', 0.37670841813087463),
 ('Dillaz: Festival F 2024', 0.37382060289382935),
 ('The Voice Portugal - Os Apurados T12 - Ep. 1', 0.36253830790519714),
 ('O Preço Certo', 0.36206498742103577),
 ('Amor Sem Igual - Ep. 32', 0.3578277826309204),
 ('Delfins - 40 Anos', 0.3540067672729492),
 ('Nunca Nada Aconteceu', 0.346403032541275),
 ('Hora Da Sorte - Lotaria Clássica - Ep. 40', 0.34089580178260803),
 ('Terra Europa T1 - Ep. 48', 0.3274747133255005),
 ('Outras Histórias T6 - Ep. 12', 0.31296253204345703),
 ('Alguém Tem De O Fazer T1 - Ep. 12', 0.25431835651397705),
 ('É Ou Não É? - O Grande Debate', 0.24792912602424622),
 ('Praça da Alegria', 0.23324181139469147),
 ('Aqui Portugal: Pombal (Tarde)', 0.2255329042

In [9]:
# train meta model
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
train(model, loader, criterion, device, optimizer=optimizer, has_grad=True)

  from .autonotebook import tqdm as notebook_tqdm
  return F.mse_loss(input, target, reduction=self.reduction)
Training: 100%|██████████| 282/282 [00:00<00:00, 1718.39it/s]


0.03412113970567915

In [10]:
predictions = [(program, predict(model, david_id, df.columns.get_loc(program), device)) for program in df.columns]
predictions.sort(key=lambda x: x[1], reverse=True)
predictions

[('Telejornal', 0.24969489872455597),
 ('Mesa Portuguesa... Com Estrelas Com Certeza - Ep. 10', 0.24618609249591827),
 ('Eu Ainda Acredito', 0.24028588831424713),
 ('Bom Dia Portugal', 0.23890580236911774),
 ('Amor Sem Igual - Ep. 33', 0.22985415160655975),
 ('Aqui Portugal: Pombal (Manhã)', 0.22512836754322052),
 ('Alguém Tem De O Fazer T1 - Ep. 11', 0.22438804805278778),
 ('Grandiosa Enciclopédia Do Ludopédio T10 - Ep. 4', 0.21192528307437897),
 ('Jornal da Tarde', 0.206189826130867),
 ('O Conto Do Nadador', 0.15764163434505463),
 ('Bom Dia Portugal Fim de Semana', 0.1496964544057846),
 ('The Voice Portugal T12 - Ep. 2', 0.14456506073474884),
 ('Nunca Nada Aconteceu', 0.1305476725101471),
 ('Portugal em Direto', 0.08079494535923004),
 ('Joker T8 - Ep. 70', 0.08010594546794891),
 ('Joker T8 - Ep. 68', 0.07616651058197021),
 ('Amor Sem Igual - Ep. 34', 0.07053452730178833),
 ('Eucaristia Dominical', 0.07053224742412567),
 ('Outras Histórias T6 - Ep. 12', 0.07040758430957794),
 ('Amor S