# Note:
    Check how model will interact via API

# import libs

In [1]:
import os, sys
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from torch.utils.data import DataLoader, Dataset
import torch

from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
from sklearn.metrics import accuracy_score

# Default data

In [2]:
df_users = pd.read_parquet('df_users_embedings.parquet')
df_users.rename(columns={"id":"user_id"}, inplace=True)
df_posts = pd.read_parquet('df_posts_new_features_and_post.parquet')

# Default setting

In [3]:
columns = ['post_id',
 "text",
 "topic",
 'topic_covid',
 'topic_entertainment',
 'topic_movie',
 'topic_politics',
 'topic_sport',
 'topic_tech',
 'kmean_label',
 'DistanceToCluster_0',
 'DistanceToCluster_1',
 'DistanceToCluster_2',
 'DistanceToCluster_3',
 'DistanceToCluster_4',
 'DistanceToCluster_5',
 'DistanceToCluster_6',
 'DistanceToCluster_7',
 'DistanceToCluster_8',
 'DistanceToCluster_9',
 'DistanceToCluster_10',
 'DistanceToCluster_11',
 'DistanceToCluster_12',
 'DistanceToCluster_13',
 'DistanceToCluster_14',
 'gender',
 'age',
 'Belarus',
 'Cyprus',
 'Estonia',
 'Finland',
 'Kazakhstan',
 'Latvia',
 'Russia',
 'Switzerland',
 'Turkey',
 'Ukraine',
 'exp_1',
 'exp_2',
 'exp_3',
 'exp_4',
 'Android',
 'iOS',
 'ads',
 'organic']

In [4]:
limit = 5

In [5]:
model = torch.jit.load("CustomResidualNN_2024_CPU.pth")

# API recommendaitons

#### Define user_id

In [6]:
id_user = 4001

#### Create cartesian project between user and all posts

In [7]:
user_info_row = df_users[df_users.user_id == id_user]

In [8]:
temp_df = pd.merge(user_info_row, df_posts, how='cross')[columns]

In [9]:
temp_df.head(2)

Unnamed: 0,post_id,text,topic,topic_covid,topic_entertainment,topic_movie,topic_politics,topic_sport,topic_tech,kmean_label,...,Turkey,Ukraine,exp_1,exp_2,exp_3,exp_4,Android,iOS,ads,organic
0,1,UK economy facing major risks\n\nThe UK manufa...,business,False,False,False,False,False,False,2,...,False,False,False,False,True,False,True,False,True,False
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,False,False,False,False,False,False,2,...,False,False,False,False,True,False,True,False,True,False


#### Convert temp df to a pytorch class

In [10]:
class CustomImageDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        data = self.df.loc[idx].iloc[3:]
        return torch.Tensor(data).unsqueeze(0)

In [11]:
loader = DataLoader(CustomImageDataset(df=temp_df.copy()), batch_size=64, pin_memory=True, num_workers=4, shuffle=False)

#### Load Model && get prediciton probs

In [12]:
@torch.inference_mode()
def model_inference(data) -> list[float]:
    model.eval()
    return list(model(data).view(data.shape[0]).detach().numpy())

In [13]:
def get_probs(loader) -> list[float]:
    data = []
    for x in loader:
        data.extend(model_inference(x))
    return data

In [14]:
probs = get_probs(loader)

  return torch.Tensor(data).unsqueeze(0)
  return torch.Tensor(data).unsqueeze(0)
  return torch.Tensor(data).unsqueeze(0)
  return torch.Tensor(data).unsqueeze(0)


#### Create Recomendations based on probs

In [15]:
temp_df["probs"] = probs
temp_df.sort_values(by=["probs"], ascending=False, inplace=True)

In [16]:
post_recomended_df = temp_df[["post_id","text","topic"]].head(5)

In [17]:
post_recomended_df

Unnamed: 0,post_id,text,topic
370,317,Borussia Dortmund near bust\n\nGerman football...,business
178,153,Tsunami to hit Sri Lanka banks\n\nSri Lankas b...,business
204,179,Ad sales boost Time Warner profit\n\nQuarterly...,business
5992,6243,Clint Eastwood reprises his role as Dirty Harr...,movie
6077,6333,Hello Dave Burning Paradise is a film for anyo...,movie


#### Convert post df to json

In [18]:
def data_to_json(data: pd.DataFrame) -> list:
    return [{"id": sample_of_data.post_id,
             "text": sample_of_data.text,
             "topic": sample_of_data.topic} for sample_of_data in data.itertuples()]

In [19]:
data_to_json(post_recomended_df)

[{'id': 317,
  'text': 'Borussia Dortmund near bust\n\nGerman football club and former European champion Borussia Dortmund has warned it will go bankrupt if rescue talks with creditors fail.\n\nThe companys shares tumbled after it said it has entered a life-threatening profitability and financial situation. Borussia Dortmund has posted record losses and missed rent payments on its Westfallen stadium. Chief executive Gerd Niebaum stepped down last week and creditors are now pushing for greater control. Shares in Borussia Dortmund, Germanys only stock-market listed football club, dropped by almost 23% to 2.05 euros during early afternoon trading.\n\nFund manager Florian Hamm - Borussia Dortmunds largest investor - said he would only invest more money in the company if he got a greater say in how it is run. I demand better transparency, he is quoted as saying by Germanys Manger Magazin. The club has also faced calls to appoint executives from outside the club.\n\nBorussia Dortmund posted 

# Class to standart pipline infr

In [6]:
class CustomImageDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        data = self.df.loc[idx].iloc[3:]
        return torch.Tensor(data).unsqueeze(0)

class ModelInference:
    def __init__(self,
                 model: object = None,
                 df_users: pd.DataFrame = None,
                 df_posts: pd.DataFrame = None,
                 user_index: int = None,
                 limit: int = None):
        self.columns = ['post_id', "text", "topic", 'topic_covid','topic_entertainment','topic_movie', 'topic_politics', 'topic_sport',
 'topic_tech', 'kmean_label', 'DistanceToCluster_0', 'DistanceToCluster_1', 'DistanceToCluster_2', 'DistanceToCluster_3', 'DistanceToCluster_4',
 'DistanceToCluster_5', 'DistanceToCluster_6', 'DistanceToCluster_7', 'DistanceToCluster_8', 'DistanceToCluster_9', 'DistanceToCluster_10',
 'DistanceToCluster_11', 'DistanceToCluster_12', 'DistanceToCluster_13', 'DistanceToCluster_14', 'gender', 'age', 'Belarus', 'Cyprus',
 'Estonia', 'Finland', 'Kazakhstan', 'Latvia', 'Russia', 'Switzerland', 'Turkey', 'Ukraine',
 'exp_1', 'exp_2', 'exp_3', 'exp_4', 'Android', 'iOS', 'ads', 'organic']
        self.df_users = df_users.copy()
        self.df_posts = df_posts.copy()
        self.user_index = user_index
        self.limit = limit

    def create_cartesian_product(self) -> pd.DataFrame:
        """Create matrix product between user info and all posts via cross join by user id."""
        user_info_row = self.df_users[self.df_users.user_id == self.user_index]
        return pd.merge(user_info_row, self.df_posts, how='cross')[self.columns]

    def create_torch_dataset(self, data) -> object:
        return DataLoader(CustomImageDataset(df=data), batch_size=64, pin_memory=True, num_workers=4, shuffle=False)

    @torch.inference_mode()
    def model_inference(self, data) -> list[float]:
        model.eval()
        return list(model(data).view(data.shape[0]).detach().numpy())

    def get_probs(self, loader) -> list[float]:
        data = []
        for x in loader:
            data.extend(self.model_inference(x))
        return data

    def data_to_json(self, data: pd.DataFrame) -> list:
        return [{"id": sample_of_data.post_id,
                 "text": sample_of_data.text,
                 "topic": sample_of_data.topic} for sample_of_data in data.itertuples()]

    def predict(self) -> list:
        data = self.create_cartesian_product()
        loader = self.create_torch_dataset(data)
        probs = self.get_probs(loader)
        data["probs"] = probs
        data.sort_values(by=["probs"], ascending=False, inplace=True)
        return self.data_to_json(data.head(self.limit))

In [8]:
ModelInference(df_users=df_users, df_posts=df_posts, model=model, user_index=202, limit=5).predict()

  return torch.Tensor(data).unsqueeze(0)
  return torch.Tensor(data).unsqueeze(0)
  return torch.Tensor(data).unsqueeze(0)
  return torch.Tensor(data).unsqueeze(0)


[{'id': 1616,
  'text': 'QPR keeper Day heads for Preston\n\nQueens Park Rangers keeper Chris Day is set to join Preston on a months loan.\n\nDay has been displaced by the arrival of Simon Royce, who is in his second month on loan from Charlton. QPR have also signed Italian Generoso Rossi. Rs manager Ian Holloway said: Some might say its a risk as he cant be recalled during that month and Simon Royce can now be recalled by Charlton. But I have other irons in the fire. I have had a yes from a couple of others should I need them.\n\nDays Rangers contract expires in the summer. Meanwhile, Holloway is hoping to complete the signing of Middlesbrough defender Andy Davies - either permanently or again on loan - before Saturdays match at Ipswich. Davies impressed during a recent loan spell at Loftus Road. Holloway is also chasing Bristol City midfielder Tom Doherty.\n',
  'topic': 'sport'},
 {'id': 1313,
  'text': 'MPs issued with Blackberry threat\n\nMPs will be thrown out of the Commons if t