In [37]:
import os

In [38]:
%pwd

'/home/swaraj/PROJECTS/Mews-Reccomendation-System/Experimentation'

In [39]:
os.chdir('../')
%pwd

'/home/swaraj/PROJECTS/Mews-Reccomendation-System'

In [56]:
#  Entity

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings consumed by the model-training step.

    Instances are frozen, so fields cannot be reassigned after construction.
    """
    root_dir: Path          # directory the trained model files are written into
    train_data_path: Path   # tab-separated file read as the training set
    test_data_path: Path    # tab-separated file read as the validation set
    news: Path              # tab-separated news file used to build the content model
    model_name: str         # file name (inside root_dir) of the behaviour model
    model_content: str      # file name (inside root_dir) of the content model
    ind2user: Path          # JSON file whose length sizes the user embedding table
    ind2item: Path          # JSON file whose length sizes the item embedding table
    batch_size: int         # samples per batch; DataLoader only accepts an integer here


In [57]:
#  Configuration Manager
from News_Reccomendation_System.constants import *
from News_Reccomendation_System.utils.common import read_yaml, create_directories
class ConfigurationManager:
    """Reads the project's YAML files and builds per-step configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,                     # These were all defined in constants
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Load the config, params and schema YAML files and create the artifact root.

        Args:
            config_filepath: location of the main config YAML.
            params_filepath: location of the params YAML.
            schema_filepath: location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifact_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the model-trainer settings and make sure its output directory exists.

        Values come from the ``model_trainer`` section of the config file and the
        ``Behaviour_model`` section of the params file.

        Returns:
            A ``ModelTrainerConfig`` whose path fields are ``Path`` objects and
            whose ``batch_size`` is an ``int``.
        """
        config = self.config.model_trainer
        params = self.params.Behaviour_model

        create_directories([config.root_dir])

        # The YAML loader hands back plain scalars; convert them here so the
        # dataclass really holds the types it declares.
        return ModelTrainerConfig(
            root_dir=Path(config.root_dir),
            train_data_path=Path(config.train_data_path),
            test_data_path=Path(config.test_data_path),
            model_name=config.model_name,
            model_content=config.model_content,
            news=Path(config.news),
            ind2user=Path(config.ind2user),
            ind2item=Path(config.ind2item),
            # DataLoader rejects non-integer batch sizes (e.g. 64.0 from YAML).
            batch_size=int(params.batch_size),
        )

In [65]:
#  Components

import os
from typing import Any
from pathlib import Path
from News_Reccomendation_System import logger
from News_Reccomendation_System.utils.common import load_json
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
import joblib

import ktrain


class MindDataset(Dataset):
    """Torch dataset backed by three columns of a pandas dataframe.

    The ``userIdx``, ``click`` and ``noclick`` columns are each converted to a
    tensor once, up front; indexing then returns one entry from every tensor.
    """

    def __init__(self, df):
        # Materialise each required column as a tensor, keyed by column name.
        wanted = ('userIdx', 'click', 'noclick')
        self.data = {name: torch.tensor(df[name].values) for name in wanted}

    def __len__(self):
        # Every column has the same length, so any one of them gives the size.
        return self.data['userIdx'].size(0)

    def __getitem__(self, idx):
        # Same keys as self.data, each holding the value(s) at position idx.
        sample = {}
        for name, column in self.data.items():
            sample[name] = column[idx]
        return sample
    

# Build a matrix factorization model
class NewsMF(pl.LightningModule):
    """Matrix-factorisation model scoring a (user, item) pair by a dot product.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        dim: size of each embedding vector.
        lr: learning rate handed to Adam in ``configure_optimizers``.
    """

    def __init__(self, num_users, num_items, dim = 10, lr = 1e-3):
        super().__init__()
        self.dim = dim
        self.lr = lr
        self.useremb = nn.Embedding(num_embeddings=num_users, embedding_dim=dim)
        self.itememb = nn.Embedding(num_embeddings=num_items, embedding_dim=dim)

    def forward(self, user, item):
        """Return the dot product of the user and item embeddings.

        The result keeps a trailing axis of size 1 so that scores for
        different items can be concatenated along it.
        """
        uservec = self.useremb(user)
        itemvec = self.itememb(item)
        return (uservec * itemvec).sum(-1).unsqueeze(-1)

    def _click_vs_noclick_loss(self, batch):
        """Cross-entropy between the clicked and the non-clicked item scores.

        ``batch`` must provide the ``userIdx``, ``click`` and ``noclick`` entries.
        """
        batch_size = batch['userIdx'].size(0)

        score_click = self.forward(batch['userIdx'], batch['click'])
        score_noclick = self.forward(batch['userIdx'], batch['noclick'])

        # Column 0 holds the clicked item, column 1 the non-clicked one, so the
        # correct "class" for every row is index 0.
        scores_all = torch.cat((score_click, score_noclick), dim=1)
        target = torch.zeros(batch_size, dtype=torch.long, device=scores_all.device)
        return F.cross_entropy(input=scores_all, target=target)

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch."""
        return self._click_vs_noclick_loss(batch)

    def validation_step(self, batch, batch_idx):
        """Compute the same loss as in training and log it as ``val_loss``."""
        # Uses the shared helper rather than calling the training_step hook, so
        # anything later added to training_step cannot leak into validation.
        loss = self._click_vs_noclick_loss(batch)
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)


class ModelTrainer:
    """Trains and saves the behaviour model and the content (topic) model."""

    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def get_data(self):
        """Read the tab-separated train and validation files.

        Returns:
            ``(train, valid)`` as pandas dataframes.
        """
        train = pd.read_csv(self.config.train_data_path, sep='\t')
        valid = pd.read_csv(self.config.test_data_path, sep='\t')
        return train, valid

    def get_hashes(self):
        """Load the two index JSON files.

        Returns:
            ``(ind2item, ind2user)`` -- note that the item mapping comes first.
        """
        ind2user = load_json(Path(self.config.ind2user))
        ind2item = load_json(Path(self.config.ind2item))
        return ind2item, ind2user

    def build_datasets(self, train, valid):
        """Wrap both dataframes in ``MindDataset`` and return their loaders.

        Only the training loader shuffles.

        Returns:
            ``(train_loader, valid_loader)``.
        """
        # DataLoader raises on a non-integer batch size, and a value read from
        # YAML may arrive as a float such as 64.0.
        bs = int(self.config.batch_size)
        train_loader = DataLoader(MindDataset(train), batch_size=bs, shuffle=True)
        valid_loader = DataLoader(MindDataset(valid), batch_size=bs, shuffle=False)
        return train_loader, valid_loader

    def model_training(self, train_loader, valid_loader, ind2item, ind2user, max_epochs=10):
        """Fit ``NewsMF`` and dump it to ``root_dir / model_name`` with joblib.

        Args:
            train_loader: loader used for training.
            valid_loader: loader used for validation.
            ind2item: item mapping; its length sizes the item embedding table.
            ind2user: user mapping; its length sizes the user embedding table.
            max_epochs: number of epochs passed to the Lightning trainer.
        """
        # Each table gets one row more than the mapping has entries.
        # NOTE(review): presumably index 0 is reserved for unknown ids -- confirm
        # against the data-transformation step.
        mf_model = NewsMF(num_users=len(ind2user) + 1, num_items=len(ind2item) + 1)

        trainer = pl.Trainer(max_epochs=max_epochs)
        trainer.fit(model=mf_model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

        joblib.dump(mf_model, os.path.join(self.config.root_dir, self.config.model_name))

    def model_training_2(self):
        """Build the ktrain topic-model recommender and dump it to ``root_dir / model_content``."""
        columns = [
            "itemId",
            "category",
            "subcategory",
            "title",
            "abstract",
            "url",
            "title_entities",
            "abstract_entities",
        ]
        news = pd.read_csv(self.config.news, sep='\t', names=columns)

        # Chained calls instead of inplace mutation; same rows are kept.
        news = news.dropna().drop_duplicates()

        # Join title and abstract with a space. Plain concatenation fuses the
        # last word of the title with the first word of the abstract, which
        # produces bogus tokens for the topic model.
        corpus = (news['title'] + ' ' + news['abstract']).tolist()
        del news  # release the dataframe before the topic model is fitted

        tm = ktrain.text.get_topic_model(texts=corpus, n_features=100000)
        tm.build(corpus, threshold=0.25)
        tm.train_recommender()

        joblib.dump(tm, os.path.join(self.config.root_dir, self.config.model_content))


In [66]:
STEP_NAME = '04 ---- Model Training Step'



class ModelTrainingPipeline:
    """Runs the model-training step end to end."""

    def __init__(self) -> None:
        """Nothing to set up; everything is created inside ``main``."""

    def main(self):
        """Build the config, load the inputs, then train and save both models."""
        trainer_config = ConfigurationManager().get_model_trainer_config()
        trainer = ModelTrainer(config=trainer_config)

        # Inputs for the behaviour model.
        train_df, valid_df = trainer.get_data()
        item_index, user_index = trainer.get_hashes()
        loaders = trainer.build_datasets(train=train_df, valid=valid_df)
        train_batches, valid_batches = loaders

        # Behaviour model first, then the content model.
        trainer.model_training(
            train_loader=train_batches,
            valid_loader=valid_batches,
            ind2item=item_index,
            ind2user=user_index,
        )
        trainer.model_training_2()
        
        


def run_model_trainer():
    """Run the model-training pipeline, logging its start and completion.

    Raises:
        Exception: whatever the pipeline raised, after it has been logged.
    """
    try:
        logger.info(f' >>>>>>> Step {STEP_NAME} started <<<<<<<<<<<')
        obj = ModelTrainingPipeline()
        obj.main()
        logger.info(f' >>>>>>> Step {STEP_NAME} completed <<<<<<<<<<<\n\nx====================x')

    except Exception as e:
        logger.exception(e)
        # Bare raise re-raises the active exception without adding this frame
        # to its traceback a second time.
        raise

In [67]:
run_model_trainer()

[2023-10-25 12:27:37,301: INFO: 1146969740:  >>>>>>> Step 04 ---- Model Training Step started <<<<<<<<<<<]
[2023-10-25 12:27:37,314: INFO: common: yaml file: config/config.yaml loaded succesfully]
[2023-10-25 12:27:37,326: INFO: common: yaml file: params.yaml loaded succesfully]
[2023-10-25 12:27:37,330: INFO: common: yaml file: schema.yaml loaded succesfully]
[2023-10-25 12:27:37,333: INFO: common: Created directory at : artifacts]
[2023-10-25 12:27:37,336: INFO: common: Created directory at : artifacts/model_trainer]


[2023-10-25 12:27:38,301: INFO: common: JSON file loaded succesfully from: artifacts/data_transformation/ind2user.json]
[2023-10-25 12:27:39,401: INFO: common: JSON file loaded succesfully from: artifacts/data_transformation/ind2uitem.json]
[2023-10-25 12:27:40,289: INFO: setup: GPU available: False, used: False]
[2023-10-25 12:27:40,291: INFO: setup: TPU available: False, using: 0 TPU cores]
[2023-10-25 12:27:40,293: INFO: setup: IPU available: False, using: 0 IPUs]
[2023-10-25 12:27:40,299: INFO: setup: HPU available: False, using: 0 HPUs]
[2023-10-25 12:27:40,479: INFO: model_summary: 
  | Name    | Type      | Params
--------------------------------------
0 | useremb | Embedding | 500 K 
1 | itememb | Embedding | 512 K 
--------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.051     Total estimated model params size (MB)]


Sanity Checking: |                                                                                            …

/home/swaraj/PROJECTS/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/home/swaraj/PROJECTS/venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

[2023-10-25 12:29:07,666: INFO: fit_loop: `Trainer.fit` stopped: `max_epochs=10` reached.]
n_topics automatically set to 155
lang: en
preprocessing texts...
fitting model...
iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5
done.
done.
[2023-10-25 12:35:46,677: INFO: 1146969740:  >>>>>>> Step 04 ---- Model Training Step completed <<<<<<<<<<<

