In [4]:
import os

In [5]:
%pwd

'/home/swaraj/PROJECTS/Mews-Reccomendation-System/Experimentation'

In [6]:
# Run the rest of the notebook from the project root so that relative paths
# such as config/config.yaml resolve. The guard keeps this cell idempotent:
# a bare os.chdir('../') climbs one directory higher every time it is re-run.
if not os.path.exists(os.path.join('config', 'config.yaml')):
    os.chdir('../')

In [7]:
%pwd

'/home/swaraj/PROJECTS/Mews-Reccomendation-System'

In [8]:
# ENTITY
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data transformation step."""
    # Directory the step writes its outputs into (index JSON files, train/valid TSVs).
    root_dir: Path
    # Tab-separated user-behaviour file read by the step.
    data_path_usr: Path
    # Tab-separated news catalogue file read by the step.
    data_path_news: Path


In [9]:
from News_Reccomendation_System.constants import *
from News_Reccomendation_System.utils.common import read_yaml, create_directories

[2023-10-25 10:57:09,599: INFO: common: yaml file: config/config.yaml loaded succesfully]


ConfigBox({'artifact_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'local_data_file': 'news.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'data_path_usr': 'artifacts/data_ingestion/news.tsv'}})

In [11]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-step config entities.

    The config, params and schema files are read once at construction time and
    kept on the instance; the artifact root directory is created as a side
    effect so later steps can write into it.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,                     # These were all defined in constants
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Read the three YAML files and make sure the artifact root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifact_root])


    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config entity for the data transformation step.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main config file.
        """
        config = self.config.data_transformation

        create_directories([config.root_dir])

        # The entity declares its fields as Path, but the YAML yields plain
        # strings and dataclasses do not coerce types; convert explicitly so
        # the declared types hold at runtime.
        # NOTE(review): all three keys must be present in the
        # data_transformation section of the config file - confirm that
        # data_path_news is defined there.
        return DataTransformationConfig(
            root_dir=Path(config.root_dir),
            data_path_usr=Path(config.data_path_usr),
            data_path_news=Path(config.data_path_news),
        )

In [16]:
import os
from News_Reccomendation_System import logger
from News_Reccomendation_System.utils.common import save_json
import pandas as pd
import numpy as np

class DataTransformtion:
    """Turns the raw behaviours and news TSV files into indexed train/validation data.

    User and item ids are mapped to integer indices starting at 1. Index 0 is
    reserved as the "unknown" (UNK) index and is used wherever an id cannot be
    resolved.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Store the step configuration.

        Args:
            config: Supplies ``root_dir`` (output directory) plus the
                ``data_path_usr`` and ``data_path_news`` input paths.
        """
        # The annotation is quoted so it is not evaluated when the class is
        # defined; the class can be created before the entity is in scope.
        self.config = config


    def user_data(self):
        """Read the headerless, tab-separated behaviours file into a DataFrame."""
        df = pd.read_csv(self.config.data_path_usr,
                         sep="\t",
                         names=["impressionId",
                                "userId",
                                "timestamp",
                                "click_history",
                                "impressions"]
                                )
        return df

    def news_data(self):
        """Read the headerless, tab-separated news file into a DataFrame."""
        df = pd.read_csv(self.config.data_path_news,
                         sep='\t',
                         names=["itemId",
                                "category",
                                "subcategory",
                                "title","abstract",
                                "url",
                                "title_entities",
                                "abstract_entities"]
                                )
        return df

    def indexize_users(self, raw_behaviour: pd.DataFrame):
        """Add a ``userIdx`` column and persist the user <-> index lookups.

        Writes ``ind2user.json`` and ``user2ind.json`` into ``config.root_dir``.

        Args:
            raw_behaviour: Frame with a ``userId`` column; modified in place.

        Returns:
            The same frame with the extra ``userIdx`` column.
        """
        unique_userIds = raw_behaviour['userId'].unique()
        # Allocate a unique index for each user, but let the zeroth index be a UNK index:
        ind2user = {idx: userid for idx, userid in enumerate(unique_userIds, start=1)}
        user2ind = {userid: idx for idx, userid in ind2user.items()}
        print(f"We have {len(user2ind)} unique users in the dataset")

        save_json(path= os.path.join(self.config.root_dir, 'ind2user.json'), data= ind2user)
        save_json(path= os.path.join(self.config.root_dir, 'user2ind.json'), data= user2ind)
        # Create a new column with userIdx:
        raw_behaviour['userIdx'] = raw_behaviour['userId'].map(lambda x: user2ind.get(x, 0))

        return raw_behaviour

    def get_iten2ind_hash(self, news: pd.DataFrame):
        """Build the item-id -> index lookup from the news frame and persist it.

        Indices follow row order and start at 1 (0 is the UNK index).

        Args:
            news: Frame with an ``itemId`` column.

        Returns:
            Dict mapping each item id to its integer index.
        """
        ind2item = {idx: itemid for idx, itemid in enumerate(news['itemId'].values, start=1)}
        item2ind = {itemid: idx for idx, itemid in ind2item.items()}

        # NOTE(review): 'ind2uitem.json' looks like a typo for 'ind2item.json';
        # the name is kept because other code may already read this file.
        save_json(path= os.path.join(self.config.root_dir, 'ind2uitem.json'), data= ind2item)
        save_json(path= os.path.join(self.config.root_dir, 'item2ind.json'), data= item2ind)

        return item2ind

    def indexise_click_history(self, item2ind: dict, raw_behaviour: pd.DataFrame):
        """Add ``click_history_idx``: the click history as a list of item indices.

        Args:
            item2ind: Item-id -> index lookup; unknown ids map to 0.
            raw_behaviour: Frame with a space-separated ``click_history``
                column; modified in place.

        Returns:
            The same frame with the extra ``click_history_idx`` column.
        """
        def process_click_history(s):
            # str() also covers a missing history: NaN becomes "nan", which is
            # not an item id and therefore resolves to the UNK index.
            return [item2ind.get(item_id, 0) for item_id in str(s).split(" ")]

        raw_behaviour['click_history_idx'] = raw_behaviour.click_history.map(process_click_history)

        return raw_behaviour


    def one_click_no_click(self, item2ind: dict, raw_behaviour: pd.DataFrame):
        """Split each impression into non-clicked items and one clicked item.

        Adds two columns: ``noclicks`` (list of indices of items labelled
        ``0``) and ``click`` (index of the item labelled ``1``; the last one
        wins if there are several). When an impression has no clicked item, is
        missing, or contains malformed tokens, the affected value falls back to
        the UNK index 0 / an empty list instead of raising.

        Args:
            item2ind: Item-id -> index lookup; unknown ids map to 0.
            raw_behaviour: Frame with an ``impressions`` column of
                space-separated ``<itemId>-<0|1>`` tokens; modified in place.

        Returns:
            The same frame with the extra ``noclicks`` and ``click`` columns.
        """
        def process_impression(s):
            noclicks = []
            click = None  # stays None when nothing in the impression was clicked
            if not isinstance(s, str):
                # Missing impression (e.g. NaN): nothing to parse.
                return noclicks, click
            for token in s.split(" "):
                # Split on the last '-' so the label is always the final field.
                item_id, sep, label = token.rpartition("-")
                if not sep:
                    continue  # empty or malformed token without a label
                if label == '0':
                    noclicks.append(item_id)
                elif label == '1':
                    click = item_id
            return noclicks, click

        # Keep the parsed pairs in a Series (rather than zip(*...)) so an
        # empty frame yields empty columns instead of an unpacking error.
        parsed = raw_behaviour['impressions'].map(process_impression)
        # We can then indexize these two new columns:
        raw_behaviour['noclicks'] = parsed.map(lambda pair: [item2ind.get(item_id, 0) for item_id in pair[0]])
        raw_behaviour['click'] = parsed.map(lambda pair: item2ind.get(pair[1], 0))

        return raw_behaviour


    def conver_datetime_to_hrs(self, raw_behaviour: pd.DataFrame):
        """Add ``epochhrs``: the timestamp as whole hours since the Unix epoch.

        Args:
            raw_behaviour: Frame with a ``timestamp`` column parseable by
                ``pd.to_datetime``; modified in place.

        Returns:
            The same frame with the extra (rounded) ``epochhrs`` column.
        """
        timestamps = pd.to_datetime(raw_behaviour['timestamp'])
        # Normalise to nanoseconds first: the parsed resolution is not
        # guaranteed to be ns in every pandas version, and the divisors below
        # assume it is.
        epoch_ns = timestamps.values.astype('datetime64[ns]').astype(np.int64)
        # ns -> ms -> s -> hours
        raw_behaviour['epochhrs'] = epoch_ns/(1e6)/1000/3600
        raw_behaviour['epochhrs'] = raw_behaviour['epochhrs'].round()

        return raw_behaviour


    def get_user_behaviour(self, raw_behaviour: pd.DataFrame):
        """Select the modelling columns, keeping one non-clicked item per row.

        Adds a ``noclick`` column holding the first entry of ``noclicks`` (or
        the UNK index 0 when the list is empty) to ``raw_behaviour``.

        Args:
            raw_behaviour: Frame carrying ``epochhrs``, ``userIdx``,
                ``click_history_idx``, ``noclicks`` and ``click``.

        Returns:
            Frame with columns ``epochhrs``, ``userIdx``,
            ``click_history_idx``, ``noclick`` and ``click``.
        """
        # An impression in which every shown item was clicked has no
        # non-clicked item; fall back to the UNK index rather than failing.
        raw_behaviour['noclick'] = raw_behaviour['noclicks'].map(lambda x: x[0] if x else 0)
        behaviour = raw_behaviour[['epochhrs','userIdx','click_history_idx','noclick','click']]

        return behaviour


    def train_test_spilt_behaviour(self, behaviour):
        """Split by time and write ``train.tsv`` / ``valid.tsv`` to ``root_dir``.

        Rows whose ``epochhrs`` is at or after the 0.9 quantile go to the
        validation set; because hours are rounded, ties at the threshold can
        make that set larger than 10% of the rows.

        Args:
            behaviour: Frame with an ``epochhrs`` column.

        Returns:
            Tuple ``(train, valid)`` of the two frames that were written.
        """
        # Let us use the last 10pct of the data as our validation data:
        test_time_th = behaviour['epochhrs'].quantile(0.9)
        train = behaviour[behaviour['epochhrs'] < test_time_th]
        valid = behaviour[behaviour['epochhrs'] >= test_time_th]

        train.to_csv(os.path.join(self.config.root_dir, 'train.tsv'), index = False, sep= '\t')
        valid.to_csv(os.path.join(self.config.root_dir, 'valid.tsv'), index = False, sep= '\t')

        return train, valid



In [13]:
# Label interpolated into the "started"/"completed" log lines of run_data_transformation().
STEP_NAME = '03 ---- Data Transformation Step'



class DataTransformationPipeline:
    """Chains every data-transformation stage, from the raw TSVs to the train/valid files."""

    def __init__(self) -> None:
        """Nothing to initialise; configuration is loaded inside ``main``."""

    def main(self):
        """Run the stages in order: load, index, parse impressions, timestamp, split."""
        manager = ConfigurationManager()
        transformer = DataTransformtion(config= manager.get_data_transformation_config())

        # Load both inputs, then build the item lookup from the news catalogue.
        behaviour_frame = transformer.user_data()
        news_frame = transformer.news_data()
        item_lookup = transformer.get_iten2ind_hash(news= news_frame)

        # Each stage adds columns to the behaviour frame and hands it back.
        behaviour_frame = transformer.indexize_users(raw_behaviour= behaviour_frame)
        behaviour_frame = transformer.indexise_click_history(raw_behaviour= behaviour_frame, item2ind= item_lookup)
        behaviour_frame = transformer.one_click_no_click(raw_behaviour= behaviour_frame, item2ind= item_lookup)
        behaviour_frame = transformer.conver_datetime_to_hrs(raw_behaviour= behaviour_frame)

        # Keep only the modelling columns and write the time-based split.
        model_frame = transformer.get_user_behaviour(raw_behaviour= behaviour_frame)
        transformer.train_test_spilt_behaviour(behaviour= model_frame)
            
            
            





def run_data_transformation():
    """Run the data transformation pipeline, logging its start and completion.

    Any exception is logged with its traceback and then re-raised to the caller.
    """
    try:
        logger.info(f' >>>>>>> Step {STEP_NAME} started <<<<<<<<<<<')
        DataTransformationPipeline().main()
        logger.info(f' >>>>>>> Step {STEP_NAME} completed <<<<<<<<<<<\n\nx====================x')
    except Exception as err:
        # Record the failure before propagating it.
        logger.exception(err)
        raise err


In [17]:
# Execute the whole data transformation step; start/completion and any failure are reported via the logger.
run_data_transformation()

[2023-10-25 10:58:54,858: INFO: 1062367176:  >>>>>>> Step 03 ---- Data Transformation Step started <<<<<<<<<<<]
[2023-10-25 10:58:54,876: INFO: common: yaml file: config/config.yaml loaded succesfully]
[2023-10-25 10:58:54,889: INFO: common: yaml file: params.yaml loaded succesfully]
[2023-10-25 10:58:54,898: INFO: common: yaml file: schema.yaml loaded succesfully]
[2023-10-25 10:58:54,904: INFO: common: Created directory at : artifacts]
[2023-10-25 10:58:54,934: INFO: common: Created directory at : artifacts/data_transformation]
We have 50000 unique users in the dataset
[2023-10-25 10:59:38,071: INFO: 1062367176:  >>>>>>> Step 03 ---- Data Transformation Step completed <<<<<<<<<<<

