In [None]:
import os
# Move the working directory one level up. Presumably the notebook lives in a
# sub-folder of the project and this makes the `src.*` imports and the
# relative paths used below resolve from the project root -- TODO confirm.
# NOTE: the path is relative, so every re-execution of this cell moves up one
# more level; run it exactly once per kernel session.
os.chdir("../")

In [None]:
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv(); the result is
# bound to `_` because it is not used afterwards.
_ = load_dotenv(find_dotenv()) # read local .env file

# Location of the data configuration file, taken from the environment.
# Indexing os.environ raises KeyError when DATA_CONFIG_FILE_PATH is not set.
DATA_CONFIG_FILE_PATH = os.environ['DATA_CONFIG_FILE_PATH']

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the data-ingestion step.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Base directory for the ingestion step.
    root_dir: Path
    # Dataset identifier passed to ``load_dataset``.
    hf_dataset_name: str
    # Split name passed to ``load_dataset`` (``split=`` argument).
    hf_dataset_split: str
    # Location of the zip archive the dataset is saved to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

In [None]:
from src.utils.common import read_yaml

class ConfigurationManager:
    """Read the data configuration file and build config objects from it."""

    def __init__(self,
                 config_filepath = DATA_CONFIG_FILE_PATH):
        """Load the configuration file.

        Args:
            config_filepath: Path to the configuration file. Defaults to the
                module-level ``DATA_CONFIG_FILE_PATH``.
        """
        # read_yaml is a project helper; the result is used with attribute
        # access below (``self.config.data_ingestion``).
        self.config = read_yaml(Path(config_filepath))

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        Returns:
            A ``DataIngestionConfig`` whose path fields are ``pathlib.Path``
            objects, as its field annotations declare.
        """
        section = self.config.data_ingestion

        # The config values are passed through Path() so the dataclass actually
        # holds the types it declares instead of whatever the loader produced.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            hf_dataset_name=section.hf_dataset_name,
            hf_dataset_split=section.hf_dataset_split,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

In [None]:
import zipfile
import pandas as pd
from datasets import load_dataset
from src.logging import logger

class DataIngestion:
    """Download a dataset split, archive it as a zipped CSV, and extract it.

    Dataset identifiers and all file locations come from the supplied
    ``DataIngestionConfig``.
    """

    # Name of the single CSV member stored inside the zip archive.
    _CSV_MEMBER_NAME = 'raw_dataset.csv'

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config: Settings naming the dataset, its split, the archive path
                and the extraction directory.
        """
        self.config = config

    def download_file(self):
        """Fetch the configured dataset split and save it as a zip archive.

        If ``config.local_data_file`` already exists nothing is downloaded and
        only its size is logged. Otherwise the split is loaded with
        ``load_dataset``, converted to a DataFrame, serialised to CSV and
        stored as ``raw_dataset.csv`` inside the archive.

        Returns:
            None
        """
        archive_path = Path(self.config.local_data_file)
        if archive_path.exists():
            logger.info(f"File already exists. File size: {archive_path.stat().st_size}")
            return

        # ZipFile does not create missing directories, so make sure the
        # archive's parent exists before writing.
        archive_path.parent.mkdir(parents=True, exist_ok=True)

        raw_dataset = load_dataset(self.config.hf_dataset_name, split=self.config.hf_dataset_split)
        df = pd.DataFrame(raw_dataset)

        # Write to a sibling ".part" file and rename it into place only after
        # the archive is complete. A failure midway therefore never leaves a
        # truncated archive that the existence check above would accept as a
        # finished download on the next run.
        partial_path = archive_path.with_name(archive_path.name + '.part')
        try:
            with zipfile.ZipFile(partial_path, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
                # Serialise straight into the archive instead of staging a CSV
                # in the current working directory, which could overwrite an
                # unrelated file and was left behind when an error occurred.
                archive.writestr(self._CSV_MEMBER_NAME, df.to_csv(index=False))
            os.replace(partial_path, archive_path)
        finally:
            # Only present here if the write or the rename failed.
            if partial_path.exists():
                partial_path.unlink()

        logger.info(f"Dataset {self.config.hf_dataset_name} downloaded and archived at {archive_path}!")

    def extract_zip_file(self):
        """Extract the archive at ``config.local_data_file`` into ``config.unzip_dir``.

        The destination directory is created when it does not exist yet.

        Returns:
            None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        logger.info(f"Data extracted at {unzip_path}")

In [None]:
# Run the ingestion step end to end: read the configuration, download and
# archive the dataset if the archive is missing, then extract it.
# No try/except wrapper: catching an exception only to re-raise the same
# object handles nothing, so errors are simply left to propagate.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()