## Imports

In [1]:
from dotenv import load_dotenv
from pathlib import Path
import subprocess
import zipfile
import sys
import os
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Redireccionamos el path para importar desde src
current_dir = os.path.dirname(os.path.abspath('ETL.ipynb'))
src_dir = os.path.join(current_dir, '..')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from src.misc import ParameterControl, safe_execution

pc = ParameterControl()

# Extract

## Config

In [3]:
with safe_execution():
    # Cargamos el archivo .env
    load_dotenv(dotenv_path=Path(pc.get_path('env_file')))

    # Nos aseguramos de que el archivo "kaggle.json" este bien configurado
    if not os.path.isfile("~/.kaggle/kaggle.json"):
        os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
        with open(os.path.expanduser("~/.kaggle/kaggle.json"), 'w') as file:
            file.write('{"username":' + os.getenv('KAGGLE_USERNAME') + ',"key":' + os.getenv('KAGGLE_KEY') + '}')

    # Creamos una carpeta temporal .temp
    os.makedirs(os.path.expanduser(pc.get_path("temp_folder")), exist_ok=True)

## Data Download

In [40]:
with safe_execution():
    if sum(len(files) for _, _, files in os.walk(pc.get_path("titanic_dataset_raw"))) == 0:
        # Descargamos el dataset desde kaggle a la carpeta temporal.
        command = ['kaggle', 'competitions', 'download', '-c', 'titanic', '-p', pc.get_path("temp_folder")]
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()

        print(stdout.decode())
        if stderr:
            print("Errors:", stderr.decode())

        # Descomprimimos el dataset en bronze
        with zipfile.ZipFile(Path(pc.get_path("titanic_dataset_zip")), 'r') as zip_ref:
            zip_ref.extractall(pc.get_path("titanic_dataset_raw"))

        # Eliminamos el archivo .zip en temp
        os.remove(pc.get_path("titanic_dataset_zip"))

## Data Load

In [41]:
with safe_execution():
    df_train = pd.read_csv(pc.get_path('titanic_dataset_train'))
    df_test = pd.read_csv(pc.get_path('titanic_dataset_test'))

# Transform

## Encode

In [43]:
with safe_execution():
    le = LabelEncoder()

    def label_encode(df, column):
        mask = df[column].notnull()
        df.loc[mask, column] = le.fit_transform(df.loc[mask, column])
        df[column] = df[column].astype(float)

    label_encode(df_train, 'Cabin')
    label_encode(df_train, 'Name')
    label_encode(df_train, 'Sex')
    label_encode(df_train, 'Ticket')
    label_encode(df_train, 'Embarked')

    label_encode(df_test, 'Cabin')
    label_encode(df_test, 'Name')
    label_encode(df_test, 'Sex')
    label_encode(df_test, 'Ticket')
    label_encode(df_test, 'Embarked')

  df.loc[mask, column] = le.fit_transform(df.loc[mask, column])
  df.loc[mask, column] = le.fit_transform(df.loc[mask, column])
  df.loc[mask, column] = le.fit_transform(df.loc[mask, column])
  df.loc[mask, column] = le.fit_transform(df.loc[mask, column])
  df.loc[mask, column] = le.fit_transform(df.loc[mask, column])
  df.loc[mask, column] = le.fit_transform(df.loc[mask, column])
  df.loc[mask, column] = le.fit_transform(df.loc[mask, column])


## Impute

In [44]:
with safe_execution():
    def KNN_impute(df: pd.DataFrame, n_neighbors: int = 5) -> pd.DataFrame:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        imputer = KNNImputer(n_neighbors=n_neighbors)
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    _dtypes_train = df_train.dtypes.to_dict()

    KNN_impute(df_train)

    _dtypes_test = df_test.dtypes.to_dict()

    KNN_impute(df_test)

## Outliers

In [45]:
with safe_execution():
    def impute_outliers(df: pd.DataFrame, columns: list[str] | str, lower_threshold: float = 1.5, upper_threshold: float = 1.5) -> pd.DataFrame:
        """
        Impute outliers in the specified column(s) of the DataFrame using the IQR method.
        
        :param df: pd.DataFrame - The input DataFrame.
        :param columns: list[str] | str - List of column names or a single column name to impute outliers.
        :param lower_threshold: float - Lower threshold for the IQR method (default is 1.5).
        :param upper_threshold: float - Upper threshold for the IQR method (default is 1.5).
        :return: pd.DataFrame - DataFrame with imputed outliers.
        """
        if isinstance(columns, str):
            columns = [columns]
        
        for column in columns:
            if column in df.columns:
                Q1 = df[column].quantile(0.25)
                Q3 = df[column].quantile(0.75)
                IQR = Q3 - Q1
                
                lower_bound = Q1 - lower_threshold * IQR
                upper_bound = Q3 + upper_threshold * IQR
                
                median = df[column].median()
                
                df[column] = df[column].apply(
                    lambda x: median if x < lower_bound or x > upper_bound else x
                )
            else:
                raise ValueError(f"Column {column} not found in the DataFrame.")

    impute_outliers(df_train, 'Fare')
    impute_outliers(df_test, 'Fare')

## Types

In [46]:
with safe_execution():
    def set_dtypes(df, _dtypes):
        for column, _dtype in _dtypes.items():
            df[column] = df[column].astype(_dtype)

    _dtypes_train['Age'] = int
    _dtypes_train['Name'] = int
    _dtypes_train['Sex'] = int
    _dtypes_train['Ticket'] = int
    _dtypes_train['Cabin'] = int
    _dtypes_train['Embarked'] = int

    set_dtypes(df_train, _dtypes_train)

    _dtypes_test['Age'] = int
    _dtypes_test['Name'] = int
    _dtypes_test['Sex'] = int
    _dtypes_test['Ticket'] = int
    _dtypes_test['Cabin'] = int
    _dtypes_test['Embarked'] = int

    set_dtypes(df_test, _dtypes_test)

## Format

In [47]:
with safe_execution():
    df_train.Fare = df_train.Fare.round(2)
    df_test.Fare = df_test.Fare.round(2)

# Load

In [48]:
with safe_execution():
    os.makedirs(os.path.expanduser("\\".join(pc.get_path("titanic_frame_train").split('\\')[:-1])), exist_ok=True)

    df_train.to_parquet(pc.get_path("titanic_frame_train"), engine='pyarrow')
    df_test.to_parquet(pc.get_path("titanic_frame_test"), engine='pyarrow')