<a href="https://colab.research.google.com/github/KARENCMP82/Python/blob/main/Scikit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import importlib
import os  # kept on purpose: later cells rely on `os` being imported
import subprocess
import sys

# Make sure feature_engine is importable in this kernel.
# If it is missing, install the pinned version and import it again so that
# StringSimilarityEncoder is defined when this cell finishes.
try:
    from feature_engine.encoding import StringSimilarityEncoder
except ModuleNotFoundError:
    # Run pip through the interpreter that backs this kernel so the package
    # lands in the environment that is actually executing the notebook.
    # check_call raises CalledProcessError when the install fails, instead of
    # silently continuing with the package still missing.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "feature_engine==1.7.0"]
    )
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()
    from feature_engine.encoding import StringSimilarityEncoder


In [12]:
import time  # Librería estándar: para medir tiempos de ejecución

import warnings  # Librería estándar: para gestionar avisos (warnings)
warnings.simplefilter(action = 'ignore')

import os  # Librería estándar: para interactuar con el sistema operativo

import numpy as np  # Librería de terceros: para computación numérica
import pandas as pd  # Librería de terceros: para manipulación y análisis de datos

import sklearn  # Librería de terceros: para machine learning
from sklearn import set_config

from sklearn.tree import DecisionTreeClassifier  # Módulo de sklearn: Árboles de decisión

# transformers (Módulos de sklearn para preprocesamiento)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder

# pipelines (Módulos de sklearn para construir flujos de trabajo)
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score  # Módulo de sklearn: Métricas de evaluación

# import xgboost # Librería de terceros: Algoritmos de Gradient Boosting (comentada, no usada aún)

In [13]:
# Report the library versions this notebook was run with.
# One print call; sep="\n" puts each piece on its own line.
print(
    "Working with these versions of libraries\n",
    "-"*50,
    f"sklearn version {sklearn.__version__}",
    sep = "\n"
)

Working with these versions of libraries

--------------------------------------------------
sklearn version 1.6.1


In [14]:
# Configure scikit-learn so that transformers (StandardScaler, OneHotEncoder, etc.)
# return pandas DataFrames instead of NumPy arrays. This makes the data easier
# to handle and inspect in the preprocessing stages of the pipelines.
set_config(transform_output = "pandas")

In [15]:

# Connect to Google Drive so the CSV files stored there can be read.
# google.colab only exists inside Colab; when the notebook runs locally the
# import fails, so the mount is skipped instead of stopping the notebook.
try:
    from google.colab import drive
except ModuleNotFoundError:
    drive = None  # not running on Colab: nothing to mount

if drive is not None:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# CWD is the directory the notebook is being executed from.
# For students working locally it is detected automatically.
CWD = os.getcwd()
DATA_PATH = os.path.join(CWD, "data")

# Create the data directory (and any missing parents) if it does not exist yet.
os.makedirs(DATA_PATH, exist_ok=True)

# Show both paths and the current content of the data directory, one per line.
for shown in (CWD, DATA_PATH, os.listdir(DATA_PATH)):
    print(shown)

/content
/content/data
[]


In [17]:
# For people working in Colab: change DATA_PATH so it points to the Drive
# folder that holds the CSV files.
# NOTE: this replaces the DATA_PATH derived from the working directory in the
# previous cell, and it is an absolute path specific to one Drive layout.

DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Modulo4"

In [18]:
# List the files found in DATA_PATH to confirm the CSV files are reachable.
print(os.listdir(DATA_PATH))

['pd_sklearn_prod.csv', 'pd_sklearn_data.csv']


In [19]:
'''
El DataLoader es una clase auxiliar que va a cargar nuestros datasets
Le suministramos la ruta del train/X y del test/dataset producción (prod) y nos devuelve los dos datasets
También le tenemos que pasar la columna del target y del index
Para que el objeto DataLoader distinga estas dos columnas
'''

class DataLoader(object):
    '''
    DataLoader helps you import your train and test data and do some basic preprocessing on them.

    The same preprocessing is applied to both files: column names are
    upper-cased, the index column is set, the target is split off when it is
    present and only the requested feature columns are kept.
    '''
    def __init__(self, train_path, test_path, train_columns, target, index):
        '''
        Constructor for the class. Nothing is read until load_data is called.

        train_path, test_path: locations handed as-is to pd.read_csv.
        train_columns: list with the names of the feature columns to keep.
        target: name of the target column.
        index: name of the column used as index.

        Column names are upper-cased before any lookup, so train_columns,
        target and index must be given in upper case.
        '''
        self.train_path = train_path
        self.test_path = test_path

        self.train_columns = train_columns
        self.target = target
        self.index = index

    def _process_df(self, df):
        '''
        Converts the columns to upper, sets index and splits between X and y.

        The DataFrame received is not modified; new objects are returned.

        Returns a tuple (X, y). y is the target column, or None when the
        target column is not present in df (e.g. production data).
        Raises KeyError when the index column or one of train_columns is missing.
        '''
        # rename and set_index return new DataFrames, so the caller's df keeps
        # its original column names and index.
        processed = df.rename(columns = str.upper).set_index(self.index)

        if self.target in processed.columns:
            y = processed[self.target]
            # The target is dropped before selecting the features, so the
            # feature frame can never contain it.
            X = processed.drop(columns = self.target)[self.train_columns]

            return X, y

        return processed[self.train_columns], None

    def load_data(self):
        '''
        Reads both CSV files and applies _process_df to each of them.

        Returns (X_train, y_train, X_test). Any target found in the test file
        is discarded.
        '''
        X_train, y_train = self._process_df(pd.read_csv(self.train_path))

        X_test, _ = self._process_df(pd.read_csv(self.test_path))

        return X_train, y_train, X_test

'''
DataFrameReporter nos va a permitir hacer nuestro primer contacto con el dataset.

Para las variables numéricas hará un describe y contará los nulos que hay.
Y para las variables categóricas hará un count de los nulos y nos dirá también
el número de categorías únicas que hay.

Al final se trata de una clase auxiliar que hará una parte del EDA.
'''

class DataFrameReporter(object):
    '''
    Helper class that reports nulls and datatypes of train and test data
    '''
    def __init__(self, X_train, X_test, target_column):
        '''
        Constructor for the class.

        X_train, X_test: DataFrames to be analysed.
        target_column: name of the target column; it is removed from a
        DataFrame before analysing it, when present.
        '''
        self.X_train = X_train
        self.X_test = X_test
        self.target_column = target_column

    def analyze_X(self, X):
        '''
        Analyses the DataFrame you pass and returns a report with one row per column.

        Report columns:
        - Dtypes, Shape (number of rows), Absolute_nulls and Relative_nulls
          (percentage of nulls, one decimal).
        - the output of X.describe() transposed.
        - Unique_category: number of distinct values, only for object columns.

        Cells that do not apply to a column are filled with "".
        Rows are sorted by the Dtypes column.
        '''
        if self.target_column in X.columns:
            X = X.drop(self.target_column, axis = 1)

        n_rows = X.shape[0]
        null_counts = X.isnull().sum()

        nulls_in_train = pd.DataFrame(
            {
                "Dtypes": X.dtypes,
                "Shape": n_rows,
                "Absolute_nulls": null_counts,
                # Scale to percent BEFORE rounding: rounding the ratio first and
                # multiplying afterwards leaves float noise (e.g. 7.000000000000001).
                "Relative_nulls": (null_counts * 100 / n_rows).round(1),
            }
        )

        describe_values_num = X.describe().T
        report_df = pd.concat([nulls_in_train, describe_values_num], axis = 1)

        object_columns = X.select_dtypes("object").columns
        unique_cat_df = pd.DataFrame(
            data = [X[col].nunique() for col in object_columns],
            index = object_columns,
            columns = ["Unique_category"]
        )

        report_df = pd.concat([report_df, unique_cat_df], axis = 1)

        # Non in-place calls: each step yields a new frame instead of
        # mutating the intermediate one.
        report_df = report_df.fillna("")
        report_df = report_df.sort_values("Dtypes", ascending = True)

        return report_df

    def get_reports(self):
        '''
        Calls analyze_X method and returns report DataFrame for train and test.

        The rows coming from X_train are tagged with Origin "X" and the rows
        coming from X_test with Origin "Prod"; both reports are stacked.
        '''
        report_train = self.analyze_X(X = self.X_train)
        report_test = self.analyze_X(X = self.X_test)

        report_train["Origin"] = "X"
        report_test["Origin"] = "Prod"

        return pd.concat([report_train, report_test])

In [20]:
# Global settings for the notebook.
# Only a handful of the DataFrame columns are used here, as an example.
# It is good practice to keep this kind of variable in a json file or at the
# top of the notebook/script.
TRAIN_PATH = os.path.join(DATA_PATH, "pd_sklearn_data.csv")
PRODUCTION_PATH = os.path.join(DATA_PATH, "pd_sklearn_prod.csv")

TARGET = "SURVIVED"
INDEX = "PASSENGERID"
TRAIN_COLUMNS = ["PCLASS", "AGE", "SIBSP", "EMBARKED", "SEX", "FARE"]

# Load both datasets with the helper class: build the loader first, then read.
data_loader = DataLoader(
    train_path = TRAIN_PATH,
    test_path = PRODUCTION_PATH,
    train_columns = TRAIN_COLUMNS,
    target = TARGET,
    index = INDEX
)
X, y, X_prod = data_loader.load_data()

In [21]:
# Split the feature names into numeric columns and everything else.
numeric_columns = list(X.select_dtypes(include = np.number).columns)
object_columns = list(X.select_dtypes(exclude = np.number).columns)

# Every column must have landed in exactly one of the two lists.
n_classified = len(numeric_columns) + len(object_columns)
assert n_classified == X.shape[1], "You have missed some columns"

In [22]:
# Show which columns ended up in each group, as comma-separated names.
print("Working with numeric columns: ", ", ".join(numeric_columns))
print("Working with categorical columns: ", ", ".join(object_columns))

Working with numeric columns:  PCLASS, AGE, SIBSP, FARE
Working with categorical columns:  EMBARKED, SEX


In [23]:
# Quick analysis of both datasets.
# The target is put back next to the train features; the reporter receives
# its name (TARGET) so it can tell it apart from the features.
train_with_target = pd.concat([X, y], axis = 1)
reporter = DataFrameReporter(train_with_target, X_prod, TARGET)
report = reporter.get_reports()

In [24]:
# Bare last expression: the notebook renders the report DataFrame as a table.
report

Unnamed: 0,Dtypes,Shape,Absolute_nulls,Relative_nulls,count,mean,std,min,25%,50%,75%,max,Unique_category,Origin
PCLASS,int64,891,0,0.0,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0,,X
SIBSP,int64,891,0,0.0,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0,,X
AGE,float64,891,177,19.9,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0,,X
FARE,float64,891,0,0.0,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292,,X
EMBARKED,object,891,2,0.2,,,,,,,,,3.0,X
SEX,object,891,0,0.0,,,,,,,,,2.0,X
PCLASS,int64,418,0,0.0,418.0,2.26555,0.841838,1.0,1.0,3.0,3.0,3.0,,Prod
SIBSP,int64,418,0,0.0,418.0,0.447368,0.89676,0.0,0.0,0.0,1.0,8.0,,Prod
AGE,float64,418,86,20.6,332.0,30.27259,14.181209,0.17,21.0,27.0,39.0,76.0,,Prod
FARE,float64,418,1,0.2,417.0,35.627188,55.907576,0.0,7.8958,14.4542,31.5,512.3292,,Prod


# Scikit-Learn style


In [25]:
X.head(1)  # first row of the train features

Unnamed: 0_level_0,PCLASS,AGE,SIBSP,EMBARKED,SEX,FARE
PASSENGERID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,22.0,1,S,male,7.25


In [26]:
# Imputer that fills missing values with the most frequent value of each column.
imputer=SimpleImputer(strategy="most_frequent")

In [27]:
# Bare last expression: shows the estimator and its parameters (fit has not been called yet).
imputer

In [28]:
# Fit on the whole feature frame so the imputer learns one fill value per column.
imputer.fit(X)

In [29]:
# Rebinds `imputer` to a new, unfitted imputer that uses the median strategy;
# the most_frequent imputer fitted in the earlier cell is no longer reachable.
# NOTE(review): per the scikit-learn docs the median strategy only supports
# numeric data, while X also holds object columns (EMBARKED, SEX) — presumably
# this imputer is meant for the numeric columns only; confirm before fitting on X.
imputer = SimpleImputer(strategy = "median")

In [30]:
# Most frequent EMBARKED value, computed directly with pandas.
# NOTE(review): presumably a cross-check of what a most_frequent imputer would fill in — confirm.
X["EMBARKED"].mode()

Unnamed: 0,EMBARKED
0,S
