## Apuntes team challenge 

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [4]:
df = pd.read_csv("./data/titanic.csv")
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [2]:


def get_features_num_regression(df: pd.DataFrame, target_col: str, umbral_corr: float, pvalue: float = None):
    # Comprobaciones de entrada
    
    # Verificar que 'df' es un DataFrame de pandas
    if not isinstance(df, pd.DataFrame):
        print("El argumento 'df' debe ser un DataFrame de pandas.")
        return None

    # Verificar que 'target_col' sea una columna válida en el DataFrame
    if target_col not in df.columns:
        print(f"La columna {target_col} no existe en el DataFrame.")
        return None
    
    # Verificar que 'target_col' sea numérica
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        print(f"La columna {target_col} no es numérica.")
        return None
    
    # Verificar que 'umbral_corr' esté entre 0 y 1
    if not (0 <= umbral_corr <= 1):
        print("El valor de 'umbral_corr' debe estar entre 0 y 1.")
        return None
    
    # Verificar que si 'pvalue' es proporcionado, sea un valor entre 0 y 1
    if pvalue is not None and (not isinstance(pvalue, (int, float)) or not (0 <= pvalue <= 1)):
        print("El valor de 'pvalue' debe estar entre 0 y 1 o ser None.")
        return None
    
    # Filtrar solo las columnas numéricas del dataframe
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    
    # Calcular la correlación entre las columnas numéricas y la columna objetivo
    corr_with_target = df[numeric_cols].corr()[target_col].abs()
    
    # Filtrar columnas por correlación mayor que el umbral
    filtered_cols = corr_with_target[corr_with_target > umbral_corr].index.tolist()
    
    # Si se proporcionó un p-value, realizar el test de hipótesis
    if pvalue is not None:
        significant_cols = []
        for col in filtered_cols:
            # Realizar el test de hipótesis (test t de Pearson)
            corr_value, p_value_corr = stats.pearsonr(df[target_col], df[col])
            
            # Filtrar por el p-value
            if p_value_corr <= pvalue:
                significant_cols.append(col)
        return significant_cols
    
    return filtered_cols


In [8]:
# Llamar a la función
result = get_features_num_regression(df, target_col='survived', umbral_corr=0.3, pvalue=0.05)
print(result)

['survived', 'pclass']
