In [8]:
import pandas as pd

# Load the example dataset, reading only the three columns used in this notebook.
data = pd.read_csv(
    'housing.csv',
    usecols=['total_rooms', 'total_bedrooms', 'median_income'],
)

# Build the frame that the outlier examples below operate on.
df = pd.DataFrame(data)

def outliers(data, method='z-score', threshold=3):
    """
    Remove the rows of a DataFrame that contain outliers.

    A row is kept only when every column passes the selected test. Comparisons
    against NaN evaluate to False, so a row holding a missing value in any
    column is removed as well.

    Parameters:
        - data: pandas DataFrame with numeric columns.
        - method: Detection method, either 'z-score' or 'boxplot'.
            * 'z-score': keep rows whose absolute z-score (computed with the
              column mean and sample standard deviation) is strictly below
              ``threshold`` in every column.
            * 'boxplot': keep rows lying inside
              [Q1 - threshold * IQR, Q3 + threshold * IQR] in every column.
        - threshold: Cut-off used by the selected method. Defaults to 3 for
          both methods (the conventional boxplot fence would be 1.5; pass it
          explicitly if that is what you want).

    Returns:
        - DataFrame: the rows of ``data`` that pass the test, with their
          original index labels.

    Raises:
        - ValueError: if ``method`` is neither 'z-score' nor 'boxplot'.
    """
    if method == 'z-score':
        spread = data.std()
        # A constant column has std == 0 and a column with fewer than two
        # values has std == NaN. Dividing by either yields NaN z-scores, and
        # NaN < threshold is False, which would discard every row. Such
        # columns carry no outlier information, so scale them by 1 instead:
        # their deviations from the mean are 0 and they never reject a row.
        spread = spread.where(spread > 0, 1.0)
        z_scores = ((data - data.mean()) / spread).abs()
        return data[(z_scores < threshold).all(axis=1)]
    elif method == 'boxplot':
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        # Fences are inclusive: values exactly on a bound are kept.
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr
        return data[((data >= lower_bound) & (data <= upper_bound)).all(axis=1)]
    else:
        raise ValueError("Método no válido. Use 'z-score' o 'boxplot'.")

# Example usage: no threshold is passed, so both calls use the function's default of 3.
# Remove outliers using the z-score method.
df_cleaned_zscore = outliers(df, method='z-score')
print("Datos sin outliers (z-score):\n", df_cleaned_zscore)

# Remove outliers using the boxplot (IQR) method.
df_cleaned_boxplot = outliers(df, method='boxplot')
print("\nDatos sin outliers (boxplot):\n", df_cleaned_boxplot)


Datos sin outliers (z-score):
        total_rooms  total_bedrooms  median_income
0            880.0           129.0         8.3252
1           7099.0          1106.0         8.3014
2           1467.0           190.0         7.2574
3           1274.0           235.0         5.6431
4           1627.0           280.0         3.8462
...            ...             ...            ...
20635       1665.0           374.0         1.5603
20636        697.0           150.0         2.5568
20637       2254.0           485.0         1.7000
20638       1860.0           409.0         1.8672
20639       2785.0           616.0         2.3886

[19634 rows x 3 columns]

Datos sin outliers (boxplot):
        total_rooms  total_bedrooms  median_income
0            880.0           129.0         8.3252
1           7099.0          1106.0         8.3014
2           1467.0           190.0         7.2574
3           1274.0           235.0         5.6431
4           1627.0           280.0         3.8462
...        

In [9]:
# Display the frame loaded from housing.csv (the three selected columns).
data

Unnamed: 0,total_rooms,total_bedrooms,median_income
0,880.0,129.0,8.3252
1,7099.0,1106.0,8.3014
2,1467.0,190.0,7.2574
3,1274.0,235.0,5.6431
4,1627.0,280.0,3.8462
...,...,...,...
20635,1665.0,374.0,1.5603
20636,697.0,150.0,2.5568
20637,2254.0,485.0,1.7000
20638,1860.0,409.0,1.8672


In [16]:
# Column-wise maxima of the raw, unfiltered data.
# NOTE(review): `maximos` is reassigned in later cells, so its value depends on which cell ran last.
maximos = data[['total_rooms', 'total_bedrooms', 'median_income']].max()

In [None]:
# Display the maxima currently held in `maximos`.
maximos

In [None]:
# Column-wise maxima after z-score outlier removal.
# NOTE(review): overwrites the `maximos` computed from other frames in neighbouring cells.
maximos = df_cleaned_zscore[['total_rooms', 'total_bedrooms', 'median_income']].max()

In [None]:
# Display the maxima currently held in `maximos`.
maximos

In [14]:
# Column-wise maxima after boxplot (IQR) outlier removal.
# NOTE(review): overwrites the `maximos` computed from other frames in neighbouring cells.
maximos = df_cleaned_boxplot[['total_rooms', 'total_bedrooms', 'median_income']].max()

In [15]:
# Display the maxima currently held in `maximos`.
# NOTE(review): the stored output presumably reflects the boxplot-filtered frame (In [14] ran just before In [15]) — confirm on re-run.
maximos

total_rooms       8244.0000
total_bedrooms    1699.0000
median_income       11.2463
dtype: float64