<a href="https://colab.research.google.com/github/Minayaterry/lab02-mt/blob/main/LAB04_Limpieza_Transformacion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LABORATORIO: INTEGRACIÓN, LIMPIEZA Y TRANSFORMACIÓN DE DATOS
**Nombre:** TERRY MINAYA

**Nombre:** Ronald Tuncar

In [None]:
#  Importación de librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from scipy.spatial.distance import mahalanobis

In [None]:
# Location of the breast-cancer-wisconsin data file hosted by the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Column labels are supplied here because every line of the file is treated as data
columnas = [
    'ID',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Read the remote file into a DataFrame; no row is consumed as a header
df = pd.read_csv(url, header=None, names=columnas)

# Preview the first rows
df.head()

Unnamed: 0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [None]:
# Turn the '?' placeholder into NaN. Plain assignment is used instead of
# inplace=True so the cell does not rely on in-place mutation.
df = df.replace('?', np.nan)

# 'Bare Nuclei' is converted to a numeric dtype now that the placeholder is gone
df['Bare Nuclei'] = pd.to_numeric(df['Bare Nuclei'])

# Impute missing values with the column median. Assigning the result back
# (rather than calling fillna(..., inplace=True) on df['Bare Nuclei']) avoids the
# chained-assignment FutureWarning and keeps working under pandas 3.0.
df['Bare Nuclei'] = df['Bare Nuclei'].fillna(df['Bare Nuclei'].median())

# Recode Class: 2 -> 0 (benign), 4 -> 1 (malignant).
# replace() leaves already-recoded 0/1 values untouched, so re-running this cell
# is safe; map() would turn them into NaN on a second run.
df['Class'] = df['Class'].replace({2: 0, 4: 1})

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Bare Nuclei'].fillna(df['Bare Nuclei'].median(), inplace=True)


Unnamed: 0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0
3,1016277,6,8,8,1,3,4.0,3,7,1,0
4,1017023,4,1,1,3,2,1.0,3,1,1,0


# **b. Realice una detección de valores atípicos univariados por medio del método del rango intercuartílico con 3 de longitud a la derecha y 3 a la izquierda, y elimínelos. Además, realice una detección de valores atípicos multivariados por medio de las distancias de Mahalanobis y elimine aquellos valores que superen el valor de 30.**

In [None]:
# Remove univariate outliers with the interquartile-range (IQR) rule
def eliminar_outliers_iqr(df, k=3, columns=None):
    """Return a copy of ``df`` without rows that are IQR outliers in any checked column.

    For every checked column the accepted interval is
    ``[Q1 - k * IQR, Q3 + k * IQR]`` (both ends inclusive). All quartiles are
    computed on the frame as it was passed in, so the result does not depend on
    the order in which the columns are processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    k : float, default 3
        Multiplier of the IQR that sets the width of the fences on each side.
    columns : iterable of column labels, optional
        Columns to check. Defaults to every numeric column of ``df``; pass an
        explicit list to leave identifier or label columns out of the check.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the rows that fall inside the fences for all
        checked columns. Rows with NaN in a checked column are dropped because
        NaN never satisfies the interval test.
    """
    if columns is None:
        columns = df.select_dtypes(include=np.number).columns

    # Positional boolean mask, so a non-unique index cannot cause misalignment
    keep = np.ones(len(df), dtype=bool)
    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        keep &= df[col].between(q1 - k * iqr, q3 + k * iqr).to_numpy()

    # Copy so later column assignments on the result do not touch ``df``
    return df[keep].copy()

# Apply the univariate IQR filter with the function's default k=3 and show the resulting (rows, columns)
df_iqr = eliminar_outliers_iqr(df)
df_iqr.shape

(443, 11)

In [None]:
# Remove multivariate outliers using Mahalanobis distances
def mahalanobis_outliers(df, threshold=30, exclude=('ID', 'Class')):
    """Return a copy of ``df`` without rows whose Mahalanobis distance exceeds ``threshold``.

    The distance of each row to the column means is computed over the numeric
    columns of ``df`` (minus ``exclude``), using the pseudo-inverse of their
    sample covariance matrix. The input frame is left untouched; no helper
    column is written to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    threshold : float, default 30
        Largest distance that is still kept. Rows are removed only when their
        distance is strictly greater than this value.
    exclude : iterable of column labels, default ('ID', 'Class')
        Numeric columns left out of the distance computation. Labels that are
        not present in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the retained rows, with the same columns as ``df``.
        If fewer than two rows or no feature columns are available the
        covariance is undefined and ``df`` is returned unchanged (as a copy).
    """
    features = (
        df.select_dtypes(include=np.number)
        .drop(columns=list(exclude), errors='ignore')
    )

    # A covariance matrix needs at least two observations and one variable
    if features.shape[0] < 2 or features.shape[1] == 0:
        return df.copy()

    values = features.to_numpy(dtype=float)
    # atleast_2d keeps the single-feature case (scalar covariance) invertible
    cov_matrix = np.atleast_2d(np.cov(values, rowvar=False))
    # Pseudo-inverse tolerates a singular covariance matrix
    inv_cov_matrix = np.linalg.pinv(cov_matrix)

    centered = values - features.mean().to_numpy()
    # Row-wise quadratic form d_i^T * VI * d_i, computed for all rows at once
    squared = np.einsum('ij,jk,ik->i', centered, inv_cov_matrix, centered)
    # Clip tiny negative round-off before the square root
    distances = np.sqrt(np.clip(squared, 0, None))

    # NOTE(review): the threshold is compared against the (unsquared) distance,
    # as in the original code; confirm whether a squared-distance cutoff was intended.
    return df[distances <= threshold].copy()

# Run the Mahalanobis filter (default threshold of 30) on the IQR-filtered frame and show its shape
df_maha = mahalanobis_outliers(df_iqr)
df_maha.shape

(443, 11)