In [3]:
import pandas as pd

In [5]:
# cargamos el archivo
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
column_names = ["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors", 
                "body-style", "drive-wheels", "engine-location", "wheel-base", "length", "width", 
                "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", 
                "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg", 
                "highway-mpg", "price"]

df = pd.read_csv(url, names=column_names, na_values="?")
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


### REEMPLAZAMOS LOS DATOS FALTANTES

In [9]:
# reemplazamos los valores nulos en las columnas numericas con la media de cada columna
df.fillna(df.mean(numeric_only=True), inplace=True)

# Reemplazar valores nulos en columnas categóricas con el valor más frecuente
for column in df.select_dtypes(include='object').columns:
    df[column] = df[column].fillna(df[column].mode()[0])

# Verificar si quedan datos faltantes
df.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

### Detectamos los datos duplicados

In [10]:
# Detectar datos duplicados
duplicated_rows = df[df.duplicated()]

# Mostrar las filas duplicadas (si existen)
duplicated_rows

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price


In [11]:
# Contar la cantidad de filas duplicadas
num_duplicated = df.duplicated().sum()
print(f"Número de filas duplicadas: {num_duplicated}")

Número de filas duplicadas: 0


### Removemos los datos duplicados

In [12]:
# Eliminar las filas duplicadas
df.drop_duplicates(inplace=True)

### Eliminamos las filas con valores nulos en una columna especifica

In [13]:
# Eliminar filas con valores nulos en la columna 'price'
df.dropna(subset=['price'], inplace=True)

In [14]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
