<a href="https://colab.research.google.com/github/Mondin0/data-eng/blob/main/CeL_Data_Eng_Procesamiento_Duplicados.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Manejo de duplicados
*En Pandas*

Es normal encontrarnos con conjuntos de datos que contienen registros repetidos debido a errores en la recolección de datos, errores en la entrada de datos, etc. Por ello, es fundamental anticiparse a este tipo de situaciones y saber cómo manejarlas.
Concretamente, vamos a ver cómo identificar y eliminar duplicados con la herramienta Pandas.

In [None]:
import pandas as pd

In [None]:
# Sample beer catalogue: five distinct records, each listed twice so that every
# row has an exact duplicate. Sorting by Brand and Type places the duplicated
# rows next to each other, which makes them easy to spot.
df = pd.DataFrame(
    {
        "Brand": ["Heineken", "Corona", "Budweiser", "Stella Artois", "Guinness"] * 2,
        "Type": ["Pilsner", "Pale Lager", "Pale Lager", "Pilsner", "Stout"] * 2,
        "ABV": [5.0, 4.5, 5.0, 5.2, 4.2] * 2,
        "Rating": [4.1, 3.9, 4.2, 4.3, 4.5] * 2,
    }
).sort_values(by=["Brand", "Type"])

print(df)

# Report the frame's dimensions (rows, columns).
print(f"\nFilas: {df.shape[0]}. Columnas: {df.shape[1]}")

           Brand        Type  ABV  Rating
2      Budweiser  Pale Lager  5.0     4.2
7      Budweiser  Pale Lager  5.0     4.2
1         Corona  Pale Lager  4.5     3.9
6         Corona  Pale Lager  4.5     3.9
4       Guinness       Stout  4.2     4.5
9       Guinness       Stout  4.2     4.5
0       Heineken     Pilsner  5.0     4.1
5       Heineken     Pilsner  5.0     4.1
3  Stella Artois     Pilsner  5.2     4.3
8  Stella Artois     Pilsner  5.2     4.3

Filas: 10. Columnas: 4


In [None]:
# Remove exact duplicates: rows are compared across all columns and only the
# first occurrence of each repeated row is kept.
df_deduplicated = df.drop_duplicates(keep="first").copy()
df_deduplicated

Unnamed: 0,Brand,Type,ABV,Rating
2,Budweiser,Pale Lager,5.0,4.2
1,Corona,Pale Lager,4.5,3.9
4,Guinness,Stout,4.2,4.5
0,Heineken,Pilsner,5.0,4.1
3,Stella Artois,Pilsner,5.2,4.3


In [None]:
# Report how many rows the deduplicated frame has.
row_count_message = f"La cantidad de filas es {df_deduplicated.shape[0]}"
print(row_count_message)

La cantidad de filas es 5


In [None]:
# One way to identify duplicates is through a unique identifier.

# First we create an ID column on the original DataFrame.
# The ID is derived from the (Brand, Type) key instead of a hard-coded positional
# list, so all rows of the same product share one ID regardless of row order.
# ngroup() numbers the groups 0..n-1 in sorted key order; adding 1 makes the
# IDs start at 1.
df["ID"] = df.groupby(["Brand", "Type"]).ngroup() + 1
df

Unnamed: 0,Brand,Type,ABV,Rating,ID
2,Budweiser,Pale Lager,5.0,4.2,1
7,Budweiser,Pale Lager,5.0,4.2,1
1,Corona,Pale Lager,4.5,3.9,2
6,Corona,Pale Lager,4.5,3.9,2
4,Guinness,Stout,4.2,4.5,3
9,Guinness,Stout,4.2,4.5,3
0,Heineken,Pilsner,5.0,4.1,4
5,Heineken,Pilsner,5.0,4.1,4
3,Stella Artois,Pilsner,5.2,4.3,5
8,Stella Artois,Pilsner,5.2,4.3,5


In [None]:
# Rows sharing the same ID count as duplicates; keep the first row of each ID.
df_deduplicated = df.drop_duplicates(subset=["ID"], keep="first")
df_deduplicated

Unnamed: 0,Brand,Type,ABV,Rating,ID
2,Budweiser,Pale Lager,5.0,4.2,1
1,Corona,Pale Lager,4.5,3.9,2
4,Guinness,Stout,4.2,4.5,3
0,Heineken,Pilsner,5.0,4.1,4
3,Stella Artois,Pilsner,5.2,4.3,5


In [None]:
# Rows sharing the same Brand and Type count as duplicates; keep the first
# row of each Brand/Type combination.
df_deduplicated = df.drop_duplicates(
    subset=["Brand", "Type"],
    keep="first",
)
df_deduplicated

Unnamed: 0,Brand,Type,ABV,Rating,ID
2,Budweiser,Pale Lager,5.0,4.2,1
1,Corona,Pale Lager,4.5,3.9,2
4,Guinness,Stout,4.2,4.5,3
0,Heineken,Pilsner,5.0,4.1,4
3,Stella Artois,Pilsner,5.2,4.3,5


In [None]:
# keep=False discards every row whose Brand/Type combination appears more than
# once, so only combinations that occur exactly once remain.
df_deduplicated = df.drop_duplicates(
    subset=["Brand", "Type"],
    keep=False,
)
df_deduplicated

Unnamed: 0,Brand,Type,ABV,Rating,ID


In [None]:
# Price history: the same five beers observed on three dates (days 1-5 of
# January, February and March 2021), so each ID appears three times with a
# different price. The frame is ordered by ID descending and, within each ID,
# by date ascending (oldest price first).
df = pd.DataFrame(
    {
        "Brand": ["Heineken", "Corona", "Budweiser", "Stella Artois", "Guinness"] * 3,
        "Type": ["Pilsner", "Pale Lager", "Pale Lager", "Pilsner", "Stout"] * 3,
        "ID": [1, 2, 3, 4, 5] * 3,
        "Price": [
            4.1, 3.9, 4.2, 4.3, 4.5,
            4.2, 4.1, 4.3, 4.4, 4.6,
            4.3, 4.2, 4.4, 4.5, 4.7,
        ],
        "Date_price": [
            f"2021-{month:02d}-{day:02d}"
            for month in (1, 2, 3)
            for day in (1, 2, 3, 4, 5)
        ],
    }
).sort_values(by=["ID", "Date_price"], ascending=[False, True])
df

Unnamed: 0,Brand,Type,ID,Price,Date_price
4,Guinness,Stout,5,4.5,2021-01-05
9,Guinness,Stout,5,4.6,2021-02-05
14,Guinness,Stout,5,4.7,2021-03-05
3,Stella Artois,Pilsner,4,4.3,2021-01-04
8,Stella Artois,Pilsner,4,4.4,2021-02-04
13,Stella Artois,Pilsner,4,4.5,2021-03-04
2,Budweiser,Pale Lager,3,4.2,2021-01-03
7,Budweiser,Pale Lager,3,4.3,2021-02-03
12,Budweiser,Pale Lager,3,4.4,2021-03-03
1,Corona,Pale Lager,2,3.9,2021-01-02


In [None]:
# Keep the last row of each ID. This relies on df's current row order: with
# Date_price ascending within each ID, the last row is the most recent price.
df_deduplicated = df.drop_duplicates(
    subset=["ID"],
    keep="last",
)
df_deduplicated

Unnamed: 0,Brand,Type,ID,Price,Date_price
14,Guinness,Stout,5,4.7,2021-03-05
13,Stella Artois,Pilsner,4,4.5,2021-03-04
12,Budweiser,Pale Lager,3,4.4,2021-03-03
11,Corona,Pale Lager,2,4.2,2021-03-02
10,Heineken,Pilsner,1,4.3,2021-03-01


In [None]:
# Same result without depending on df's existing order: sort explicitly so the
# most recent Date_price comes first within each ID, then keep the first row
# of each ID.
df_newest_first = df.sort_values(by=["ID", "Date_price"], ascending=False)
df_newest_first.drop_duplicates(subset=["ID"], keep="first")

Unnamed: 0,Brand,Type,ID,Price,Date_price
14,Guinness,Stout,5,4.7,2021-03-05
13,Stella Artois,Pilsner,4,4.5,2021-03-04
12,Budweiser,Pale Lager,3,4.4,2021-03-03
11,Corona,Pale Lager,2,4.2,2021-03-02
10,Heineken,Pilsner,1,4.3,2021-03-01
