In [8]:
import pandas as pd
from datetime import datetime

In [9]:
# Small orders table used by every example below.
# Rows 7, 8 and 9 are exact copies of rows 1, 2 and 0, and a few cells are
# deliberately missing (None) so the frame also contains NaN / NaT values.
data = dict(
    customer_id=[1, 2, 3, 4, 5, 1, None, 2, 3, 1],
    product=['A', 'B', 'C', 'A', None, 'A', 'C', 'B', 'C', 'A'],
    quantity=[2, 1, 5, 3, 2, None, 1, 1, 5, 2],
    price=[10.5, 20.0, 7.5, 10.5, 20.0, 10.5, None, 20.0, 7.5, 10.5],
    # Purchase dates are kept as ISO-formatted strings; row 5 has none.
    purchase_date=[
        '2025-04-01', '2025-04-02', '2025-04-02', '2025-04-03', '2025-04-04',
        None, '2025-04-05', '2025-04-02', '2025-04-02', '2025-04-01',
    ],
    # Ship dates are datetime objects, all in April 2025; row 6 has none.
    ship_date=[
        datetime(2025, 4, day) if day is not None else None
        for day in (2, 3, 3, 4, 7, 6, None, 3, 3, 2)
    ],
    region=[
        'North', 'South', 'East', 'North', 'South',
        'North', 'East', 'South', 'East', 'North',
    ],
)

df = pd.DataFrame(data)

# 1. Drop duplicate rows

In [10]:
# Print the full sample frame as plain text; rows 7, 8 and 9 repeat rows 1, 2 and 0 exactly.
print(df)

   customer_id product  quantity  price purchase_date  ship_date region
0          1.0       A       2.0   10.5    2025-04-01 2025-04-02  North
1          2.0       B       1.0   20.0    2025-04-02 2025-04-03  South
2          3.0       C       5.0    7.5    2025-04-02 2025-04-03   East
3          4.0       A       3.0   10.5    2025-04-03 2025-04-04  North
4          5.0    None       2.0   20.0    2025-04-04 2025-04-07  South
5          1.0       A       NaN   10.5          None 2025-04-06  North
6          NaN       C       1.0    NaN    2025-04-05        NaT   East
7          2.0       B       1.0   20.0    2025-04-02 2025-04-03  South
8          3.0       C       5.0    7.5    2025-04-02 2025-04-03   East
9          1.0       A       2.0   10.5    2025-04-01 2025-04-02  North


In [11]:
# Rows are compared across all columns; the first occurrence of each repeated
# row is retained and the later copies (rows 7, 8 and 9 here) are removed.
df.drop_duplicates(keep='first')

Unnamed: 0,customer_id,product,quantity,price,purchase_date,ship_date,region
0,1.0,A,2.0,10.5,2025-04-01,2025-04-02,North
1,2.0,B,1.0,20.0,2025-04-02,2025-04-03,South
2,3.0,C,5.0,7.5,2025-04-02,2025-04-03,East
3,4.0,A,3.0,10.5,2025-04-03,2025-04-04,North
4,5.0,,2.0,20.0,2025-04-04,2025-04-07,South
5,1.0,A,,10.5,,2025-04-06,North
6,,C,1.0,,2025-04-05,NaT,East


# 2. Drop duplicates based on specific columns

In [12]:
# Only the listed key columns decide whether a row is a repeat; differences in
# the remaining columns are ignored, so row 5 is dropped as a repeat of row 0.
df.drop_duplicates(
    subset=['customer_id', 'product'],
    keep='first',
)

Unnamed: 0,customer_id,product,quantity,price,purchase_date,ship_date,region
0,1.0,A,2.0,10.5,2025-04-01,2025-04-02,North
1,2.0,B,1.0,20.0,2025-04-02,2025-04-03,South
2,3.0,C,5.0,7.5,2025-04-02,2025-04-03,East
3,4.0,A,3.0,10.5,2025-04-03,2025-04-04,North
4,5.0,,2.0,20.0,2025-04-04,2025-04-07,South
6,,C,1.0,,2025-04-05,NaT,East


# 3. Keep the last occurrence of duplicates

In [13]:
# Full-row comparison again, but the final copy of each repeated row survives:
# rows 0, 1 and 2 are removed while rows 7, 8 and 9 stay.
df.drop_duplicates(subset=None, keep='last')

Unnamed: 0,customer_id,product,quantity,price,purchase_date,ship_date,region
3,4.0,A,3.0,10.5,2025-04-03,2025-04-04,North
4,5.0,,2.0,20.0,2025-04-04,2025-04-07,South
5,1.0,A,,10.5,,2025-04-06,North
6,,C,1.0,,2025-04-05,NaT,East
7,2.0,B,1.0,20.0,2025-04-02,2025-04-03,South
8,3.0,C,5.0,7.5,2025-04-02,2025-04-03,East
9,1.0,A,2.0,10.5,2025-04-01,2025-04-02,North


# 4. Keep no duplicates at all

In [14]:
df[df.duplicated() == False]

Unnamed: 0,customer_id,product,quantity,price,purchase_date,ship_date,region
0,1.0,A,2.0,10.5,2025-04-01,2025-04-02,North
1,2.0,B,1.0,20.0,2025-04-02,2025-04-03,South
2,3.0,C,5.0,7.5,2025-04-02,2025-04-03,East
3,4.0,A,3.0,10.5,2025-04-03,2025-04-04,North
4,5.0,,2.0,20.0,2025-04-04,2025-04-07,South
5,1.0,A,,10.5,,2025-04-06,North
6,,C,1.0,,2025-04-05,NaT,East


# 5. Get only the duplicate rows

In [15]:
# Select the rows flagged as repeats of an earlier row; the first occurrence
# of each group is not flagged, so only rows 7, 8 and 9 are returned.
df.loc[df.duplicated(keep='first')]

Unnamed: 0,customer_id,product,quantity,price,purchase_date,ship_date,region
7,2.0,B,1.0,20.0,2025-04-02,2025-04-03,South
8,3.0,C,5.0,7.5,2025-04-02,2025-04-03,East
9,1.0,A,2.0,10.5,2025-04-01,2025-04-02,North


# 6. Mark duplicates with a boolean

In [16]:
# Boolean Series aligned with df's index: True marks a row that repeats an
# earlier row in every column, False marks first occurrences and unique rows.
df.duplicated(subset=None, keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8     True
9     True
dtype: bool