In [5]:
import pandas as pd

# Handling Duplicates

Duplicate values most often arise from the necessary act of joining and consolidating data from various sources.

In [6]:
# Read the sample sales table directly from the CSV hosted on GitHub.
sales = pd.read_csv(
    "https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/sales_sample.csv"
)
sales  # the final three rows (6, 7 and 8) are identical to one another

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.728,0.679,8.106
1,2,A,1,2010-03-05,21827.9,False,8.056,0.693,8.106
2,3,B,1,2010-04-02,57258.43,True,16.817,0.718,7.808
3,4,B,2,2010-05-07,17413.94,False,22.528,0.749,7.808
4,3,B,2,2010-06-04,17558.09,False,27.05,0.715,7.808
5,6,C,2,2010-06-04,17558.09,True,27.05,0.715,7.808
6,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808
7,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808
8,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808


## How to find duplicate rows?

→ `.duplicated()` method

**Arguments:**

-	`'subset'`: list of columns to compare; rows that share the same values in all of these columns are treated as duplicates of each other
		
	-	default: list of all columns → returns True for the entirely duplicated rows

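-	`'keep'`: Which occurrence to keep, i.e. leave marked as `False` (not a duplicate): the **first** (`keep = "first"`, the default), the **last** (`keep = "last"`), or **none of them** (`keep = False`, every repeated row is marked `True`)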

In [7]:
# No `subset` is given, so a row counts as a duplicate only when every column matches.
# The three masks differ in which copy is left unflagged:
#   keep="first" -> the earliest copy, keep="last" -> the latest copy, keep=False -> none.
for keep_option in ("first", "last", False):
    print(sales.duplicated(keep=keep_option))

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8     True
dtype: bool
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8    False
dtype: bool
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
dtype: bool


In [8]:
# Flag rows whose (type, department) pair already appeared in an earlier row.
print(
    sales.duplicated(
        subset=["type", "department"],
        keep="first",
    )
)

0    False
1     True
2    False
3    False
4     True
5    False
6    False
7     True
8     True
dtype: bool


## Dropping Duplicates

→ `.drop_duplicates()` method

**Arguments:**

-	`'subset'`: list of columns used to decide whether a row is a duplicate
		
	-	default: list of all columns → drops the entirely duplicated rows

-	`'keep'`: Whether to keep the **first** (`keep = "first"`) or the **last** (`keep = "last"`) duplicate, or **none of them** (`keep = False`)

	-	default: `keep = "first"`

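-	`'inplace'`: Whether to modify the DataFrame itself (`inplace = True`) instead of returning a new one

	-	default: `inplace = False`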

In [9]:
# No `subset` given: a row is removed only if it matches an earlier row in every column.
unique_sales = sales.drop_duplicates(keep="first")  # "first" is the default, spelled out here
unique_sales

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.728,0.679,8.106
1,2,A,1,2010-03-05,21827.9,False,8.056,0.693,8.106
2,3,B,1,2010-04-02,57258.43,True,16.817,0.718,7.808
3,4,B,2,2010-05-07,17413.94,False,22.528,0.749,7.808
4,3,B,2,2010-06-04,17558.09,False,27.05,0.715,7.808
5,6,C,2,2010-06-04,17558.09,True,27.05,0.715,7.808
6,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808


In [10]:
# Keep one row per store id (the first occurrence of each).
# Caveat: a store can legitimately have several different sales records, so
# deduplicating on "store" alone may discard real data; usually more columns
# than just the store id should be taken into account.
unique_store = sales.drop_duplicates(
    subset=["store"],  # a bare string, subset="store", is accepted as well
)
unique_store

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.728,0.679,8.106
1,2,A,1,2010-03-05,21827.9,False,8.056,0.693,8.106
2,3,B,1,2010-04-02,57258.43,True,16.817,0.718,7.808
3,4,B,2,2010-05-07,17413.94,False,22.528,0.749,7.808
5,6,C,2,2010-06-04,17558.09,True,27.05,0.715,7.808
6,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808


In [11]:
# Alternative to deduplicating on "store" alone: base the comparison on other columns.
# Whenever two or more rows share the same type and department, only the first of them is kept.
unique_type_dep = sales.drop_duplicates(
    subset=["type", "department"],
    keep="first",
)
unique_type_dep

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.728,0.679,8.106
2,3,B,1,2010-04-02,57258.43,True,16.817,0.718,7.808
3,4,B,2,2010-05-07,17413.94,False,22.528,0.749,7.808
5,6,C,2,2010-06-04,17558.09,True,27.05,0.715,7.808
6,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808


### Combining duplicated rows into one row 

Instead of just keeping one record of the duplicates, we can combine them into one by computing their mean, their maximum, or other statistical measures. The right choice depends heavily on a common-sense understanding of our data and on the type of data we have.

We can do this easily using the `groupby` method, which, when chained with the `agg` method, lets you group by a set of common columns and return summary statistics for specific columns.

In [12]:
# Collapse each (type, department) group into one row instead of discarding rows:
# numeric measurements are averaged, store and date take their maximum, and
# is_holiday is summed (the number of holiday rows in the group).
# as_index=False keeps the group keys as regular columns, which has the same
# effect as calling .reset_index() on the result.
# Note: this rebinds `unique_type_dep`, replacing the frame built with drop_duplicates.
unique_type_dep = sales.groupby(["type", "department"], as_index=False).agg(
    {
        "weekly_sales": "mean",
        "temperature_c": "mean",
        "fuel_price_usd_per_l": "mean",
        "unemployment": "mean",
        "store": "max",
        "date": "max",
        "is_holiday": "sum",
    }
)
unique_type_dep

Unnamed: 0,type,department,weekly_sales,temperature_c,fuel_price_usd_per_l,unemployment,store,date,is_holiday
0,A,1,23376.2,6.892,0.686,8.106,2,2010-03-05,0
1,B,1,57258.43,16.817,0.718,7.808,3,2010-04-02,1
2,B,2,17486.015,24.789,0.732,7.808,4,2010-06-04,0
3,C,2,17558.09,27.05,0.715,7.808,6,2010-06-04,1
4,C,3,17558.09,27.05,0.715,7.808,7,2010-06-04,0
