# Handling duplicates in pandas

In [1]:
import pandas as pd

In [2]:
# Small sample frame in which several rows repeat exactly
data = pd.DataFrame(
    {
        'Column A': ['one'] * 4 + ['two'] * 4,
        'Column B': [1, 1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,Column A,Column B
0,one,1
1,one,1
2,one,1
3,one,2
4,two,3
5,two,3
6,two,4
7,two,4


In [3]:
# Flag each row that exactly repeats an earlier row; the first occurrence stays False
data.duplicated()

0    False
1     True
2     True
3    False
4    False
5     True
6    False
7     True
dtype: bool

In [4]:
# Compare only the values in 'Column B' when looking for duplicates
data.duplicated(subset=['Column B'])

0    False
1     True
2     True
3    False
4    False
5     True
6    False
7     True
dtype: bool

In [5]:
# Number of duplicate rows: each True in the mask counts as 1,
# so the first occurrence of every repeated row is not counted
data.duplicated().sum()

4

In [6]:
# Show only the rows flagged as duplicates (first occurrences are excluded)
data.loc[data.duplicated()]

Unnamed: 0,Column A,Column B
1,one,1
2,one,1
5,two,3
7,two,4


In [7]:
# With keep='last' the final occurrence of each repeated row is the one left
# as False; all earlier occurrences are flagged True
data.duplicated(keep='last')

0     True
1     True
2    False
3    False
4     True
5    False
6     True
7    False
dtype: bool

In [8]:
# Drop repeated rows, keeping only the last occurrence of each
data.drop_duplicates(keep='last')

Unnamed: 0,Column A,Column B
2,one,1
3,one,2
5,two,3
7,two,4


In [9]:
# Return only the rows that occur exactly once: keep=False drops every
# member of a duplicate group, including its first occurrence
data.drop_duplicates(keep=False)

Unnamed: 0,Column A,Column B
3,one,2


In [10]:
# Select every row that belongs to a duplicate group, originals included
data.loc[data.duplicated(keep=False)]

Unnamed: 0,Column A,Column B
0,one,1
1,one,1
2,one,1
4,two,3
5,two,3
6,two,4
7,two,4


In [11]:
# Count the rows that belong to any duplicate group, originals included
len(data[data.duplicated(keep=False)])

7

In [12]:
# Remove duplicate rows from `data` itself: with inplace=True the existing
# DataFrame is modified and no new copy is returned
data.drop_duplicates(inplace=True)
data

Unnamed: 0,Column A,Column B
0,one,1
3,one,2
4,two,3
6,two,4
