# Removing the duplicates

Using pandas `DataFrame.drop_duplicates()`

In [8]:
import pandas as pd

# Sample records: rows 0 and 2 hold the same value in every column.
data = dict(
    Name=["Ngima", "Rabi", "Ngima", "Himal"],
    Age=[21, 24, 21, 27],
    City=["Ktm", "Bkt", "Ktm", "Chicago"],
)
df = pd.DataFrame(data)
print(f"Original DataFrame:\n{df}")

# Called with no arguments, drop_duplicates() compares whole rows and
# returns a new DataFrame; `df` itself is left unchanged.
df_cleaned = df.drop_duplicates()
print(f"\nNo duplicate DataFrame:\n{df_cleaned}")

Original DataFrame:
    Name  Age     City
0  Ngima   21      Ktm
1   Rabi   24      Bkt
2  Ngima   21      Ktm
3  Himal   27  Chicago

No duplicate DataFrame:
    Name  Age     City
0  Ngima   21      Ktm
1   Rabi   24      Bkt
3  Himal   27  Chicago


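Note: By default, drop_duplicates() compares all columns and removes rows that are completely identical, keeping the first occurrence. The original index labels are preserved, which is why label 2 is missing from the output.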

In [9]:
# Dropping Duplicates Based on Specific Columns

import pandas as pd

# Rows 0 and 2 share the same Name and Age but differ in City.
data = dict(
    Name=["Ngima", "Rabi", "Ngima", "Himal"],
    Age=[21, 24, 21, 27],
    City=["Ktm", "Bkt", "Lalitpur", "Chicago"],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Only the column given in `subset` is compared when looking for duplicates.
df_cleaned = df.drop_duplicates(subset="Name")
print("\nModified DataFrame (no duplicates)", df_cleaned, sep="\n")

Original DataFrame:
    Name  Age      City
0  Ngima   21       Ktm
1   Rabi   24       Bkt
2  Ngima   21  Lalitpur
3  Himal   27   Chicago

Modified DataFrame (no duplicates)
    Name  Age     City
0  Ngima   21      Ktm
1   Rabi   24      Bkt
3  Himal   27  Chicago


Note: Here duplicates are identified using only the Name column; Age and City are ignored when comparing rows. The first Ngima row (City = Ktm) is kept and the later one (City = Lalitpur) is dropped.

In [11]:
# Keeping the Last Occurrence of Duplicates

import pandas as pd

# Rows 0 and 2 are identical in every column.
data = dict(
    Name=["Ngima", "Rabi", "Ngima", "Himal"],
    Age=[21, 24, 21, 27],
    City=["Ktm", "Bkt", "Ktm", "Chicago"],
)
df = pd.DataFrame(data)

# keep="last" retains the final row of each duplicate group.
df_cleaned = df.drop_duplicates(keep="last")

print("\nModified DataFrame (no duplicates)", df_cleaned, sep="\n")


Modified DataFrame (no duplicates)
    Name  Age     City
1   Rabi   24      Bkt
2  Ngima   21      Ktm
3  Himal   27  Chicago


Note: With keep='last', the last occurrence of the duplicated Ngima row (index 2) is kept and the first occurrence (index 0) is removed.

In [12]:
# Dropping All Duplicates

import pandas as pd

# Rows 0 and 2 are identical in every column.
data = dict(
    Name=["Ngima", "Rabi", "Ngima", "Himal"],
    Age=[21, 24, 21, 27],
    City=["Ktm", "Bkt", "Ktm", "Chicago"],
)
df = pd.DataFrame(data)

# keep=False discards every member of a duplicate group, not just the extras.
df_cleaned = df.drop_duplicates(keep=False)

print("\nModified DataFrame (no duplicates)", df_cleaned, sep="\n")


Modified DataFrame (no duplicates)
    Name  Age     City
1   Rabi   24      Bkt
3  Himal   27  Chicago


Note: With keep=False, both occurrences of the duplicated Ngima row are removed, leaving only the rows that have no duplicate anywhere in the DataFrame.

In [13]:
# Modifying the Original DataFrame Directly

import pandas as pd

# Rows 0 and 2 are identical in every column.
data = dict(
    Name=["Ngima", "Rabi", "Ngima", "Himal"],
    Age=[21, 24, 21, 27],
    City=["Ktm", "Bkt", "Ktm", "Chicago"],
)
df = pd.DataFrame(data)

# inplace=True updates `df` itself; the call returns None, so nothing is assigned.
df.drop_duplicates(inplace=True)

print("\nModified DataFrame (no duplicates)", df, sep="\n")


Modified DataFrame (no duplicates)
    Name  Age     City
0  Ngima   21      Ktm
1   Rabi   24      Bkt
3  Himal   27  Chicago


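Note: Using inplace=True modifies the original DataFrame directly, so the result does not need to be assigned to a new variable (the call itself returns None). It should not be relied on to save memory: pandas generally still builds the de-duplicated result internally before replacing the original, so assigning the result, as in df = df.drop_duplicates(), is usually the clearer choice.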