Detecting Null Values

In [1]:
import pandas as pd
df=pd.DataFrame({
    'Name':['sailesh','Thrinadh',None,'Nathaniel','Gnan'],
    'Age':[21,22,21,None,23],
    'City':['Hyd','Chennai','gachibowli','Guntur',None]
})
print(df.isnull())

    Name    Age   City
0  False  False  False
1  False  False  False
2   True  False  False
3  False   True  False
4  False  False   True


Counting Nulls in a single column

In [2]:
print(df.isnull().sum())

Name    1
Age     1
City    1
dtype: int64


Counting Nulls in entire data

In [3]:
print(df.isnull().sum().sum())

3


How to check Null values in the data

In [4]:
print(df.isnull().values.any())

True


Handling Null Values


Dropping Null Values dropna()

In [None]:
#drops any row with atleast one null
df.dropna()

Unnamed: 0,Name,Age,City
0,sailesh,21.0,Hyd
1,Thrinadh,22.0,Chennai


In [None]:
#drops cloumns with nulls
df.dropna(axis=1)

0
1
2
3
4


In [8]:
#Drop rows only if all values are null:

df.dropna(how='all')

Unnamed: 0,Name,Age,City
0,sailesh,21.0,Hyd
1,Thrinadh,22.0,Chennai
2,,21.0,gachibowli
3,Nathaniel,,Guntur
4,Gnan,23.0,


In [9]:
#Drop rows with less than certain number of non-null values:

df.dropna(thresh=2)

Unnamed: 0,Name,Age,City
0,sailesh,21.0,Hyd
1,Thrinadh,22.0,Chennai
2,,21.0,gachibowli
3,Nathaniel,,Guntur
4,Gnan,23.0,


In [10]:
#Drop rows with null in specific columns:

df.dropna(subset=['Age'])

Unnamed: 0,Name,Age,City
0,sailesh,21.0,Hyd
1,Thrinadh,22.0,Chennai
2,,21.0,gachibowli
4,Gnan,23.0,


Filling Nul Values: fillna()


In [14]:
#Replace nulls with a specific value.

df['Age'].fillna(0.0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(0.0, inplace=True)


In [15]:
print(df)

        Name   Age        City
0    sailesh  21.0         Hyd
1   Thrinadh  22.0     Chennai
2       None  21.0  gachibowli
3  Nathaniel   0.0      Guntur
4       Gnan  23.0        None


In [16]:
#Fill with mean, median, or mode (useful for numerical data):

mean_age = df['Age'].mean()
df['Age'].fillna(mean_age, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(mean_age, inplace=True)


In [17]:
print(df)

        Name   Age        City
0    sailesh  21.0         Hyd
1   Thrinadh  22.0     Chennai
2       None  21.0  gachibowli
3  Nathaniel   0.0      Guntur
4       Gnan  23.0        None


Identifying and removing duplicates in the data

Detect duplicates with duplicated()

In [18]:
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Alice', 'David'],
    'Age': [25, 30, 25, 40]
})

print(df.duplicated())

0    False
1    False
2     True
3    False
dtype: bool


Remove duplicates with drop_duplicates()

In [19]:
df.drop_duplicates()

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
3,David,40


In [20]:
#To keep the last occurrence:

df.drop_duplicates(keep='last')


Unnamed: 0,Name,Age
1,Bob,30
2,Alice,25
3,David,40


In [21]:
#Remove duplicates based on subset of columns:

df.drop_duplicates(subset=['Name'])

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
3,David,40


Renaming columns with rename()

In [22]:
df = pd.DataFrame({
    'Nm': ['Alice', 'Bob'],
    'Ag': [25, 30]
})

df.rename(columns={'Nm': 'Name', 'Ag': 'Age'}, inplace=True)
print(df)

    Name  Age
0  Alice   25
1    Bob   30


Example 1: Detect and Fill Nulls

In [23]:
import pandas as pd

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', None, 'David'],
    'Age': [25, None, 35, 40]
})

print("Before filling nulls:")
print(df)

df['Name'].fillna('Unknown', inplace=True)
df['Age'].fillna(df['Age'].mean(), inplace=True)

print("nAfter filling nulls:")
print(df)

Before filling nulls:
    Name   Age
0  Alice  25.0
1    Bob   NaN
2   None  35.0
3  David  40.0
nAfter filling nulls:
      Name        Age
0    Alice  25.000000
1      Bob  33.333333
2  Unknown  35.000000
3    David  40.000000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Name'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


Example 2: Remove duplicate rows based on ‘Name’

In [24]:
import pandas as pd
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Alice', 'David'],
    'Age': [25, 30, 25, 40]
})

print("Before removing duplicates:")
print(df)

df = df.drop_duplicates(subset=['Name'])
print("nAfter removing duplicates:")
print(df)

Before removing duplicates:
    Name  Age
0  Alice   25
1    Bob   30
2  Alice   25
3  David   40
nAfter removing duplicates:
    Name  Age
0  Alice   25
1    Bob   30
3  David   40


Example 3: Rename columns for clarity

In [25]:
import pandas as pd
df = pd.DataFrame({
    'Nm': ['Alice', 'Bob'],
    'Ag': [25, 30]
})

print("Before renaming:")
print(df)

df.rename(columns={'Nm': 'Name', 'Ag': 'Age'}, inplace=True)

print("nAfter renaming:")
print(df)

Before renaming:
      Nm  Ag
0  Alice  25
1    Bob  30
nAfter renaming:
    Name  Age
0  Alice   25
1    Bob   30


In [5]:
df.copy()

Unnamed: 0,Name,Age,City
0,sailesh,21.0,Hyd
1,Thrinadh,22.0,Chennai
2,,21.0,gachibowli
3,Nathaniel,,Guntur
4,Gnan,23.0,
