# Handling Missing Data in Pandas

# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np

# Create a pandas Series object called 's' containing a sequence of values. The values in the Series should be "Sam", NaN (which stands for 'Not a Number'), "Tim", and "Kim". 

In [2]:
# Four entries; np.nan marks the missing value at index 1.
s = pd.Series(
    ["Sam", np.nan, "Tim", "Kim"],
)
s

0    Sam
1    NaN
2    Tim
3    Kim
dtype: object

# Check for missing or null values in the pandas Series object 's'.

In [3]:
# isna() is the same check as isnull(): True where the entry is missing.
s.isna()

0    False
1     True
2    False
3    False
dtype: bool

# Check for non-null or valid values in the pandas Series object 's'

In [4]:
# notna() is the same check as notnull(): True where the entry is present.
s.notna()

0     True
1    False
2     True
3     True
dtype: bool

# Assign the value None to the fourth element in the pandas Series object 's', which corresponds to the index label 3. Then check for null values again.

In [5]:
s[3] = None  # overwrite "Kim" (index label 3) with None
print(s)  # explicit print, since this is not the last expression of the cell
s.isnull()  # None is flagged as missing, just like NaN

0     Sam
1     NaN
2     Tim
3    None
dtype: object


0    False
1     True
2    False
3     True
dtype: bool

# Remove any null values from the pandas Series object 's'.

In [6]:
# Drops index 1 (NaN) and index 3 (None); the result is displayed, not assigned to s.
s.dropna()

0    Sam
2    Tim
dtype: object

# Import the nan value from the NumPy library and assign it the alias NA.

In [7]:
from numpy import nan as NA

# Create a pandas DataFrame object called 'df' with three rows and three columns. The DataFrame is initialized with a nested list containing the following values: [1,2,3], [4,NA,5], [NA,NA,NA]

In [22]:
# 3x3 frame: row 1 has one missing cell, row 2 is entirely missing.
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, NA, 5],
        [NA, NA, NA],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


# Remove rows from the pandas DataFrame 'df' that contain any missing or null values. 

In [24]:
# Keeps only rows with no missing values; for this frame that is row 0.
df.dropna()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


# Assign the value NA to the second column (column with index label 1) in the pandas DataFrame 'df'.

In [25]:
# A scalar on the right-hand side is written to every row of column 1.
df[1] = NA
df

Unnamed: 0,0,1,2
0,1.0,,3.0
1,4.0,,5.0
2,,,


# Remove columns from the pandas DataFrame 'df' that contain only missing or null values. 

In [26]:
# Drop only the columns in which every value is missing (column 1 here).
df.dropna(axis="columns", how="all")

Unnamed: 0,0,2
0,1.0,3.0
1,4.0,5.0
2,,


# Remove rows from the pandas DataFrame 'df' that have fewer than three non-missing values (thresh=3 keeps only rows with at least three non-missing values).

In [27]:
# thresh=3 requires at least three non-missing values per row. Column 1 is
# entirely NA at this point, so no row qualifies and df1 comes back empty.
df1 = df.dropna(thresh=3)
df1

Unnamed: 0,0,1,2


# Fill missing or null values in the pandas DataFrame 'df' with the value 0

In [28]:
# The filled frame is only displayed; it is not assigned back to df.
df.fillna(0)

Unnamed: 0,0,1,2
0,1.0,0.0,3.0
1,4.0,0.0,5.0
2,0.0,0.0,0.0


# Fill missing or null values in the pandas DataFrame 'df' with specific values provided in a dictionary.

In [29]:
# Dictionary keys are column labels; each column gets its own fill value.
df.fillna(value={0: 15, 1: 25, 2: 35})

Unnamed: 0,0,1,2
0,1.0,25.0,3.0
1,4.0,25.0,5.0
2,15.0,25.0,35.0


# Fill missing or null values in the pandas DataFrame 'df' with the value 0, directly modifying the original DataFrame without creating a new one.

In [30]:
df  # still contains NaN: the earlier fillna results were not assigned back

Unnamed: 0,0,1,2
0,1.0,,3.0
1,4.0,,5.0
2,,,


In [31]:
# inplace=True writes the zeros into df itself, so df is displayed
# afterwards to show the change.
df.fillna(value=0, inplace=True)
df

Unnamed: 0,0,1,2
0,1.0,0.0,3.0
1,4.0,0.0,5.0
2,0.0,0.0,0.0


# Create a pandas DataFrame object called 'df' with three rows and three columns. The DataFrame is initialized with a nested list containing the following values: [1,2,3], [4,NA,5], [NA,NA,NA]

In [32]:
# Rebuild the original 3x3 frame (df was overwritten with zeros above).
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, NA, 5],
        [NA, NA, NA],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,,5.0
2,,,


# Fill missing or null values in the pandas DataFrame 'df' using the forward fill method. Replace the missing values with the last observed non-null value in the same column.

In [33]:
# Forward fill: each missing cell takes the last non-missing value above it
# in the same column. DataFrame.ffill() replaces fillna(method="ffill"),
# whose `method` argument is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,2.0,5.0


# Fill missing or null values in the pandas DataFrame 'df' using the forward fill method, but with a limit on the number of consecutive missing values to be filled.

In [34]:
# Forward fill, but fill at most one consecutive missing cell per gap
# (limit=1), so the second NaN in column 1 stays missing.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method`
# argument is deprecated since pandas 2.1.
df.ffill(limit=1)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,4.0,,5.0


# Create a pandas Series object called 'data' with four elements [1,0,NA,5]

In [35]:
# Numeric Series with one missing value at index 2.
data = pd.Series(
    [1, 0, NA, 5],
)
data

0    1.0
1    0.0
2    NaN
3    5.0
dtype: float64

# Fill the missing or null values in the pandas Series 'data' with the mean value of the non-missing values. 

In [36]:
# mean() ignores the NaN: (1 + 0 + 5) / 3 = 2.0, which fills index 2.
data.fillna(data.mean())

0    1.0
1    0.0
2    2.0
3    5.0
dtype: float64

# TASK:
# Fill the missing or null values in the pandas DataFrame 'df' with the mean value of each respective column, i.e. replace each missing value with the calculated mean of its own column.

In [37]:
# df.mean() gives one mean per column (2.5, 2.0 and 4.0 for this frame);
# fillna applies each mean to the missing cells of its own column.
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,4.0,2.0,5.0
2,2.5,2.0,4.0
