In [2]:
# Chapter 7 Data Cleaning and Preparation
# 7.1 Handling Missing Values
import pandas as pd
import numpy as np
# Create a DataFrame with missing values
float_data = pd.Series([1.2, -3.5, np.nan, 0])
# Fill missing values with the mean of the series
float_data 


0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [3]:
float_data.isna() 
# Fill missing values with the mean

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# In pandas, we've adopted a convention used in 
# the R programming language, where the missing value is represented by NaN,
# which stands for "Not a Number". and called NaN (Not a Number) or 
# Not Available.

# In statistics, missing values are often referred to as 
# NA (Not Available).

# When cleaning data, we often need to handle missing values.

float_data.fillna(float_data.mean(), inplace=True)
# Check if there are any missing values 
float_data.isna().any()  # Should return False, indicating no missing values


np.False_

In [5]:
string_data = pd.Series(['cascha',np.nan,'None', 'cassava'])
# Fill missing values with a specific string
string_data  

0     cascha
1        NaN
2       None
3    cassava
dtype: object

In [6]:
float_data = pd.Series([1, 2, None], dtype='float64')

float_data


0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
float_data.isna()  # Check for missing values
# Fill missing values with a specific value

0    False
1    False
2     True
dtype: bool

In [8]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.fillna(0, inplace=True)  # Fill NaN with 0
data

0    1.0
1    0.0
2    3.5
3    0.0
4    7.0
dtype: float64

In [9]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna(inplace=True)  # Drop rows with NaN values
data

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data.dropna()# Drop rows with NaN values
# Fill NaN with a specific value

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data[data.notna()]  # Select rows that are not NaN

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data1 = pd.DataFrame([[1., 6.5, 3.],[1.,np.nan, np.nan],
                    [np.nan, np.nan, np.nan],[np.nan, 6.5, 3.]])
data1
# Fill NaN values with the mean of each column


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
data1.dropna()  # Drop rows with any NaN values

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
# passing how ='all' will drop rows where all elements are NaN
data1.dropna(how='all')  # Drop rows where all elements are NaNS

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
data1[4] = np.nan  # Introduce a NaN value
data1 

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
data1[5] = [1, 2, 3, 4]  # Add a new column
data1

Unnamed: 0,0,1,2,4,5
0,1.0,6.5,3.0,,1
1,1.0,,,,2
2,,,,,3
3,,6.5,3.0,,4


In [17]:
data1.dropna(axis='columns', how='all') 
# Drop columns with any NaN values

Unnamed: 0,0,1,2,5
0,1.0,6.5,3.0,1
1,1.0,,,2
2,,,,3
3,,6.5,3.0,4


In [18]:
df = pd.DataFrame(np.random.standard_normal((7, 3)), 
                  index=['a', 'b', 'c', 'd', 'e', 'f', 'g'], 
                  columns=['one', 'two', 'three'])
# Introduce some NaN values in the DataFrame 
df.iloc[:4, 1] = np.nan  # Introduce NaN values in the second column
df.iloc[:2, 2] = np.nan  # Introduce NaN values in the third column
df

Unnamed: 0,one,two,three
a,0.520667,,
b,1.13874,,
c,1.044233,,1.39606
d,-0.691471,,0.762483
e,-1.432495,0.06363,0.401459
f,0.118348,0.016164,0.898253
g,1.751739,0.137355,1.204697


In [19]:
df.dropna()  # Drop rows with any NaN values

Unnamed: 0,one,two,three
e,-1.432495,0.06363,0.401459
f,0.118348,0.016164,0.898253
g,1.751739,0.137355,1.204697


In [20]:
df.dropna(thresh=2)  # Drop rows with less than 2 non-NaN values

Unnamed: 0,one,two,three
c,1.044233,,1.39606
d,-0.691471,,0.762483
e,-1.432495,0.06363,0.401459
f,0.118348,0.016164,0.898253
g,1.751739,0.137355,1.204697


In [21]:
df.dropna(thresh=3)


Unnamed: 0,one,two,three
e,-1.432495,0.06363,0.401459
f,0.118348,0.016164,0.898253
g,1.751739,0.137355,1.204697


In [None]:
df.dropna(thresh=4)
# Drop rows with less than 4 non-NaN values


Unnamed: 0,one,two,three


In [23]:
# Fill NaN values with the mean of each column
data1.fillna(data1.mean(), inplace=True)  
# Fill NaN with column means
data1

Unnamed: 0,0,1,2,4,5
0,1.0,6.5,3.0,,1
1,1.0,6.5,3.0,,2
2,1.0,6.5,3.0,,3
3,1.0,6.5,3.0,,4


In [24]:
df.fillna(0)

Unnamed: 0,one,two,three
a,0.520667,0.0,0.0
b,1.13874,0.0,0.0
c,1.044233,0.0,1.39606
d,-0.691471,0.0,0.762483
e,-1.432495,0.06363,0.401459
f,0.118348,0.016164,0.898253
g,1.751739,0.137355,1.204697


In [25]:
df.fillna({'two': 0.5, 'three': 0})
# Fill NaN values in specific columns with different values

Unnamed: 0,one,two,three
a,0.520667,0.5,0.0
b,1.13874,0.5,0.0
c,1.044233,0.5,1.39606
d,-0.691471,0.5,0.762483
e,-1.432495,0.06363,0.401459
f,0.118348,0.016164,0.898253
g,1.751739,0.137355,1.204697


In [26]:
# The same interpolation methods available 
# for reindexing can be used with fillna.

df1 = pd.DataFrame(np.random.standard_normal((6, 3)),
                   index=['a', 'b', 'c', 'd', 'e', 'f'], 
                   columns=['one', 'two', 'three'])
df1.iloc[2:, 1] = np.nan  
# Introduce NaN values in the second column

df1.iloc[4:, 2] = np.nan
# Introduce NaN values in the third column
df1
# Fill NaN values in the second column with the mean of that column

Unnamed: 0,one,two,three
a,-0.767748,1.172008,2.540245
b,-0.138089,-0.429519,0.715568
c,-0.586316,,1.188167
d,0.601344,,-1.690363
e,0.254933,,
f,0.723913,,


In [27]:
df1.fillna(method='ffill', inplace=True)  
# Forward fill NaN values
df1 

  df1.fillna(method='ffill', inplace=True)


Unnamed: 0,one,two,three
a,-0.767748,1.172008,2.540245
b,-0.138089,-0.429519,0.715568
c,-0.586316,-0.429519,1.188167
d,0.601344,-0.429519,-1.690363
e,0.254933,-0.429519,-1.690363
f,0.723913,-0.429519,-1.690363


In [28]:
df1.fillna(method='ffill', limit=2)
# Forward fill NaN values with a limit of 2
df1

  df1.fillna(method='ffill', limit=2)


Unnamed: 0,one,two,three
a,-0.767748,1.172008,2.540245
b,-0.138089,-0.429519,0.715568
c,-0.586316,-0.429519,1.188167
d,0.601344,-0.429519,-1.690363
e,0.254933,-0.429519,-1.690363
f,0.723913,-0.429519,-1.690363


In [29]:
df1.fillna(df1.mean()['two'], inplace=True)
# Fill NaN values in the third column with the mean of that column
df1

Unnamed: 0,one,two,three
a,-0.767748,1.172008,2.540245
b,-0.138089,-0.429519,0.715568
c,-0.586316,-0.429519,1.188167
d,0.601344,-0.429519,-1.690363
e,0.254933,-0.429519,-1.690363
f,0.723913,-0.429519,-1.690363


In [30]:
data1.fillna(data1.mean(), inplace=True)
# Fill NaN values in the DataFrame with the mean of each column
data1

Unnamed: 0,0,1,2,4,5
0,1.0,6.5,3.0,,1
1,1.0,6.5,3.0,,2
2,1.0,6.5,3.0,,3
3,1.0,6.5,3.0,,4


In [31]:
data.fillna(data.mean(), inplace=True)
# Fill NaN values in the Series with the mean
data
# Fill NaN values in the DataFrame with the mean of each column

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64