# Missing Data

In [1]:
import pandas as pd
import numpy as np

In [4]:
# np.nan is NumPy's floating-point "not a number" marker; it stands in for the missing entry.
sales = [
    0,
    5,
    155,
    np.nan,
    518,
]

In [6]:
# Build a named Series from the list; the repr shows the values are stored as float64.
sales_series = pd.Series(
    data=sales,
    name="Sales",
)
sales_series

0      0.0
1      5.0
2    155.0
3      NaN
4    518.0
Name: Sales, dtype: float64

In [8]:
sales_series + 2  # arithmetic on a NaN value yields NaN, so the missing entry stays missing

0      2.0
1      7.0
2    157.0
3      NaN
4    520.0
Name: Sales, dtype: float64

In [11]:
sales_series.add(2, fill_value=0)  # fill_value=0 substitutes 0 for NaN before adding, so row 3 becomes 2.0

0      2.0
1      7.0
2    157.0
3      2.0
4    520.0
Name: Sales, dtype: float64

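pandas introduced its own missing-value marker, `pd.NA`, in version 1.0 (January 2020)

            It allows missing values to be stored alongside integers (using a nullable dtype such as `Int64`, which must be requested explicitly) instead of forcing a conversion to float
            It is still a new, experimental feature, and most bugs you will run into end up converting the data back to NumPy `NaN`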

In [12]:
# Same kind of list as the NaN example, but the gap is marked with pandas' own pd.NA.
# Note: this rebinds the name `sales` used by the earlier NaN example.
sales = [
    0,
    5,
    210,
    pd.NA,
    576,
]

In [13]:
# No dtype is given, so this Series is inferred as object dtype (see its repr);
# pass dtype="Int64" to get pandas' nullable integer storage for pd.NA.
ss = pd.Series(sales, name='Sales Record')

In [14]:
ss  # bare last expression: the notebook renders the Series, with the gap shown as <NA>

0       0
1       5
2     210
3    <NA>
4     576
Name: Sales Record, dtype: object

# Identifying Missing Data

The `.isna()` and `.value_counts()` methods are used to identify missing data in a Series

In [38]:
# Status labels; np.nan marks the three missing entries.
cl = [
    'Complete',
    np.nan,
    np.nan,
    np.nan,
    'Complete',
    'Incomplete',
    'Incomplete',
    'Complete',
]

In [39]:
# Wrap the status list in a Series (strings plus NaN are held as object dtype, per the repr).
cls = pd.Series(cl)

In [40]:
cls  # display the Series; rows 1-3 are the missing ones

0      Complete
1           NaN
2           NaN
3           NaN
4      Complete
5    Incomplete
6    Incomplete
7      Complete
dtype: object

In [41]:
cls.isna()    # .isna() returns True where a value is missing and False otherwise

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7    False
dtype: bool

In [42]:
cls.isna().sum()   # summing the boolean mask counts the True entries, i.e. the missing (NaN) values

3

In [43]:
cls.value_counts()  # .value_counts() returns each unique value and its frequency; NaN is left out by default

Complete      3
Incomplete    2
dtype: int64

In [46]:
cls.value_counts(dropna=False)   # dropna=False also reports the count of NaN values

NaN           3
Complete      3
Incomplete    2
dtype: int64

# Handling Missing Data

The `.dropna()` method removes NaN values


The `.fillna(argument)` method replaces NaN with the specified value

In [48]:
# Mixed ints and strings, with np.nan at positions 2, 4 and 5.
data = [
    256,
    'IPL',
    np.nan,
    'Sachin',
    np.nan,
    np.nan,
    254,
    369,
    'Srilanka',
]

In [49]:
# Build a Series from the mixed-type list.
ps = pd.Series(data)

In [51]:
ps  # display the Series; the repr reports dtype object for this mix of ints, strings and NaN

0         256
1         IPL
2         NaN
3      Sachin
4         NaN
5         NaN
6         254
7         369
8    Srilanka
dtype: object

In [53]:
# Count the missing (NaN) entries in ps.
ps.isna().sum()

3

In [55]:
# Drop every NaN entry; dropna() returns a new Series and leaves ps itself unchanged.
k = ps.dropna()

In [56]:
k  # original index labels are kept, so labels 2, 4 and 5 are simply absent

0         256
1         IPL
3      Sachin
6         254
7         369
8    Srilanka
dtype: object

In [58]:
k.isna().sum()  # 0 confirms that no missing values remain after dropna()

0

In [60]:
# Replace each NaN with the string 'Hyderabad'; fillna() returns a new Series.
x = ps.fillna('Hyderabad')

In [61]:
x  # display the filled Series; rows 2, 4 and 5 now hold 'Hyderabad'

0          256
1          IPL
2    Hyderabad
3       Sachin
4    Hyderabad
5    Hyderabad
6          254
7          369
8     Srilanka
dtype: object

In [63]:
x.isna().sum()  # 0 confirms that no missing values remain after fillna()

0