# pandas lesson 4 (Handle Missing Data)

The isna() method and its alias isnull() detect missing data.  The corresponding notna() and notnull() methods detect values that are present.  
The fillna() and dropna() methods can fill in or remove missing data.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # pandas uses matplotlib for plotting

In these examples we will use a DataFrame of car models.  It has missing values in some rows of three columns: price, country_of_manufacture and is_electric.  The data is fictitious.

In [None]:
# Fictitious car catalogue: five makes with four models each (20 rows).
# Rows at positions 3, 7, 10, 15 and 19 carry None in price,
# country_of_manufacture and is_electric, giving us missing data to handle.
data = dict(
    make=[
        brand
        for brand in ("Toyota", "Honda", "Ford", "Chevrolet", "Nissan")
        for _ in range(4)  # one entry per model of this make
    ],
    model=[
        "Camry", "Corolla", "Prius", "Passat",
        "Civic", "Accord", "CR-V", "Cortina",
        "F-150", "Mustang", "Beetle", "Explorer",
        "Malibu", "Tahoe", "Impala", "Camaro",
        "Altima", "Sentra", "Maxima", "Frontier",
    ],
    price=[
        24000, 20000, 25000, None,
        22000, 26000, 30000, None,
        30000, 35000, None, 32000,
        22000, 50000, 28000, None,
        24000, 19000, 35000, None,
    ],
    country_of_manufacture=[
        "Japan", "Europe", "Japan", None,
        "Japan", "Japan", "Europe", None,
        "USA", "USA", None, "USA",
        "USA", "USA", "USA", None,
        "Japan", "Japan", "Japan", None,
    ],
    is_electric=[
        True, False, True, None,
        False, False, False, None,
        False, False, None, True,
        False, False, False, None,
        False, False, True, None,
    ],
)

# Each key becomes a column; every list supplies that column's 20 values.
df = pd.DataFrame(data)
df


Find missing data

In [None]:
# True/False mask of missing entries (True = missing) for the first 5 rows only
df.head().isna()

In [None]:
# Summing the True/False mask down the rows gives the number of missing values per column
df.isna().sum(axis="index")

In [None]:
# Average of the price column; skipna=True leaves the missing entries out of the calculation
mean_price = df["price"].mean(skipna=True)
mean_price

In [None]:
# Replace every missing price with the column mean held in mean_price,
# then preview the first five rows to see the result.
df["price"] = df["price"].fillna(value=mean_price)
df.head(5)

In [None]:
# Missing country_of_manufacture entries get the placeholder text 'Unknown'
df["country_of_manufacture"] = df["country_of_manufacture"].fillna(value="Unknown")

In [None]:
# Remove every row whose is_electric value is missing; only that column is
# inspected (subset), so missing values elsewhere would not cause a drop.
df = df.dropna(axis="index", subset=["is_electric"])
df