## Handling Missing Data 

In [9]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import sklearn 
import scipy 
from pathlib import Path


Creating functions that we will work with

* read_dataset
* plot_dfs
* calculate RMSE score

In [7]:
# Function: read_dataset 
def read_dataset(folder, file, date_col=None):
    """
    folder: is a Path object 
    file: the CSV filename
    date_col: specify a date_col to ues for index_col 

    returns: a pandas DataFrame with a DatetimeIndex 
    """

    df = pd.read_csv(folder / file, 
                     index_col=date_col, 
                     parse_dates=[date_col])
    
    return df 

def plot_dfs(df1, df2, col, title=None, xlabel=None, ylabel=None): 
    """ 
    df1: original dataframe without missing data
    df2: dataframe with missing data
    col: column name that contains missing data 
    """
    df_missing = df2.rename(columns={col: 'missing'})

    columns = df_missing.loc[:, 'missing':].columns.tolist()
    subplots_size = len(columns)
    fig, ax = plt.subplots(subplots_size+1, 1, sharex=True)
    plt.subplots_adjust(hspace=0.25)
    fig.suptitle = title 

    df1[col].plot(ax=ax[0], figsize=(10, 12))
    ax[0].set_title('Original Dataset')
    ax[0].set_xlabel(xlabel)
    ax[0].set_ylabel(ylabel)

    for i, colname in enumerate(columns):
        df_missing[colname].plot(ax=ax[i+1])
        ax[i+1].set_title(colname.upper())
    
    plt.show()

def rmse_score(df1, df2, col=None):
    """ 
    df1: original dataframe without mising data
    df2: dataframe with missing data 
    col: column name that contains missing data 
    returns: a list of scores
    """

    df_missing = df2.rename(columns={col: 'missing'})
    columns = df_missing.loc[:, 'missing':].columns.tolist()
    scores = []
    for comp_col in columns[1:]: 
        rmse = np.sqrt(np.mean((df1[col] - df_missing[comp_col])**2))
        scores.append(rmse)
        print(f'RMSE for {comp_col}: {rmse}')
    return scores 

#### Understanding missing data 

* In this chapter we will use the RMSE to evaluate the different imputation techniques


##### Performing data quality checks 
* Missing data - values not captured or observed in the dataset 
 - when ingesting the data using pandas, missing values will show up as either Nan, NaT or NA
  

In [10]:
# Reading data 
co2 = Path('../TimeSeriesAnalysisWithPythonCookbook/Data/co2_missing.csv')
ecom = Path('../TimeSeriesAnalysisWithPythonCookbook/Data/clicks_missing.csv')

In [17]:
co2_df = pd.read_csv(co2, index_col='year', parse_dates=True)
ecom_df = pd.read_csv(ecom, index_col='date', parse_dates=True)

Unnamed: 0_level_0,price,location,clicks
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-04-01,43.155647,2,18784.0
2008-04-02,43.079056,1,24738.0
2008-04-03,43.842609,2,15209.0
2008-04-04,43.312376,1,14018.0
2008-04-05,43.941176,1,11974.0
2008-04-06,44.403936,1,11007.0
2008-04-07,43.995888,2,15214.0
2008-04-08,43.373773,1,11333.0
2008-04-09,43.320312,1,7026.0
2008-04-10,43.154738,5,15677.0


In [20]:
# To count the number of missing values in both DataFrames, we can use the DataFrame.isnull()
# - this will return True (if missing) or False (if not missing) for each value! 

#Try - Booleans 
isinstance(True, int)
int(True)

1

In [28]:
# Get the total number of missing values for each DataFrame 
#co2_df.isna().sum()

ecom_df.isnull().sum()

price        0
location     0
clicks      16
dtype: int64

* Notice: .isna() and .isnull() can be used interchangeably - this means that they are are an alias of each other! 
* form the results co2 has 25 missing values from the co2 column
* ecom_df has 19 missing values in total (3 from date, 1 from price, 1 from location, 14 from clicks)

In [27]:
# To get the grand total for the entire ecom_df DataFrame, just chain another .sum() function to the end of the statement
ecom_df.isnull().sum().sum()

16

In [30]:
co2_df[190:195]

Unnamed: 0_level_0,co2
year,Unnamed: 1_level_1
1985-01-01,
1986-01-01,
1987-01-01,
1988-01-01,4.2953
1989-01-01,4.2782


In [32]:
# Check whether the DataFrame contains any missing values 
ecom_df.isnull().values.any()
co2_df.isnull().values.any()

True

In [33]:
# Use DataFrame.info() to display the schema, total records, columns names, columns dtypes, count of non-missing values per columnn, index dtype and the DataFrame's total memory usage:
ecom_df.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 135 entries, 2008-04-01 to 2008-08-13
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     135 non-null    float64
 1   location  135 non-null    int64  
 2   clicks    119 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 4.2 KB


In [34]:
# To get the summary statistics for a DataFame, use the DataFrame.describe() method
co2_df.describe(include='all', datetime_is_numeric=True)


Unnamed: 0,co2
count,201.0
mean,1.590015
std,1.644182
min,0.0
25%,0.0764
50%,0.9351
75%,2.8076
max,4.9079


In [35]:
ecom_df.describe(include='all', datetime_is_numeric=True) 

Unnamed: 0,price,location,clicks
count,135.0,135.0,119.0
mean,43.478978,1.696296,9530.336134
std,0.608467,1.114853,4687.587507
min,42.207018,1.0,2044.0
25%,43.045714,1.0,6438.0
50%,43.487069,1.0,8391.0
75%,43.886875,2.0,11363.5
max,45.801613,5.0,29505.0


In [37]:
# Convert the 0 and ? values to NaN types. This can be accomplished using the DataFrame.replace() method
co2_df.replace(0, np.NaN, inplace=True)


Unnamed: 0_level_0,co2
year,Unnamed: 1_level_1
1750-01-01,0.0125
1760-01-01,0.0128
1770-01-01,0.0150
1780-01-01,0.0169
1790-01-01,0.0206
...,...
2016-01-01,4.7496
2017-01-01,4.7595
2018-01-01,4.8022
2019-01-01,4.7582


In [39]:
ecom_df.replace('?', np.NaN, inplace=True) 


Unnamed: 0_level_0,price,location,clicks
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-04-01,43.155647,2,18784.0
2008-04-02,43.079056,1,24738.0
2008-04-03,43.842609,2,15209.0
2008-04-04,43.312376,1,14018.0
2008-04-05,43.941176,1,11974.0
...,...,...,...
2008-08-09,44.182033,1,6716.0
2008-08-10,43.608260,1,9523.0
2008-08-11,43.553363,1,8881.0
2008-08-12,44.500469,1,7272.0


In [41]:
ecom_df['clicks'] = ecom_df['clicks'].astype('float')
ecom_df

Unnamed: 0_level_0,price,location,clicks
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-04-01,43.155647,2,18784.0
2008-04-02,43.079056,1,24738.0
2008-04-03,43.842609,2,15209.0
2008-04-04,43.312376,1,14018.0
2008-04-05,43.941176,1,11974.0
...,...,...,...
2008-08-09,44.182033,1,6716.0
2008-08-10,43.608260,1,9523.0
2008-08-11,43.553363,1,8881.0
2008-08-12,44.500469,1,7272.0


In [46]:
# Convert 0 and ? values to NaN types. This can be accomplished using the DataFrame.replace() method:

#co2_df.replace(0, np.NaN, inplace=True)
#ecom_df.replace('?', np.NaN, inplace=True)
ecom_df['click'] = ecom_df['clicks'].astype('float')


In [47]:
co2_df.isnull().sum()

co2    35
dtype: int64

In [49]:
# If we know that the data will always contain a NaN value (like '?') which should be converted to NaN (or any other value), the we can utilize the pd.read_csv() function and update the na_values parameter
pd.read_csv(ecom, parse_dates=['date'], na_values={'?'})

# - this will replace all instance of '?' with NaN

Unnamed: 0,date,price,location,clicks
0,2008-04-01,43.155647,2,18784.0
1,2008-04-02,43.079056,1,24738.0
2,2008-04-03,43.842609,2,15209.0
3,2008-04-04,43.312376,1,14018.0
4,2008-04-05,43.941176,1,11974.0
...,...,...,...,...
130,2008-08-09,44.182033,1,6716.0
131,2008-08-10,43.608260,1,9523.0
132,2008-08-11,43.553363,1,8881.0
133,2008-08-12,44.500469,1,7272.0


## Handling Missing data with Univariate imputation using Pandas 

There are generally two approaches to imputing missing values: 
* univariate imputation 
* multivariate imputation 

Some basic univariate imputation techniques include the following: 
* Imputing using the _mean_ 
* Imputing using the last observation forward (**forward fill**). This can be referred to as **Last Observation Carried Forward (LOCF)**
* Imputing using the next observation backward (**backward fill**). This can be referred to as Next Observation Carried Backward (NOCB)

