# Chapter 7

In [2]:
import sys
import numpy as np
import pandas as pd
import csv
import json
from lxml import objectify
import pyarrow
import openpyxl
import xlrd
from bs4 import BeautifulSoup
import sqlalchemy as sqla
import sqlite3
import os
from pathlib import Path

### Handling Missing Data

In [3]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])

In [4]:
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [5]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])

In [7]:
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [8]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [9]:
float_data = pd.Series([1,2,None], dtype='float64')

In [10]:
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [11]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

## NA Handling Objects Methods<br>
**dropna**<br>
*Filter axis labels based on whether the values for each label have missing datd , with varying threshholds for how much missing data to tolerate.* <br>
**fillna**<br>
*Fill in missing data with some value or using an interpolation method such as "ffill" or "bfill".* <br>
**isna**<br>
*Return Boolean values indicating which values are missing/NA.* <br>
**notna**<br>
*Negation of isna, returns True for non_NA values and False for NA values.

### Filtering out missing data

In [12]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [13]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan], [np.nan, np.nan, np.nan],[np.nan, 6.5, 3.]])

In [18]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
df = pd.DataFrame(np.random.standard_normal((7,3)))

In [22]:
df.iloc[:4, 1] = np.nan

In [23]:
df.iloc[:2, 2] = np.nan

In [24]:
df

Unnamed: 0,0,1,2
0,-0.22683,,
1,-0.008607,,
2,-0.407546,,-0.238661
3,1.569548,,0.103098
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


In [25]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


In [26]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.407546,,-0.238661
3,1.569548,,0.103098
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


# Filling in Missing Data 

In [27]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.22683,0.0,0.0
1,-0.008607,0.0,0.0
2,-0.407546,0.0,-0.238661
3,1.569548,0.0,0.103098
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


Calling fillna with a dictionary, you can specify a different fill value for each column.

In [28]:
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,-0.22683,0.5,0.0
1,-0.008607,0.5,0.0
2,-0.407546,0.5,-0.238661
3,1.569548,0.5,0.103098
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


In [29]:
df = pd.DataFrame(np.random.standard_normal((6,3)))

In [30]:
df.iloc[2:, 1] = np.nan

In [31]:
df.iloc[4:, 2] = np.nan

In [32]:
df

Unnamed: 0,0,1,2
0,0.681099,-0.578097,-0.164609
1,-1.979762,1.213784,-1.280635
2,0.999191,,1.649775
3,-1.265559,,0.549961
4,1.584091,,
5,0.938936,,


In [33]:
df.fillna(method="ffill")

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,0.681099,-0.578097,-0.164609
1,-1.979762,1.213784,-1.280635
2,0.999191,1.213784,1.649775
3,-1.265559,1.213784,0.549961
4,1.584091,1.213784,0.549961
5,0.938936,1.213784,0.549961


In [34]:
df.fillna(method='ffill', limit=2)

  df.fillna(method='ffill', limit=2)


Unnamed: 0,0,1,2
0,0.681099,-0.578097,-0.164609
1,-1.979762,1.213784,-1.280635
2,0.999191,1.213784,1.649775
3,-1.265559,1.213784,0.549961
4,1.584091,,0.549961
5,0.938936,,0.549961


In [35]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])

In [36]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### fillna Function Arguments<br>
**value**<br>
*Scalar value or dictionary-like object to use to fill missing values* <br>
**method** <br>
*Interpolation method: one of 'bfill'(backward fill) of 'ffill' (forward fill); default is None.* <br>
**axis** <br>
*Axis to fill on ("index" or "columns"); default is axis= "index"* <br>
**limit** <br>
*For forward and backfilling, maximum number of consecutive periods to fill* 

# Data Transformation<br>
### Removing Duplicates

In [38]:
data = pd.DataFrame({"k1": ['one', 'two'] * 3 + ['two'], 'k2': [1, 1, 2, 3, 3, 4, 4]})

In [39]:
data 

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [40]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [42]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [43]:
data['v1'] = range(7)

In [44]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [45]:
data.drop_duplicates(subset= ['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [46]:
data.drop_duplicates(['k1', 'k2'], keep= 'last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping 

In [47]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon", "pastrami", "corned beef", "bacon", "pastrami", "honey ham", "nova lox"], "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [48]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [50]:
meat_to_animal = {
    "bacon": "pig", 
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon"
}

In [51]:
data['animal'] = data['food'].map(meat_to_animal)

In [52]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [54]:
def get_animal(x):
    return meat_to_animal[x]

In [55]:
data['food'].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### Replacing Values
Replace is often simpler and more flexible than fillna.

In [58]:
data = pd.Series([1., -999., 2., -999, -1000., 3.])

In [59]:
data 

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

^ Here the -999. may be sentinal values for missing data. To replace these with NA values that pandas understands, we can use replace, producing a new Series:

In [60]:
data.replace([-999, np.nan])

  data.replace([-999, np.nan])


0       1.0
1       1.0
2       2.0
3       2.0
4   -1000.0
5       3.0
dtype: float64

In [61]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [62]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [63]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64