# Chapter 7

In [2]:
import sys
import numpy as np
import pandas as pd
import csv
import json
from lxml import objectify
import pyarrow
import openpyxl
import xlrd
from bs4 import BeautifulSoup
import sqlalchemy as sqla
import sqlite3
import os
from pathlib import Path

### Handling Missing Data

In [3]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])

In [4]:
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [5]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])

In [7]:
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [8]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [9]:
float_data = pd.Series([1,2,None], dtype='float64')

In [10]:
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [11]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

## NA Handling Objects Methods<br>
**dropna**<br>
*Filter axis labels based on whether the values for each label have missing datd , with varying threshholds for how much missing data to tolerate.* <br>
**fillna**<br>
*Fill in missing data with some value or using an interpolation method such as "ffill" or "bfill".* <br>
**isna**<br>
*Return Boolean values indicating which values are missing/NA.* <br>
**notna**<br>
*Negation of isna, returns True for non_NA values and False for NA values.

### Filtering out missing data

In [12]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [13]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan], [np.nan, np.nan, np.nan],[np.nan, 6.5, 3.]])

In [18]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [20]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
df = pd.DataFrame(np.random.standard_normal((7,3)))

In [22]:
df.iloc[:4, 1] = np.nan

In [23]:
df.iloc[:2, 2] = np.nan

In [24]:
df

Unnamed: 0,0,1,2
0,-0.22683,,
1,-0.008607,,
2,-0.407546,,-0.238661
3,1.569548,,0.103098
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


In [25]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


In [26]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.407546,,-0.238661
3,1.569548,,0.103098
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


# Filling in Missing Data 

In [27]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.22683,0.0,0.0
1,-0.008607,0.0,0.0
2,-0.407546,0.0,-0.238661
3,1.569548,0.0,0.103098
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


Calling fillna with a dictionary, you can specify a different fill value for each column.

In [28]:
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,-0.22683,0.5,0.0
1,-0.008607,0.5,0.0
2,-0.407546,0.5,-0.238661
3,1.569548,0.5,0.103098
4,-0.111287,-0.353035,-0.161954
5,0.553252,0.840681,-0.217661
6,-1.167717,0.928221,2.165731


In [29]:
df = pd.DataFrame(np.random.standard_normal((6,3)))

In [30]:
df.iloc[2:, 1] = np.nan

In [31]:
df.iloc[4:, 2] = np.nan

In [32]:
df

Unnamed: 0,0,1,2
0,0.681099,-0.578097,-0.164609
1,-1.979762,1.213784,-1.280635
2,0.999191,,1.649775
3,-1.265559,,0.549961
4,1.584091,,
5,0.938936,,


In [33]:
df.fillna(method="ffill")

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,0.681099,-0.578097,-0.164609
1,-1.979762,1.213784,-1.280635
2,0.999191,1.213784,1.649775
3,-1.265559,1.213784,0.549961
4,1.584091,1.213784,0.549961
5,0.938936,1.213784,0.549961


In [34]:
df.fillna(method='ffill', limit=2)

  df.fillna(method='ffill', limit=2)


Unnamed: 0,0,1,2
0,0.681099,-0.578097,-0.164609
1,-1.979762,1.213784,-1.280635
2,0.999191,1.213784,1.649775
3,-1.265559,1.213784,0.549961
4,1.584091,,0.549961
5,0.938936,,0.549961


In [35]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])

In [36]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### fillna Function Arguments<br>
**value**<br>
*Scalar value or dictionary-like object to use to fill missing values* <br>
**method** <br>
*Interpolation method: one of 'bfill'(backward fill) of 'ffill' (forward fill); default is None.* <br>
**axis** <br>
*Axis to fill on ("index" or "columns"); default is axis= "index"* <br>
**limit** <br>
*For forward and backfilling, maximum number of consecutive periods to fill* 

# Data Transformation<br>
### Removing Duplicates

In [38]:
data = pd.DataFrame({"k1": ['one', 'two'] * 3 + ['two'], 'k2': [1, 1, 2, 3, 3, 4, 4]})

In [39]:
data 

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [40]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [42]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [43]:
data['v1'] = range(7)

In [44]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [45]:
data.drop_duplicates(subset= ['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [46]:
data.drop_duplicates(['k1', 'k2'], keep= 'last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping 

In [47]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon", "pastrami", "corned beef", "bacon", "pastrami", "honey ham", "nova lox"], "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [48]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [50]:
meat_to_animal = {
    "bacon": "pig", 
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon"
}

In [51]:
data['animal'] = data['food'].map(meat_to_animal)

In [52]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [54]:
def get_animal(x):
    return meat_to_animal[x]

In [55]:
data['food'].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### Replacing Values
Replace is often simpler and more flexible than fillna.

In [58]:
data = pd.Series([1., -999., 2., -999, -1000., 3.])

In [59]:
data 

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

^ Here the -999. may be sentinal values for missing data. To replace these with NA values that pandas understands, we can use replace, producing a new Series:

In [60]:
data.replace([-999, np.nan])

  data.replace([-999, np.nan])


0       1.0
1       1.0
2       2.0
3       2.0
4   -1000.0
5       3.0
dtype: float64

In [61]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [62]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [63]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Renaming Axis Indexes

In [64]:
data = pd.DataFrame(np.arange(12).reshape((3,4)),
                    index = ["Ohio", "Colorado", "New York"],
                    columns = ["one", "two", "three", "four"])

In [65]:
def transform(x):
    return x[:4].upper()

In [66]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [67]:
data.index=data.index.map(transform)

In [68]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


data.rename(index=str.title, columns=str.upper)

In [70]:
data.rename(index={"OHIO": "INDIANA"}, columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretizing and Binning

In [71]:
ages= [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [72]:
bins = [18, 25, 35, 60, 100]

In [73]:
age_categories = pd.cut(ages, bins)

In [74]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [75]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [76]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [77]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [78]:
pd.value_counts(age_categories)

  pd.value_counts(age_categories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

^bin counts for pandas.cut.
In the string representation of an interval, a parenthesis means that the side is open (exclusive), while the square bracket means it is closed(inclusive). You can change which side is closed by passing right = False) v

In [86]:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [87]:
group_names=["Youth", "YoungAdult", "MiddleAged", "Senior"]

You can override interval based bin labeling by passing a list or array to the labels option:

In [88]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

If you pass an integer number of bins to pandas.cut instead of explicit bin edges, it will compute equal length bins based on the minimum and maximum values in the data. Consider the case of some uniformly distributed data chopped into fourths:  

In [89]:
data = np.random.uniform(size=20)

In [90]:
pd.cut(data, 4, precision=2)

[(0.28, 0.51], (0.74, 0.97], (0.74, 0.97], (0.052, 0.28], (0.74, 0.97], ..., (0.052, 0.28], (0.052, 0.28], (0.74, 0.97], (0.74, 0.97], (0.51, 0.74]]
Length: 20
Categories (4, interval[float64, right]): [(0.052, 0.28] < (0.28, 0.51] < (0.51, 0.74] < (0.74, 0.97]]

For roughly similar sized bins, use qcut to take quantiles instead.

In [91]:
data = np.random.standard_normal(1000)

In [92]:
quartiles = pd.qcut(data, 4, precision=2)

In [93]:
quartiles

[(-0.69, 0.04], (0.04, 0.75], (0.04, 0.75], (0.04, 0.75], (0.75, 2.93], ..., (-0.69, 0.04], (-3.1999999999999997, -0.69], (0.75, 2.93], (-3.1999999999999997, -0.69], (-0.69, 0.04]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.1999999999999997, -0.69] < (-0.69, 0.04] < (0.04, 0.75] < (0.75, 2.93]]

In [96]:
pd.Series(quartiles).value_counts()

(-3.1999999999999997, -0.69]    250
(-0.69, 0.04]                   250
(0.04, 0.75]                    250
(0.75, 2.93]                    250
Name: count, dtype: int64

In [97]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.1879999999999997, -1.325]    100
(-1.325, 0.0402]                 400
(0.0402, 1.295]                  400
(1.295, 2.927]                   100
Name: count, dtype: int64

### Detecting and Filtering Outliers

In [98]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))

In [99]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.057642,0.007059,-0.007049,0.038083
std,1.024178,0.990612,0.9848,0.94598
min,-3.9905,-3.208061,-2.721879,-2.654558
25%,-0.749974,-0.597569,-0.69547,-0.614773
50%,-0.09581,-0.048286,0.002649,0.003011
75%,0.611922,0.652774,0.704578,0.683489
max,2.889135,3.053203,3.550356,3.312186


In [100]:
col = data[2]

In [101]:
col[col.abs()>3]

3    3.550356
Name: 2, dtype: float64

In [102]:
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
3,-0.900825,0.289008,3.550356,1.491059
159,0.18242,-3.195733,0.204998,0.09777
309,1.053218,0.959529,-1.647009,3.312186
442,-0.247107,-3.062344,0.099928,-1.854886
456,-3.070019,0.337334,-0.56876,-1.420674
539,-0.627604,3.053203,0.610364,-0.937053
606,-3.165637,0.094089,1.308917,0.858654
691,0.636187,-3.206091,-0.021037,-1.645256
750,1.18429,-3.208061,-1.357506,-0.984314
759,-3.9905,0.779391,-0.026307,-1.409274


In [103]:
data[data.abs() > 3] = np.sign(data)*3

In [104]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.056416,0.007678,-0.0076,0.037771
std,1.020148,0.98834,0.982962,0.94495
min,-3.0,-3.0,-2.721879,-2.654558
25%,-0.749974,-0.597569,-0.69547,-0.614773
50%,-0.09581,-0.048286,0.002649,0.003011
75%,0.611922,0.652774,0.704578,0.683489
max,2.889135,3.0,3.0,3.0


In [105]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,-1.0
1,-1.0,-1.0,1.0,-1.0
2,1.0,-1.0,1.0,-1.0
3,-1.0,1.0,1.0,1.0
4,-1.0,1.0,-1.0,1.0


### Permutation and Random Sampling

In [107]:
df = pd.DataFrame(np.arange(5*7).reshape((5, 7)))

In [108]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [109]:
sampler = np.random.permutation(5)

In [110]:
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20


In [111]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20


In [112]:
column_sampler = np.random.permutation(7)

In [113]:
column_sampler

array([0, 5, 3, 1, 6, 4, 2])

In [114]:
df.take(column_sampler, axis="columns")

Unnamed: 0,0,5,3,1,6,4,2
0,0,5,3,1,6,4,2
1,7,12,10,8,13,11,9
2,14,19,17,15,20,18,16
3,21,26,24,22,27,25,23
4,28,33,31,29,34,32,30


In [115]:
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20


In [116]:
choices = pd.Series([5,7,-1,6,4])

In [117]:
choices.sample(n=10, replace=True)

3    6
2   -1
0    5
2   -1
2   -1
0    5
1    7
3    6
4    4
4    4
dtype: int64

### Computing Indicator/Dummy Variable

In [118]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  "data1": range(6)})

In [119]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [120]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [121]:
dummies = pd.get_dummies(df['key'], prefix="key")

In [124]:
df_with_dummy = df[['data1']].join(dummies)

In [125]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,False,True,False
1,1,False,True,False
2,2,True,False,False
3,3,False,False,True
4,4,True,False,False
5,5,False,True,False


In [126]:
mnames = ["movie_id", "title", "genres"]

^Missing the movie data

In [128]:
np.random.seed(12345)

In [129]:
values = np.random.uniform(size=10)

In [130]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [131]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [None]:
pd.get_dummies