# Chapter 7

In [1]:
import sys
import numpy as np
import pandas as pd
import csv
import json
from lxml import objectify
import pyarrow
import openpyxl
import xlrd
from bs4 import BeautifulSoup
import sqlalchemy as sqla
import sqlite3
import os
from pathlib import Path

### Handling Missing Data

In [2]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])

In [3]:
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [4]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data = pd.Series(["aardvark", np.nan, None, "avocado"])

In [6]:
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [7]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [8]:
float_data = pd.Series([1,2,None], dtype='float64')

In [9]:
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [10]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

## NA Handling Objects Methods<br>
**dropna**<br>
*Filter axis labels based on whether the values for each label have missing datd , with varying threshholds for how much missing data to tolerate.* <br>
**fillna**<br>
*Fill in missing data with some value or using an interpolation method such as "ffill" or "bfill".* <br>
**isna**<br>
*Return Boolean values indicating which values are missing/NA.* <br>
**notna**<br>
*Negation of isna, returns True for non_NA values and False for NA values.

### Filtering out missing data

In [11]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [12]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan], [np.nan, np.nan, np.nan],[np.nan, 6.5, 3.]])

In [15]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
df = pd.DataFrame(np.random.standard_normal((7,3)))

In [18]:
df.iloc[:4, 1] = np.nan

In [19]:
df.iloc[:2, 2] = np.nan

In [20]:
df

Unnamed: 0,0,1,2
0,0.235133,,
1,-0.936567,,
2,1.850295,,-0.130762
3,-0.07796,,-0.337294
4,1.030803,0.640666,-0.247304
5,0.58077,0.885729,-0.484712
6,-0.484627,-0.187949,-1.10013


In [21]:
df.dropna()

Unnamed: 0,0,1,2
4,1.030803,0.640666,-0.247304
5,0.58077,0.885729,-0.484712
6,-0.484627,-0.187949,-1.10013


In [22]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.850295,,-0.130762
3,-0.07796,,-0.337294
4,1.030803,0.640666,-0.247304
5,0.58077,0.885729,-0.484712
6,-0.484627,-0.187949,-1.10013


# Filling in Missing Data 

In [23]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.235133,0.0,0.0
1,-0.936567,0.0,0.0
2,1.850295,0.0,-0.130762
3,-0.07796,0.0,-0.337294
4,1.030803,0.640666,-0.247304
5,0.58077,0.885729,-0.484712
6,-0.484627,-0.187949,-1.10013


Calling fillna with a dictionary, you can specify a different fill value for each column.

In [24]:
df.fillna({1: 0.5, 2:0})

Unnamed: 0,0,1,2
0,0.235133,0.5,0.0
1,-0.936567,0.5,0.0
2,1.850295,0.5,-0.130762
3,-0.07796,0.5,-0.337294
4,1.030803,0.640666,-0.247304
5,0.58077,0.885729,-0.484712
6,-0.484627,-0.187949,-1.10013


In [25]:
df = pd.DataFrame(np.random.standard_normal((6,3)))

In [26]:
df.iloc[2:, 1] = np.nan

In [27]:
df.iloc[4:, 2] = np.nan

In [28]:
df

Unnamed: 0,0,1,2
0,-0.646834,0.475169,0.867497
1,1.603687,-0.17236,-1.393133
2,1.450623,,-0.960726
3,1.719921,,0.224742
4,-0.723876,,
5,0.455784,,


In [29]:
df.fillna(method="ffill")

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,-0.646834,0.475169,0.867497
1,1.603687,-0.17236,-1.393133
2,1.450623,-0.17236,-0.960726
3,1.719921,-0.17236,0.224742
4,-0.723876,-0.17236,0.224742
5,0.455784,-0.17236,0.224742


In [30]:
df.fillna(method='ffill', limit=2)

  df.fillna(method='ffill', limit=2)


Unnamed: 0,0,1,2
0,-0.646834,0.475169,0.867497
1,1.603687,-0.17236,-1.393133
2,1.450623,-0.17236,-0.960726
3,1.719921,-0.17236,0.224742
4,-0.723876,,0.224742
5,0.455784,,0.224742


In [31]:
data = pd.Series([1., np.nan, 3.5, np.nan, 7])

In [32]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### fillna Function Arguments<br>
**value**<br>
*Scalar value or dictionary-like object to use to fill missing values* <br>
**method** <br>
*Interpolation method: one of 'bfill'(backward fill) of 'ffill' (forward fill); default is None.* <br>
**axis** <br>
*Axis to fill on ("index" or "columns"); default is axis= "index"* <br>
**limit** <br>
*For forward and backfilling, maximum number of consecutive periods to fill* 

# Data Transformation<br>
### Removing Duplicates

In [33]:
data = pd.DataFrame({"k1": ['one', 'two'] * 3 + ['two'], 'k2': [1, 1, 2, 3, 3, 4, 4]})

In [34]:
data 

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [35]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [36]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [37]:
data['v1'] = range(7)

In [38]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [39]:
data.drop_duplicates(subset= ['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [40]:
data.drop_duplicates(['k1', 'k2'], keep= 'last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping 

In [41]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon", "pastrami", "corned beef", "bacon", "pastrami", "honey ham", "nova lox"], "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [42]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [43]:
meat_to_animal = {
    "bacon": "pig", 
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon"
}

In [44]:
data['animal'] = data['food'].map(meat_to_animal)

In [45]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [46]:
def get_animal(x):
    return meat_to_animal[x]

In [47]:
data['food'].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### Replacing Values
Replace is often simpler and more flexible than fillna.

In [48]:
data = pd.Series([1., -999., 2., -999, -1000., 3.])

In [49]:
data 

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

^ Here the -999. may be sentinal values for missing data. To replace these with NA values that pandas understands, we can use replace, producing a new Series:

In [50]:
data.replace([-999, np.nan])

  data.replace([-999, np.nan])


0       1.0
1       1.0
2       2.0
3       2.0
4   -1000.0
5       3.0
dtype: float64

In [51]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [52]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [53]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Renaming Axis Indexes

In [54]:
data = pd.DataFrame(np.arange(12).reshape((3,4)),
                    index = ["Ohio", "Colorado", "New York"],
                    columns = ["one", "two", "three", "four"])

In [55]:
def transform(x):
    return x[:4].upper()

In [56]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [57]:
data.index=data.index.map(transform)

In [58]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


data.rename(index=str.title, columns=str.upper)

In [59]:
data.rename(index={"OHIO": "INDIANA"}, columns={"three": "peekaboo"})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretizing and Binning

In [60]:
ages= [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [61]:
bins = [18, 25, 35, 60, 100]

In [62]:
age_categories = pd.cut(ages, bins)

In [63]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [64]:
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [65]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [66]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [67]:
pd.value_counts(age_categories)

  pd.value_counts(age_categories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

^bin counts for pandas.cut.
In the string representation of an interval, a parenthesis means that the side is open (exclusive), while the square bracket means it is closed(inclusive). You can change which side is closed by passing right = False) v

In [68]:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [69]:
group_names=["Youth", "YoungAdult", "MiddleAged", "Senior"]

You can override interval based bin labeling by passing a list or array to the labels option:

In [70]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

If you pass an integer number of bins to pandas.cut instead of explicit bin edges, it will compute equal length bins based on the minimum and maximum values in the data. Consider the case of some uniformly distributed data chopped into fourths:  

In [71]:
data = np.random.uniform(size=20)

In [72]:
pd.cut(data, 4, precision=2)

[(0.71, 0.93], (0.48, 0.71], (0.48, 0.71], (0.48, 0.71], (0.48, 0.71], ..., (0.26, 0.48], (0.034, 0.26], (0.48, 0.71], (0.48, 0.71], (0.48, 0.71]]
Length: 20
Categories (4, interval[float64, right]): [(0.034, 0.26] < (0.26, 0.48] < (0.48, 0.71] < (0.71, 0.93]]

For roughly similar sized bins, use qcut to take quantiles instead.

In [73]:
data = np.random.standard_normal(1000)

In [74]:
quartiles = pd.qcut(data, 4, precision=2)

In [75]:
quartiles

[(0.72, 3.2], (0.72, 3.2], (-0.67, 0.053], (0.053, 0.72], (-3.44, -0.67], ..., (0.72, 3.2], (-3.44, -0.67], (-3.44, -0.67], (0.053, 0.72], (-3.44, -0.67]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.44, -0.67] < (-0.67, 0.053] < (0.053, 0.72] < (0.72, 3.2]]

In [76]:
pd.Series(quartiles).value_counts()

(-3.44, -0.67]    250
(-0.67, 0.053]    250
(0.053, 0.72]     250
(0.72, 3.2]       250
Name: count, dtype: int64

In [77]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.428, -1.339]    100
(-1.339, 0.0532]    400
(0.0532, 1.324]     400
(1.324, 3.195]      100
Name: count, dtype: int64

### Detecting and Filtering Outliers

In [78]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))

In [79]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.047825,0.025101,0.010911,0.046832
std,1.013851,0.989647,0.952345,1.029858
min,-3.024174,-3.453072,-3.066023,-3.21957
25%,-0.607839,-0.586295,-0.621791,-0.623955
50%,0.078141,0.033091,0.008885,0.076768
75%,0.712832,0.698075,0.617878,0.72119
max,3.12663,2.84776,2.877854,3.078451


In [80]:
col = data[2]

In [81]:
col[col.abs()>3]

340   -3.066023
Name: 2, dtype: float64

In [82]:
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
46,0.347506,0.672002,-1.487816,3.078451
186,-3.024174,-1.105626,0.085308,-0.370368
240,3.12663,2.149519,0.647268,-0.113137
306,0.041584,-3.195001,-0.797027,0.293904
340,0.291272,0.918136,-3.066023,-0.300794
477,0.108135,-0.304998,-0.518087,3.004283
520,-0.633246,1.030968,-1.058742,-3.21957
538,1.33444,-3.453072,-1.074267,0.557494
601,0.309705,-3.128137,1.311304,-0.047805
615,3.080766,-0.341623,0.837426,0.957897


In [83]:
data[data.abs() > 3] = np.sign(data)*3

In [84]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.047642,0.0263,0.010977,0.046969
std,1.013162,0.985748,0.952134,1.028944
min,-3.0,-3.0,-3.0,-3.0
25%,-0.607839,-0.586295,-0.621791,-0.623955
50%,0.078141,0.033091,0.008885,0.076768
75%,0.712832,0.698075,0.617878,0.72119
max,3.0,2.84776,2.877854,3.0


In [85]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,-1.0,1.0,1.0,-1.0
2,-1.0,1.0,1.0,-1.0
3,1.0,-1.0,1.0,1.0
4,-1.0,1.0,-1.0,-1.0


### Permutation and Random Sampling

In [86]:
df = pd.DataFrame(np.arange(5*7).reshape((5, 7)))

In [87]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [88]:
sampler = np.random.permutation(5)

In [89]:
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20


In [90]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20


In [91]:
column_sampler = np.random.permutation(7)

In [92]:
column_sampler

array([5, 2, 4, 6, 1, 3, 0])

In [93]:
df.take(column_sampler, axis="columns")

Unnamed: 0,5,2,4,6,1,3,0
0,5,2,4,6,1,3,0
1,12,9,11,13,8,10,7
2,19,16,18,20,15,17,14
3,26,23,25,27,22,24,21
4,33,30,32,34,29,31,28


In [94]:
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27


In [95]:
choices = pd.Series([5,7,-1,6,4])

In [96]:
choices.sample(n=10, replace=True)

1    7
0    5
3    6
4    4
3    6
1    7
1    7
1    7
1    7
0    5
dtype: int64

### Computing Indicator/Dummy Variable

In [97]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  "data1": range(6)})

In [98]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [99]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [100]:
dummies = pd.get_dummies(df['key'], prefix="key")

In [101]:
df_with_dummy = df[['data1']].join(dummies)

In [102]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,False,True,False
1,1,False,True,False
2,2,True,False,False
3,3,False,False,True
4,4,True,False,False
5,5,False,True,False


In [103]:
mnames = ["movie_id", "title", "genres"]

^Missing the movie data

In [104]:
np.random.seed(12345)

In [105]:
values = np.random.uniform(size=10)

In [106]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [107]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [108]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,False,False,False,False,True
1,False,True,False,False,False
2,True,False,False,False,False
3,False,True,False,False,False
4,False,False,True,False,False
5,False,False,True,False,False
6,False,False,False,False,True
7,False,False,False,True,False
8,False,False,False,True,False
9,False,False,False,True,False


### Extension Data Types

In [109]:
s = pd.Series([1,2,3, None])

In [110]:
s

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [111]:
s = pd.Series([1,2,3,None], dtype=pd.Int64Dtype())

In [112]:
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [113]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

In [114]:
s.dtype

Int64Dtype()

In [115]:
s[3]

<NA>

In [116]:
s[3] is pd.NA

True

In [117]:
s = pd.Series([1,2,3,None], dtype="Int64")

In [118]:
s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype())

In [119]:
s

0      one
1      two
2     <NA>
3    three
dtype: string

In [120]:
df = pd.DataFrame({'A': [1,2,None,4], 
                  'B': ['one', 'two', 'three', None],
                  'C': [False, None, False, True]})

In [121]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


In [122]:
df['A'] = df['A'].astype("Int64")

In [123]:
df['B'] = df['B'].astype("string")

In [124]:
df['C'] = df['C'].astype("boolean")

In [125]:
df

Unnamed: 0,A,B,C
0,1.0,one,False
1,2.0,two,
2,,three,False
3,4.0,,True


### Pandas Extension Data Types<br>

**BooleanDtype**<br>	
*Nullable Boolean data, use "boolean" when passing as string*<br><br>
**CategoricalDtype**	<br>
*Categorical data type, use "category" when passing as string* <br><br>
**DatetimeTZDtype**	<br>
*Datetime with time zone*<br><br>
**Float32Dtype**<br>
*32-bit nullable floating point, use "Float32" when passing as string*<br><br>
**Float64Dtype**<br>	
*64-bit nullable floating point, use "Float64" when passing as string*<br>
**Int8Dtype**	<br>
*8-bit nullable signed integer, use "Int8" when passing as string*<br><br>
**Int16Dtype**	<br>
*16-bit nullable signed integer, use "Int16" when passing as string*<br><br>
**Int32Dtype**	<br>
*32-bit nullable signed integer, use "Int32" when passing as string*<br><br>
**Int64Dtype**	<br>
*64-bit nullable signed integer, use "Int64" when passing as string*<br><br>
**UInt8Dtype**<br>
*8-bit nullable unsigned integer, use "UInt8" when passing as string*<br><br>
**UInt16Dtype**	<br>
*16-bit nullable unsigned integer, use "UInt16" when passing as string*<br><br>
**UInt32Dtype**	<br>
*32-bit nullable unsigned integer, use "UInt32" when passing as string*<br><br>
**UInt64Dtype**	<br>
*64-bit nullable unsigned integer, use "UInt64" when passing as string*

# String Manipulation
### Python Built-In String Object Methods

In [126]:
val = "a,b, guido"

In [127]:
val.split(",")

['a', 'b', ' guido']

In [128]:
pieces = [x.strip() for x in val.split(",")]

In [129]:
pieces

['a', 'b', 'guido']

In [130]:
first, second, third = pieces

In [131]:
first + "::" + second + "::" + third

'a::b::guido'

In [132]:
"::".join(pieces)

'a::b::guido'

In [133]:
"guido" in val 

True

In [134]:
val.index(",")

1

In [135]:
val.find(":")

-1

In [136]:
val.count(",")

2

In [137]:
val.replace(",", "::")

'a::b:: guido'

In [138]:
val.replace(",", "")

'ab guido'

### Python Built-In String Methods<br>
**count**	<br>
*Return the number of nonoverlapping occurrences of substring in the string*<br><br>
**endswith**<br>	
*Return True if string ends with suffix* <br><br>
**startswith**<br>	
*Return True if string starts with prefix* <br><br>
**join**<br>	
*Use string as delimiter for concatenating a sequence of other strings*<br><br>
**index**<br>	
*Return starting index of the first occurrence of passed substring if found in the string; otherwise, raises ValueError if not found*<br><br>
**find**<br>	
*Return position of first character of first occurrence of substring in the string; like index, but returns –1 if not found*<br><br>
**rfind**<br>	
*Return position of first character of last occurrence of substring in the string; returns –1 if not found*<br><br>
**replace**<br>	
*Replace occurrences of string with another string*<br><br>
**strip, rstrip, lstrip**<br>	
*Trim whitespace, including newlines on both sides, on the right side, or on the left side, respectively*<br><br>
**split**<br>	
*Break string into list of substrings using passed delimiter*<br><br>
**lower**<br>	
*Convert alphabet characters to lowercase*<br><br>
**upper**<br>	
*Convert alphabet characters to uppercase* <br><br>
**casefold**	<br>
*Convert characters to lowercase, and convert any region-specific variable character combinations to a common comparable form*<br><br>
**ljust, rjust**	<br>
*Left justify or right justify, respectively; pad opposite side of string with spaces (or some other fill character) to return a string with a minimum width*

### Regular Expressions

In [139]:
import re

In [140]:
text = "foo bar\t baz \tqux"

In [141]:
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

In [142]:
regex = re.compile(r"\s+")

In [143]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [144]:
regex.findall(text)

[' ', '\t ', ' \t']

In [145]:
text = """Dave dave@google.com 
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [146]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [147]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [148]:
m = regex.search(text)

In [149]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [150]:
text[m.start():m.end()]

'dave@google.com'

In [151]:
print(regex.match(text))

None


In [152]:
print(regex.sub("REDACTED", text))

Dave REDACTED 
Steve REDACTED
Rob REDACTED
Ryan REDACTED


In [153]:
pattern = r"([A-Z0-9._.%+-]+)@([A-Z09.-]+)\.([A-Z]{2,4})"

In [154]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [155]:
m = regex.match("wesm@bright.net")

In [156]:
m.groups()

('wesm', 'bright', 'net')

In [157]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [158]:
print(regex.sub(r"Username: \1. Domain: \2, Suffix: \3", text))

Dave Username: dave. Domain: google, Suffix: com 
Steve Username: steve. Domain: gmail, Suffix: com
Rob Username: rob. Domain: gmail, Suffix: com
Ryan Username: ryan. Domain: yahoo, Suffix: com


### Regular Expression Methods <br>
**findall**<br>
*Return all nonoverlapping matching patterns in a string as a list* <br><br>
**finditer** <br>
*Like findall, but returns an iterator*<br><br>
**match**<br>
Match pattern at start of string and optionally segment pattern components into groups; if the pattern matches, return a match object, and otherwise None.<br><br>
**search**<br>
*Scan string for match to pattern, returning a match object if so. Unlike match, the match can be anywhere in the string as opposed to only at the beginning.* <br><br>
**split** <br>
*Break string into pieces at each occurrence of pattern.* <br><br>
**sub, subn** <br>
*Replace all (sub) or first n occurences (subn) of pattern in string with replacement expression; use symbols \1, \2, ... to refer to match group elements in the replacement string.*

### String Functions in Pandas

In [159]:
data = {"Dave": 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}

In [160]:
data = pd.Series(data)

In [161]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [162]:
data.isna()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [163]:
data.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [164]:
data_as_string_ext = data.astype('string')

In [165]:
data_as_string_ext

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string

In [166]:
data_as_string_ext.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes       <NA>
dtype: boolean

In [167]:
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

In [168]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [169]:
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]

In [170]:
matches

Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [171]:
matches.str.get(1)


Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

In [172]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [173]:
data.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,


### Partial List of Series String Methods<br>
**cat**<br>	*Concatenate strings element-wise with optional delimiter*<br><br>
**contains**<br>	*Return Boolean array if each string contains pattern/regex*<br><br>
**count**<br>	*Count occurrences of pattern*<br><br>
**extract**<br>	*Use a regular expression with groups to extract one or more strings from a Series of strings; the result will be a DataFrame with one column per group*<br><br>
**endswith**<br>	*Equivalent to x.endswith(pattern) for each element*<br><br>
**startswith**<br>	*Equivalent to x.startswith(pattern) for each element*<br><br>
**findall**	<br> *Compute list of all occurrences of pattern/regex for each string*<br><br>
**get**	<br> *Index into each element (retrieve i-th element)* <br><br>
**isalnum**<br>	*Equivalent to built-in str.alnum*<br><br>
**isalpha**<br>	*Equivalent to built-in str.isalpha*<br><br>
**isdecimal**<br>	*Equivalent to built-in str.isdecimal*<br><br>
**isdigit**	<br> *Equivalent to built-in str.isdigit*<br><br>
**islower**	<br> *Equivalent to built-in str.islower* <br><br>
**isnumeric**<br>	*Equivalent to built-in str.isnumeric* <br><br>
**isupper**	<br> *Equivalent to built-in str.isupper* <br><br>
**join**<br>	*Join strings in each element of the Series with passed separator*<br><br>
**len**	<br> *Compute length of each string*<br><br>
**lower, upper**<br>	*Convert cases; equivalent to x.lower() or x.upper() for each element*<br><br>
**match** <br> *Use re.match with the passed regular expression on each element, returning True or False whether it matches*<br><br>
**pad**<br> 	*Add whitespace to left, right, or both sides of strings*<br><br>
**center** <br> *Equivalent to pad(side="both")* <br><br>
**repeat**<br>	*Duplicate values (e.g., s.str.repeat(3) is equivalent to x * 3 for each string)* <br><br>
**replace** <br>	*Replace occurrences of pattern/regex with some other string* <br><br>
**slice**<br>	*Slice each string in the Series*<br><br>
**split**<br>	*Split strings on delimiter or regular expression* <br><br>
**strip**<br>	*Trim whitespace from both sides, including newlines* <br><br>
**rstrip**<br>	*Trim whitespace on right side* <br><br>
**lstrip**<br>	*Trim whitespace on left side* <br><br>

# Categorical Data<br>
### Background and Motivation

In [174]:
values = pd.Series(['apple', 'orange', 'apple', 'apple'] * 2)

In [176]:
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [177]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [180]:
pd.Series(values).value_counts()

apple     6
orange    2
Name: count, dtype: int64

In [181]:
values = pd.Series([0, 1, 0, 0] * 2)

In [182]:
dim = pd.Series(['apple', 'orange'])

In [183]:
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [184]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

### Categorical Extension Type

In [185]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [186]:
N = len(fruits)

In [187]:
rng = np.random.default_rng(seed=12345)

In [188]:
df = pd.DataFrame({'fruit': fruits,
                  'basket_id': np.arange(N),
                  'count': rng.integers(3,15,size=N),
                  'weight': rng.uniform(0,4,size=N)},
                 columns=['basket_id', 'fruit', 'count', 'weight'])

In [189]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,1.564438
1,1,orange,5,1.331256
2,2,apple,12,2.393235
3,3,apple,6,0.746937
4,4,apple,5,2.691024
5,5,orange,12,3.767211
6,6,apple,10,0.992983
7,7,apple,11,3.795525


In [190]:
fruit_cat = df['fruit'].astype('category')

In [191]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [192]:
c = fruit_cat.array

In [193]:
type(c)

pandas.core.arrays.categorical.Categorical

In [194]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [195]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [197]:
dict(enumerate(c.categories))

{0: 'apple', 1: 'orange'}

In [198]:
df['fruit'] = df['fruit'].astype('category')

In [200]:
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [201]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])

In [202]:
my_categories

['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']

In [203]:
categories = ['foo', 'bar', 'baz']

In [204]:
codes = [0,1,2,0,0,1]

In [205]:
my_cats_2 = pd.Categorical.from_codes(codes, categories)

Unless specified, categorical conversions assume no specific ordering of the categories. 

In [206]:
my_cats_2

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']

When using from_codes or any other constructors, you can indicate that the categories have a meaningful ordering. 

In [207]:
ordered_cat = pd.Categorical.from_codes(codes, categories, ordered=True)

In [208]:
ordered_cat

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [209]:
my_cats_2.as_ordered()

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

In [210]:
rng = np.random.default_rng(seed=12345)

In [211]:
draws = rng.standard_normal(1000)

In [212]:
draws[:5]

array([-1.42382504,  1.26372846, -0.87066174, -0.25917323, -0.07534331])

In [213]:
bins = pd.qcut(draws, 4)

In [214]:
bins

[(-3.121, -0.675], (0.687, 3.211], (-3.121, -0.675], (-0.675, 0.0134], (-0.675, 0.0134], ..., (0.0134, 0.687], (0.0134, 0.687], (-0.675, 0.0134], (0.0134, 0.687], (-0.675, 0.0134]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.121, -0.675] < (-0.675, 0.0134] < (0.0134, 0.687] < (0.687, 3.211]]

In [218]:
bins = pd.qcut(draws, 4, labels = ['Q1', 'Q2', 'Q3', 'Q4' ])

In [219]:
bins

['Q1', 'Q4', 'Q1', 'Q2', 'Q2', ..., 'Q3', 'Q3', 'Q2', 'Q3', 'Q2']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [220]:
bins.codes[:10]

array([0, 3, 0, 1, 1, 0, 0, 2, 2, 0], dtype=int8)

In [221]:
bins = pd.Series(bins, name= 'quartile')

In [223]:
results = (pd.Series(draws).groupby(bins).agg(['count', 'min', 'max']).reset_index())

  results = (pd.Series(draws).groupby(bins).agg(['count', 'min', 'max']).reset_index())


In [224]:
results 

Unnamed: 0,quartile,count,min,max
0,Q1,250,-3.119609,-0.678494
1,Q2,250,-0.673305,0.008009
2,Q3,250,0.018753,0.686183
3,Q4,250,0.688282,3.211418


In [225]:
results['quartile']

0    Q1
1    Q2
2    Q3
3    Q4
Name: quartile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

#### Better Performance with Categoricals

In [226]:
N = 10_000_000

In [227]:
labels = pd.Series(['foo', 'bar', 'baz', 'qux']*(N//4))

In [228]:
categories = labels.astype('category')

In [229]:
labels.memory_usage(deep= True)

600000128

In [231]:
categories.memory_usage(deep= True)

10000540

In [236]:
%time _ = labels.astype('category')

CPU times: user 153 ms, sys: 12.2 ms, total: 166 ms
Wall time: 166 ms


In [237]:
%timeit labels.value_counts()

155 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [238]:
%timeit categories.value_counts()

20.4 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Categorical Methods

In [239]:
s = pd.Series(["a", "b", "c", "d"]*2)

In [241]:
cat_s = s.astype('category')

In [242]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [243]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [244]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [245]:
actual_categories = ["a","b","c", "d", "e"]

In [246]:
cat_s2 = cat_s.cat.set_categories(actual_categories)

In [247]:
cat_s2

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']

In [248]:
cat_s.value_counts()

a    2
b    2
c    2
d    2
Name: count, dtype: int64

In [249]:
cat_s2.value_counts()

a    2
b    2
c    2
d    2
e    0
Name: count, dtype: int64

In [250]:
cat_s3 = cat_s[cat_s.isin(['a', 'b'])]

In [253]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [254]:
cat_s3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): ['a', 'b']

### Categorical Methods for Series in Pandas  <br>
**add_categories** <br>	*Append new (unused) categories at end of existing categories* <br> <br>
**as_ordered** <br> *Make categories ordered* <br> <br>
**as_unordered** <br>	*Make categories unordered*  <br> <br>
**remove_categories** <br>	*Remove categories, setting any removed values to null* <br> <br>
**remove_unused_categories** <br>	*Remove any category values that do not appear in the data*  <br> <br>
**rename_categories** <br>	*Replace categories with indicated set of new category names; cannot change the number of categories* <br> <br>
**reorder_categories** <br>	*Behaves like rename_categories, but can also change the result to have ordered categories* <br> <br>
**set_categories** <br>	*Replace the categories with the indicated set of new categories; can add or remove categories*

### Dummy Variables/One-Hot Encoding

In [255]:
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype= 'category')

In [256]:
pd.get_dummies(cat_s)

Unnamed: 0,a,b,c,d
0,True,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,True,False,False,False
5,False,True,False,False
6,False,False,True,False
7,False,False,False,True
