In [1]:
# import pandas
import pandas as pd
import numpy as np

In [2]:
# Fill missing values

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado']) # Series with missing values
string_data
print(string_data.isnull()) # Check for missing values
string_data[0] = None # Set first value to None
print(string_data.isnull()) # Check for missing values

0    False
1    False
2     True
3    False
dtype: bool
0     True
1    False
2     True
3    False
dtype: bool


# Methods for filling missing values

| Method        | Description                | Example                     |
|---------------|----------------------------|-----------------------------|
| `fillna`      | Fill missing values        | `df.fillna(0)`              |
| `ffill`       | Fill forward               | `df.fillna(method='ffill')` |
| `bfill`       | Fill backward              | `df.fillna(method='bfill')` |
| `dropna`      | Drop missing values        | `df.dropna()`               |
| `interpolate` | Interpolate missing values | `df.interpolate()`          |
| `replace`     | Replace missing values     | `df.replace(-999, np.nan)`  |
| `mask`        | Replace values with NaN    | `df.mask(df < 0)`           |
| `notnull`     | Negation of isnull         | `df.notnull()`              |

In [3]:
# N/A filtering 
from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])
print(data.dropna()) # Drop missing values
print(data[data.notnull()]) # Negation of isnull

data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna() # Drop missing values
data.dropna(how='all') # Drop rows with all missing values
data.dropna(axis=1, how='all') # Drop columns with all missing values
data.dropna(thresh=2) # Drop rows with at least 2 missing values
df = pd.DataFrame(np.random.randn(7, 3))

0    1.0
2    3.5
4    7.0
dtype: float64
0    1.0
2    3.5
4    7.0
dtype: float64


# Dropna method arguments

| Argument | Description                  | Example                 |
|----------|------------------------------|-------------------------|
| `axis`   | Axis to drop values          | `df.dropna(axis=1)`     |
| `how`    | How to drop values           | `df.dropna(how='all')`  |
| `thresh` | Minimum non-NA values        | `df.dropna(thresh=2)`   |
| `subset` | Drop values in specific rows | `df.dropna(subset=[1])` |

In [4]:
# NA Filling missing data

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA

print(df.fillna(0)) # Fill missing values with 0
print(df.fillna({1: 0.5, 2: 0})) # Fill missing values with different values
print(df.fillna(method='ffill')) # Fill missing values with forward fill
print(df.fillna(method='ffill', limit=2)) # Fill missing values with forward fill with limit

          0         1         2
0 -0.585422  0.000000  0.000000
1 -0.452200  0.000000  0.000000
2 -0.094434  0.000000  2.017499
3 -0.061829  0.000000  1.788029
4 -0.601111  0.383341 -0.143167
5  0.867600 -1.543485  0.811456
6  1.174503  1.321020  0.094375
          0         1         2
0 -0.585422  0.500000  0.000000
1 -0.452200  0.500000  0.000000
2 -0.094434  0.500000  2.017499
3 -0.061829  0.500000  1.788029
4 -0.601111  0.383341 -0.143167
5  0.867600 -1.543485  0.811456
6  1.174503  1.321020  0.094375
          0         1         2
0 -0.585422       NaN       NaN
1 -0.452200       NaN       NaN
2 -0.094434       NaN  2.017499
3 -0.061829       NaN  1.788029
4 -0.601111  0.383341 -0.143167
5  0.867600 -1.543485  0.811456
6  1.174503  1.321020  0.094375
          0         1         2
0 -0.585422       NaN       NaN
1 -0.452200       NaN       NaN
2 -0.094434       NaN  2.017499
3 -0.061829       NaN  1.788029
4 -0.601111  0.383341 -0.143167
5  0.867600 -1.543485  0.811456
6  1.174

  print(df.fillna(method='ffill')) # Fill missing values with forward fill
  print(df.fillna(method='ffill', limit=2)) # Fill missing values with forward fill with limit


In [5]:
data = pd.Series([1., NA, 3.5, NA, 7])
print(data.fillna(data.mean())) # Fill missing values with mean

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64


# Fillna method arguments

| Argument  | Description               | Example                     |
|-----------|---------------------------|-----------------------------|
| `value`   | Scalar value or dict-like | `df.fillna(0)`              |
| `method`  | Interpolation method      | `df.fillna(method='ffill')` |
| `axis`    | Axis to fill values       | `df.fillna(axis=1)`         |
| `limit`   | Maximum number of fills   | `df.fillna(limit=2)`        |
| `inplace` | Modify the calling object | `df.fillna(inplace=True)`   |

In [6]:
# Data transformation

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], 'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)
data.duplicated() # Check for duplicates
data.drop_duplicates() # Drop duplicates
data['v1'] = range(7)
print(data.drop_duplicates(['k1'])) # Drop duplicates in specific columns
data.drop_duplicates(['k1', 'k2'], keep='last') # Drop duplicates and keep last

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
    k1  k2  v1
0  one   1   0
1  two   1   1


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [7]:
# Data transformation with function mapping

data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'], 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

data['animal'] = data['food'].str.lower().map(meat_to_animal)

lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)

data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [8]:
# Data transformation with replacing values

data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace(-999, np.nan) # Replace single value
data.replace([-999, -1000], np.nan) # Replace multiple values
data.replace([-999, -1000], [np.nan, 0]) # Replace multiple values with different values
data.replace({-999: np.nan, -1000: 0}) # Replace multiple values with different values

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [9]:
# Data transformation with renaming axis indexes

data = pd.DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
transform = lambda x: x[:4].upper() # Transform function for index

data.index.map(transform) # Transform index
data.index = data.index.map(transform) # Assign transformed index
data.rename(index=str.title, columns=str.upper) # Rename index and columns
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'}) # Rename index and columns
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'}, inplace=True) # Rename index and columns in place

In [10]:
# Data discretization and binning

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32] # Ages
bins = [18, 25, 35, 60, 100] # Bins
cats = pd.cut(ages, bins) # Discretize ages into bins
print(cats)
print(cats.codes)
print(cats.categories)
pd.Series(cats).value_counts() # Count values in bins
pd.cut(ages, [18, 26, 36, 61, 100], right=False) # Change side of interval

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior'] # Group names
pd.cut(ages, bins, labels=group_names) # Assign group names

data = np.random.rand(20) # Random data
pd.cut(data, 4, precision=2) # Discretize data into 4 bins

data = np.random.randn(1000) # Random data
cats = pd.qcut(data, 4) # Quantile-based discretization
result = pd.Series(cats).value_counts() # Count values in bins
print(result)   

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]) # Quantile-based discretization with custom quantiles


[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
[0 0 0 1 0 0 2 1 3 2 2 1]
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
(-3.407, -0.701]     250
(-0.701, 0.00716]    250
(0.00716, 0.674]     250
(0.674, 3.406]       250
Name: count, dtype: int64


[(0.00716, 1.32], (0.00716, 1.32], (0.00716, 1.32], (-1.25, 0.00716], (0.00716, 1.32], ..., (0.00716, 1.32], (-1.25, 0.00716], (0.00716, 1.32], (-3.407, -1.25], (0.00716, 1.32]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.407, -1.25] < (-1.25, 0.00716] < (0.00716, 1.32] < (1.32, 3.406]]

In [11]:
# Detecting and filtering outliers

data = pd.DataFrame(np.random.randn(1000, 4)) # Random data
data.describe() # Describe data
print(data)

col = data[2] # Select column
col[np.abs(col) > 3] # Detect outliers

# outliers = data[(np.abs(data) > 3).any(1)] # Filter rows with outliers
# print(outliers)

data[np.abs(data) > 3] = np.sign(data) * 3 # Cap outliers
print(data)
np.sign(data).head() # Compute sign of each element

            0         1         2         3
0    0.400821  0.768142 -0.883975 -0.904884
1   -0.535884  0.267236  1.015331 -0.635472
2   -1.829197 -0.701704 -0.675029 -0.059943
3   -0.063197  0.104394  0.413716 -1.091740
4   -0.961904  0.315508 -1.558283  0.653537
..        ...       ...       ...       ...
995  0.484984 -0.554324 -0.874425 -1.396611
996 -0.305883 -0.726839 -0.568515  0.645232
997  0.670504  0.303797  1.344536  1.567705
998 -0.194456  1.032228  0.473385  0.413028
999  0.776214  0.155965  0.678151 -0.511930

[1000 rows x 4 columns]
            0         1         2         3
0    0.400821  0.768142 -0.883975 -0.904884
1   -0.535884  0.267236  1.015331 -0.635472
2   -1.829197 -0.701704 -0.675029 -0.059943
3   -0.063197  0.104394  0.413716 -1.091740
4   -0.961904  0.315508 -1.558283  0.653537
..        ...       ...       ...       ...
995  0.484984 -0.554324 -0.874425 -1.396611
996 -0.305883 -0.726839 -0.568515  0.645232
997  0.670504  0.303797  1.344536  1.567705
998 -0.

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,-1.0
1,-1.0,1.0,1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0
3,-1.0,1.0,1.0,-1.0
4,-1.0,1.0,-1.0,1.0


In [12]:
# Permutation and random sampling

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4))) # Random data
sampler = np.random.permutation(5) # Permutation
df.take(sampler) # Permute rows
df.sample(n=3) # Random sampling without replacement
choices = pd.Series([5, 7, -1, 6, 4]) # Random data
draws = choices.sample(n=10, replace=True) # Random sampling with replacement
print(draws)

1    7
3    6
3    6
2   -1
0    5
0    5
4    4
2   -1
3    6
0    5
dtype: int64


In [13]:
# Computing indicator/dummy variables

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)}) # Random data
print(df)
pd.get_dummies(df['key']) # Create dummy variables
dummies = pd.get_dummies(df['key'], prefix='key') # Create dummy variables with prefix
print(dummies)
df_with_dummy = df[['data1']].join(dummies) # Join dummy variables
print(df_with_dummy)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5
   key_a  key_b  key_c
0  False   True  False
1  False   True  False
2   True  False  False
3  False  False   True
4   True  False  False
5  False   True  False
   data1  key_a  key_b  key_c
0      0  False   True  False
1      1  False   True  False
2      2   True  False  False
3      3  False  False   True
4      4   True  False  False
5      5  False   True  False


In [14]:
# MovieLens 1M dataset

mnames = ['movie_id', 'title', 'genres'] # Movie data
movies = pd.read_table('../../dataset/grouplens_1m_movies.zip', sep='::', header=None, names=mnames, engine='python', encoding='latin1')
print(movies.head())

all_genres = [] # All genres
for x in movies.genres:
    all_genres.extend(x.split('|'))
    
genres = pd.Series(all_genres).unique() # Unique genres
print(genres)

zero_matrix = np.zeros((len(movies), len(genres))) # Zero matrix
print(zero_matrix)

dummy = pd.DataFrame(zero_matrix, columns=genres) # Dummy variables

gen = movies.genres[0] # First genre
print(gen.split('|'))
print(dummy.columns.get_indexer(gen.split('|')))

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
['Animation' "Children's" 'Comedy' 'Adventure' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Sci-Fi' 'Documentary' 'War'
 'Musical' 'Mystery' 'Film-Noir' 'Western']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['Animation', "Children's", 'Comedy']
[0 1 2]


In [15]:
for idx, gen in enumerate(movies.genres): # Iterate over genres
    indices = dummy.columns.get_indexer(gen.split('|')) # Get indices
    dummy.iloc[idx, indices] = 1 # Set values
    
movies_windic = movies.join(dummy.add_prefix('Genre_')) # Join dummy variables
print(movies_windic.iloc[0]) # Print first row

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                              1.0
Genre_Children's                             1.0
Genre_Comedy                                 1.0
Genre_Adventure                              0.0
Genre_Fantasy                                0.0
Genre_Romance                                0.0
Genre_Drama                                  0.0
Genre_Action                                 0.0
Genre_Crime                                  0.0
Genre_Thriller                               0.0
Genre_Horror                                 0.0
Genre_Sci-Fi                                 0.0
Genre_Documentary                            0.0
Genre_War                                    0.0
Genre_Musical                                0.0
Genre_Mystery                                0.0
Genre_Film-Noir                              0.0
Genre_Western       

In [16]:
np.random.seed(12345) # Random seed
values = np.random.rand(10) # Random values
bins = [0, 0.2, 0.4, 0.6, 0.8, 1] # Bins
pd.get_dummies(pd.cut(values, bins)) # Create dummy variables
print(pd.get_dummies(pd.cut(values, bins)))

   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0       False       False       False       False        True
1       False        True       False       False       False
2        True       False       False       False       False
3       False        True       False       False       False
4       False       False        True       False       False
5       False       False        True       False       False
6       False       False       False       False        True
7       False       False       False        True       False
8       False       False       False        True       False
9       False       False       False        True       False


In [None]:
# String manipulation

val = 'a,b,  guido' # String
print(val.split(',')) # Split string
pieces = [x.strip() for x in val.split(',')] # Strip spaces 
print(pieces)
first, second, third = pieces # Unpack values
print(first + '::' + second + '::' + third) # Concatenate values
print('::'.join(pieces)) # Join values

print('guido' in val) # Check for substring
val.index(',') # Find substring
val.find(':') # Find substring
val.count(',') # Count occurrences
val.replace(',', '::') # Replace substring
val.replace(',', '') # Replace substring

# Built-in string methods

| Method        | Description                | Example                     |
|---------------|----------------------------|-----------------------------|
| `split`       | Split strings              | `val.split(',')`            |
| `strip`       | Strip whitespace           | `x.strip()`                 |
| `join`        | Join strings               | `'::'.join(pieces)`         |
| `in`          | Check for substring        | `'guido' in val`            |
| `index`       | Find substring             | `val.index(',')`            |
| `find`        | Find substring             | `val.find(':')`             |
| `count`       | Count occurrences          | `val.count(',')`            |
| `replace`     | Replace substring          | `val.replace(',', '::')`    |

In [19]:
# Regular expressions

import re

text = "foo    bar\t baz  \tqux" # Text
print(text)
re.split('\s+', text) # Split text
regex = re.compile('\s+') # Compile regex
regex.split(text) # Split text
regex.findall(text) # Find all occurrences

foo    bar	 baz  	qux


['    ', '\t ', '  \t']

In [20]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' # Pattern
regex = re.compile(pattern, flags=re.IGNORECASE) # Compile regex
all_mails = regex.findall(text) # Find all occurrences
print(all_mails)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']


In [None]:
m = regex.search(text) # Search first occurrence
print(m)

print(regex.match(text)) # Match first occurrence
print(regex.sub('REDACTED', text)) # Substitute occurrences

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})' # Pattern
regex = re.compile(pattern, flags=re.IGNORECASE) # Compile regex
m = regex.match('web@bright.com') # Match
print(m.groups())
print(regex.findall(text)) # Find all occurrences
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text)) # Substitute occurrences

# Regex methods

| Method     | Description             | Example                       |
|------------|-------------------------|-------------------------------|
| `findall`  | Find all occurrences    | `regex.findall(text)`         |
| `search`   | Search first occurrence | `regex.search(text)`          |
| `match`    | Match first occurrence  | `regex.match(text)`           |
| `sub`      | Substitute occurrences  | `regex.sub('REDACTED', text)` |
| `groups`   | Return groups           | `m.groups()`                  |
| `compile`  | Compile regex           | `re.compile(pattern)`         |
| `finditer` | Find all occurrences    | `regex.finditer(text)`        |

In [None]:
# Vectorized string functions in pandas

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan} # Data
data = pd.Series(data) # Series
print(data)

print(data.isnull()) # Check for missing values
print(data.str.contains('gmail')) # Check for substring
print(data.str.findall(pattern, flags=re.IGNORECASE)) # Find all occurrences
print(data.str.match(pattern, flags=re.IGNORECASE)) # Match first occurrence

# Vectorized string methods in pandas DataFrame

| Method          | Description                 | Example                             |
|-----------------|-----------------------------|-------------------------------------|
| `split`         | Split strings               | `df.str.split(',')`                 |
| `strip`         | Strip whitespace            | `df.str.strip()`                    |
| `join`          | Join strings                | `df.str.join(pieces)`               |
| `contains`      | Check for substring         | `df.str.contains('gmail')`          |
| `findall`       | Find all occurrences        | `df.str.findall(pattern)`           |
| `match`         | Match first occurrence      | `df.str.match(pattern)`             |
| `replace`       | Replace substring           | `df.str.replace(',', '::')`         |
| `extract`       | Extract substrings          | `df.str.extract(pattern)`           |
| `len`           | Compute string length       | `df.str.len()`                      |
| `lower`         | Convert to lowercase        | `df.str.lower()`                    |
| `upper`         | Convert to uppercase        | `df.str.upper()`                    |
| `capitalize`    | Capitalize strings          | `df.str.capitalize()`               |
| `title`         | Titlecase strings           | `df.str.title()`                    |
| `startswith`    | Check for start of string   | `df.str.startswith('foo')`          |
| `endswith`      | Check for end of string     | `df.str.endswith('bar')`            |
| `find`          | Find position of substring  | `df.str.find('foo')`                |
| `rfind`         | Find last position          | `df.str.rfind('foo')`               |
| `count`         | Count occurrences           | `df.str.count('foo')`               |
| `isalnum`       | Check for alphanumeric      | `df.str.isalnum()`                  |
| `isalpha`       | Check for alphabetic        | `df.str.isalpha()`                  |
| `isdecimal`     | Check for decimal           | `df.str.isdecimal()`                |
| `isdigit`       | Check for digits            | `df.str.isdigit()`                  |
| `isspace`       | Check for whitespace        | `df.str.isspace()`                  |
| `islower`       | Check for lowercase         | `df.str.islower()`                  |
| `isupper`       | Check for uppercase         | `df.str.isupper()`                  |
| `istitle`       | Check for titlecase         | `df.str.istitle()`                  |
| `isnumeric`     | Check for numeric           | `df.str.isnumeric()`                |
| `isidentifier`  | Check for identifier        | `df.str.isidentifier()`             |
| `isprintable`   | Check for printable         | `df.str.isprintable()`              |
| `zfill`         | Pad strings with zeros      | `df.str.zfill(10)`                  |
| `strip`         | Strip whitespace            | `df.str.strip()`                    |
| `rstrip`        | Strip whitespace from right | `df.str.rstrip()`                   |
| `lstrip`        | Strip whitespace from left  | `df.str.lstrip()`                   |
| `rjust`         | Right-justify strings       | `df.str.rjust(10)`                  |
| `ljust`         | Left-justify strings        | `df.str.ljust(10)`                  |
| `center`        | Center strings              | `df.str.center(10)`                 |
| `pad`           | Pad strings                 | `df.str.pad(10)`                    |
| `wrap`          | Wrap strings                | `df.str.wrap(10)`                   |
| `slice`         | Slice strings               | `df.str.slice(0, 5)`                |
| `slice_replace` | Replace slice               | `df.str.slice_replace(0, 5, 'foo')` |
| `cat`           | Concatenate strings         | `df.str.cat()`                      |
| `repeat`        | Repeat strings              | `df.str.repeat(3)`                  |
| `normalize`     | Normalize strings           | `df.str.normalize()`                |
| `partition`     | Partition strings           | `df.str.partition('foo')`           |
| `rpartition`    | Reverse partition strings   | `df.str.rpartition('foo')`          |
