In [None]:
import sys
import timeit
import random

import numpy as np
import pandas as pd

from utils import list_methods

## Series

- Basic building block in pandas
- Vector; 1-dimensional collection of objects

### Construction and data types

In [None]:
# Construct Series from list of integers.
# Note: automatically converts to numpy int64 data type.
s = pd.Series([1, 2, 3, 4])
s

In [None]:
# Elements must all be of the same data type.
# Note: a single string value is enough to trigger a (widening) cast to the object data type.
sobj = pd.Series(["1", 2, 3, 4])
sobj

In [None]:
# Big difference in memory usage...
print(f"Size s:     {sys.getsizeof(s) * 8:10d} bits.")
print(f"Size sobj:  {sys.getsizeof(sobj) * 8:10d} bits.")
print(f"Difference: {(sys.getsizeof(sobj) - sys.getsizeof(s)) * 8:10d} bits.")

In [None]:
# Categorical data type to the rescue!
n = 10
sobj = pd.Series(np.random.choice(["ja", "nee"], n))
scat = pd.Categorical(sobj)

In [None]:
# Categorical saves a lot of memory.
print(f"Size sobj:  {sys.getsizeof(sobj) * 8:10d} bits.")
print(f"Size scat:  {sys.getsizeof(scat) * 8:10d} bits.")
print(f"Difference: {(sys.getsizeof(sobj) - sys.getsizeof(scat)) * 8:10d} bits.")

In [None]:
# Note: memory saved depends a lot on cardinality; high cardinality corresponds to lower memory saving.

In [None]:
# Missing values can also cause implicit (widening) cast (int64 => float64).
# Casts are caused by the fact that np.nan is float type.
sf64 = pd.Series([1, 2, 3, 4, None])
sf64

In [None]:
# Newer versions of pandas implement pandas.NA (experimental):
# https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html#integer-na.

In [None]:
# Using a smaller data type saves memory.
sf16 = pd.Series([1, 2, 3, 4, None], dtype="float16")
sf16

In [None]:
# Difference for two series of 5 float values.
print(f"Size 64 bit: {sys.getsizeof(sf64) * 8:10d} bits.")
print(f"Size 16 bit: {sys.getsizeof(sf16) * 8:10d} bits.")
print(f"Difference: {(sys.getsizeof(sf64) - sys.getsizeof(sf16)) * 8:10d} bits.")

In [None]:
# Matches expected reduction for 5 items.
5 * (64 - 16)

## Compared to lists

In [None]:
# Lists do not support many common math operations...
[1, 2, 3, 4] + 1

In [None]:
def list_add(list_, value):
    """Return a new list with `value` added to every element of `list_`.

    The input list is left untouched.
    """
    return [element + value for element in list_]

In [None]:
# Can build helper functions to solve these issues, but cumbersome...
list_add([1, 2, 3, 4], 1)

In [None]:
# Series support math operators as expected
s = pd.Series([1, 2, 3, 4])

s + 1

In [None]:
# Multiplication: Counter-intuitive results for lists...
[1, 2, 3] * 2

In [None]:
# Multiplication: Also dependent on operands...
[1, 2, 3] * [1, 2, 3]

In [None]:
def list_mult(list_, multiplier):
    """Element-wise multiplication for lists.

    Parameters
    ----------
    list_ : list
        Values to multiply.
    multiplier : number or iterable
        A scalar (any ``numbers.Number``: int, float, complex, Fraction,
        Decimal, ...) multiplies every element of ``list_``. Any other
        value is treated as an iterable and multiplied pair-wise with
        ``list_``.

    Returns
    -------
    list
        New list holding the products. With an iterable multiplier the
        result is as long as the shorter of the two inputs, because the
        pairs are formed with ``zip``.
    """
    import numbers

    # numbers.Number covers every numeric scalar, not just int and float.
    # The narrower (int, float) check sent e.g. complex values to the zip
    # branch below, where they failed with "object is not iterable".
    if isinstance(multiplier, numbers.Number):
        return [item * multiplier for item in list_]
    return [left * right for left, right in zip(list_, multiplier)]

In [None]:
# Another (ugly) work-around with a helper function...
list_mult([1, 2, 3], 2)

In [None]:
list_mult([1, 2, 3], [1, 2, 3])

In [None]:
# Much easier with a pandas Series...
s1 = pd.Series([1, 2, 3])
s2 = pd.Series([1, 2, 3])

s1 * 2

In [None]:
s1 * s2

## Speed comparison

In [None]:
# Series are faster than lists!
# Benchmark: time the built-in sum() on a list against Series.sum()
# for a range of sample sizes.

# Define increasing sample sizes
sizes = 1_000, 10_000, 100_000, 500_000, 1_000_000

# One dict of timings per sample size is collected here.
times = []
# Number of times timeit executes each timed statement.
n_exec = 100
# Draw the largest sample once; each smaller sample is a prefix of it.
pop = [random.random() for _ in range(sizes[-1])]
for n in sizes:

    # Construct a list and series.
    # Note: this happens outside the timed statements, so the
    # construction cost is excluded from the measurements.
    l = pop[0:n]
    s = pd.Series(l)

    times.append(({
        "sample_size": n,

        # sum(list)
        # globals=globals() lets the statement string resolve `l` (and `s` below).
        "sum(list)": timeit.timeit("sum(l)", globals=globals(), number=n_exec),

        # Series.sum()
        "Series.sum()": timeit.timeit("s.sum()", globals=globals(), number=n_exec),

        # sum(Series) -- left disabled.
        # Note: worst of both worlds, forces conversion from numpy back to python
        # "sum(Series)": timeit.timeit("sum(s)", globals=globals(), number=n_exec),
    }))

In [None]:
# Visualise the collected timings, with the sample size on the x-axis.
ax = pd.DataFrame(times).plot(x="sample_size", figsize=(12, 8), marker="o")
ax.set_title(f"List versus Series - {n_exec} runs")
ax.set_xlabel("Sample size")
ax.set_ylabel("Execution time (seconds)")

## Indices

In [None]:
# By default Series have a numeric index (RangeIndex).
s1 = pd.Series(list("ABCD"))
s1

In [None]:
s1.index

In [None]:
# Use index argument to set a custom index.
s2 = pd.Series(list("ABCD"), index=list("abcd"))
s2

In [None]:
# Slice using row numbers.
s2[0:3]

In [None]:
# Slice using the index values.
s2["a":"c"]

In [None]:
# But... What if you are using a RangeIndex?
s1.index

In [None]:
# Ambiguous: Does this use index labels or row numbers???
s1[0:3]

### Using loc and iloc

In [None]:
# Use .loc[] for label-based (index value) slicing.
# Note: label-based slicing includes the upper limit.
s1.loc[0:3]

In [None]:
# Use .iloc[] for row number slicing
# Note: row number slicing excludes upper limit...
s1.iloc[0:3]

### Computation and indices

In [None]:
# Computation uses matched indices
a = pd.Series([1, 2, 3, 5], index=[1, 2, 3, 5])
b = pd.Series([1, 2, 3, 4], index=[1, 2, 3, 4])

In [None]:
# Note the missing values here caused by the different indices of a and b!
a * b

## Common methods

Series support many methods; the section below shows only some very common ones.

### Numeric series

In [None]:
numeric = pd.Series(np.random.normal(0, 1, 10))

In [None]:
# Print descriptive statistics appropriate for numeric data.
numeric.describe()

In [None]:
# Common aggregation methods
for method in "count", "mean", "std", "min", "median", "max":
    print(f"numeric.{method + '()':10s} {getattr(numeric, method)():>10.3f}")

In [None]:
# Support for common mathematical operations
numeric.abs()

In [None]:
# Ranking of values
numeric.rank()

### Categorical series

In [None]:
categorical = pd.Series(np.random.choice(list("ABCD"), 10))

In [None]:
# Print descriptives appropriate for categorical data.
categorical.describe()

In [None]:
# Row counts per category.
# Note: returns a Series, which is sorted by count (descending) by default.
categorical.value_counts()

In [None]:
# Get unique category values.
categorical.unique()

In [None]:
# Number of unique category values.
categorical.nunique()

In [None]:
# Find modal value(s).
# Note: returns a Series as there can be multiple modes!
categorical.mode()

In [None]:
# Use replace method to recode category values.
# Note: categories not in the mapping are left as-is.
categorical.replace({
    "A": "a",
    "B": "b",
    "C": "c",
})

In [None]:
# Using map is an alternative.
# Note: categories missing from the mapping now turn into missing values (NaN)!
categorical.map({
    "A": "a",
    "B": "b",
    "C": "c",
})

### Missing values

In [None]:
s = pd.Series([1, 2, None, 4, None, None, 7, 8, 9])

In [None]:
# Check for missing values.
s.hasnans

In [None]:
# Missing or not (see also: s.notna()).
s.isna()

In [None]:
# Count the missing values.
# Note: for fraction / percentage, use .mean() instead.
s.isna().sum()

In [None]:
# Filling missing values with series mean.
s.fillna(s.mean())

In [None]:
# Forward fill copies last good value
# Note: sort order matters here of course!
s.ffill()

In [None]:
# Interpolation (default: linear interpolation)
s.interpolate()

In [None]:
# Drop missing values altogether
s.dropna()

### Custom functions

In [None]:
# Fake financial data
financial = pd.Series(np.random.exponential(10_000, 10))
financial

In [None]:
import locale
locale.setlocale(locale.LC_ALL, "nl_NL")

def format_financial(value):
    """Render `value` as a currency string for the currently active locale.

    Thousands grouping is switched on.
    """
    formatted = locale.currency(value, grouping=True)
    return formatted

In [None]:
# Use .map() to apply your custom function to all values
# Note: your function should process a single value per call.
financial.map(format_financial)

### All methods of Series

In [None]:
# Run the code below to view all available methods.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

# Uncomment for spam!
# list_methods(pd.Series)