In [19]:
import sys
import datetime as dt

import numpy as np
import pandas as pd

# Pandas vs Python

In [None]:
# Wrap a plain Python list of 100,000 integers in a Series.
x = [*range(100_000)]
series = pd.Series(data=x)
series.shape

In [None]:
# Baseline: Python's built-in sum() over the plain list `x`.
# Note: -n 25 executes the statement 25 times per timing loop.
%timeit -n 25 sum(x)

In [None]:
# Same data, summed with the Series' own sum() method.
# Note: Data is processed in numpy.
%timeit -n 25 series.sum()

In [None]:
# Anti-pattern: Python's built-in sum() applied to the Series.
# Note: This forces a slow conversion; compare with series.sum().
%timeit -n 25 sum(series)

# Series

In [None]:
# A list of integers gives the Series the data type `int64`.
s = pd.Series(data=[1, 2, 3])
s

In [None]:
# One string among the integers turns the Series into type `object`.
s = pd.Series(data=[1, 2, 3, "A"])
s

In [None]:
# Show the dtype of the Series and the Python type of each element.
# Note: The loop variable gets a real name. `_` conventionally marks an
# unused value, and IPython uses it for the previous cell output.
print("Series data type: ", s.dtype)
for value in s:
    print(f"Data type value: {value} = {type(value)}")

In [None]:
n = 10

# Use of Categorical type is often more efficient.
# Note: dtype="object" is requested explicitly so `obj` really is an object
# Series, whatever string dtype pandas would otherwise infer.
values = ["A", "B", "C"] * n
obj = pd.Series(values, dtype="object")
cat = pd.Series(values, dtype="category")

# Compare memory footprint in bytes.
# Note: memory_usage(deep=True) is the documented pandas API; with deep=True
# the Python objects referenced by an object Series are counted as well.
print("Size object type:      ", obj.memory_usage(deep=True))
print("Size categorical type: ", cat.memory_usage(deep=True))

## Type Casting 

In [None]:
# A Series holding floating point numbers.
s = pd.Series(data=[1.1, 2.2, 3.3])
s

In [None]:
# The astype() method casts the Series to another data type, here integer.
# Note: The decimals are dropped, just like a normal int() conversion.
s.astype(dtype=int)

In [None]:
# ValueError when conversion is impossible.
# Note: Similar to Python cast. The error is caught and printed so that this
# demonstration does not abort a "Restart Kernel -> Run All".
try:
    pd.Series(["A", "B", "C"]).astype(int)
except ValueError as error:
    cast_error = error  # kept so the exception can be inspected afterwards
    print(f"{type(error).__name__}: {error}")

## Descriptives

In [48]:
# Generate dummy data.
# Note: A seeded generator makes every run produce the same values, so the
# descriptives and plots below are reproducible.
n = 150
rng = np.random.default_rng(seed=42)
cat = pd.Series(rng.choice(["A", "B", "C", "D"], n))
num = pd.Series(rng.normal(0, 1, n))

In [None]:
# Descriptives for numerical data.
# Note: describe() bundles the summary statistics of `num` in one result.
num.describe()

In [None]:
# Or call the individual aggregation methods...
mean, median, sd = num.mean(), num.median(), num.std()
print(f"Mean:   {mean:8.3f}")
print(f"Median: {median:8.3f}")
print(f"SD:     {sd:8.3f}")

In [None]:
# Histogram of `num`, drawn through the .plot interface of the Series.
histogram = num.plot.hist
histogram(bins=50, edgecolor="white", figsize=(6, 3))

In [None]:
# Descriptives for categorical data.
# Note: The same describe() method is used as for the numerical Series.
cat.describe()

In [None]:
# The distinct categories and how many of them there are.
unique_values = cat.unique()
unique_count = cat.nunique()
print(f"Unique values:  {unique_values}")
print(f"Unique count:   {unique_count}")

In [None]:
# Category frequencies: the number of occurrences per distinct value.
cat.value_counts()

In [None]:
# Horizontal bar chart of the category frequencies.
# Note: value_counts() returns a Series, so the plot interface is available.
cat.value_counts().plot.barh(figsize=(6, 3))

In [None]:
# Same chart, with the counts re-ordered by sort_values() first.
frequencies = cat.value_counts().sort_values(ascending=True)
frequencies.plot.barh(figsize=(6, 3))

## Data Manipulation

In [57]:
# Small integer Series running from -2 up to and including 2.
num = pd.Series(range(-2, 3))

In [None]:
# Series support indexing.
# Note: `num` was created without an index, so 0 selects its first value.
num[0]

In [None]:
# And slicing, with the same start:stop notation as a Python list.
num[1:4]

In [None]:
# Use assignment to modify values.
# Note: `num` holds integers, so it is cast to float64 explicitly before a
# float is assigned. Relying on an implicit upcast during assignment is
# deprecated since pandas 2.1.
num = num.astype("float64")
num[0] = -3.5
num

In [None]:
# Arithmetic on a Series works element-wise: square each value, then halve it.
(num ** 2) / 2

In [None]:
# Custom function.
def abs_sqrt(x):
    """Return the square root of the absolute value of `x`."""
    magnitude = abs(x)
    return pow(magnitude, 0.5)


# Apply function via .map() method.
# Note: The function object itself is passed to map(); it is not called here.
num.map(abs_sqrt)

In [None]:
# Alternative: pass the same calculation as an anonymous (lambda) function.
num.map(lambda value: abs(value) ** 0.5)

# Indices and index types

In [None]:
# The .index property gives access to the index of a Series.
# Note: Without an explicit index, a RangeIndex is created automatically.
scores = pd.Series(data=[1, 2, 3, 4])
scores.index

In [None]:
# Use numeric indexing.
# Note: With the default RangeIndex, 1 selects the second value.
scores[1]

In [None]:
# Pass your own labels through the index parameter.
# Note: Default RangeIndex is replaced by an Index.
scores = pd.Series(data=[1, 2, 3, 4], index=["Piet", "Jan", "Ingrid", "Henk"])
scores.index

In [None]:
# Indexing now uses labels: look up a value by the name given in the index.
scores["Jan"]

In [None]:
# And slicing using labels too.
# Note: Start and stop of the slice are index labels instead of positions.
scores["Jan":"Henk"]

In [None]:
# Sort series by index rather than values (sort_values).
# Note: The result is not assigned, so `scores` keeps its original order.
scores.sort_index()

In [None]:
# Time-series data: label each measurement with a datetime value.
# Note: 15 and 16 December 2022 are a Thursday and Friday, 19 and 20 December
# a Monday and Tuesday; the weekend in between has no data.
max_temp = pd.Series(
    data=[2.1, 1.5, -0.5, -2.1],
    index=[dt.datetime(2022, 12, day) for day in (15, 16, 19, 20)],
)
max_temp

In [None]:
# Series has a DatetimeIndex, created from the datetime values.
# Note: Index has frequency, but set to None.
max_temp.index

In [None]:
# Resampling to "1D" sets / changes the frequency of the series to one day.
# Note: interpolate() fills the values that are missing after resampling.
interpolated = max_temp.resample("1D").interpolate()
interpolated

In [None]:
# Inspect the index of the resampled series; compare it with max_temp.index.
interpolated.index

## Index Alignment

In [83]:
# Two Series with the same index labels, listed in opposite order.
a = pd.Series(data=[f"A{label}" for label in (0, 1, 2)], index=[0, 1, 2])
b = pd.Series(data=[f"B{label}" for label in (2, 1, 0)], index=[2, 1, 0])

In [None]:
# Concatenate values from two series.
# Note: values are aligned using their indices!
# `b` lists its labels in reverse, so values pair up by label, not position.
a + " - " + b

In [None]:
# Only labels present in both Series get a result; the rest is missing.
c = pd.Series(data=["C1", "C2", "C3"], index=[1, 2, 3])
(a + " - ") + c

## Conditional Selections

In [None]:
# Dummy data: five integers around zero.
num = pd.Series(data=[-2, -1, 0, 1, 2])
num

In [None]:
# Select with a list of booleans: only positions marked True are kept.
num[[False, False, True, True, True]]

In [None]:
# Boolean operators return a Series of boolean values.
# Note: The comparison is evaluated for each value of `num` separately.
num >= 0

In [None]:
# Pass the boolean Series as a selector to keep the matching values.
num.loc[num >= 0]

In [None]:
# Combine conditions using | (or) or & (and).
# Note: Requires brackets! | and & bind tighter than < and >, so each
# comparison needs its own pair of brackets.
num.loc[(num < 0) | (num > 0)]

## Accessors

In [93]:
# Names written in inconsistent case.
names = pd.Series(data=["john", "JANE", "Jack"])

In [None]:
# Fix the case by mapping a lambda over the names.
names.map(lambda name: name.capitalize())

In [None]:
# Or using the .str accessor of the series.
# Note: The accessor offers string methods for the whole Series at once.
names.str.capitalize()

In [None]:
# Accessor calls can be chained; each step needs its own .str prefix.
# Note: One operation per line keeps the chain readable.
(
    names
    .str.strip()
    .str.capitalize()
)

In [None]:
# Dummy dates: a date range from 1 to 5 January 2022, wrapped in a Series.
dates = pd.Series(pd.date_range(start="2022-01-01", end="2022-01-05"))
dates

In [None]:
# Use .dt accessor to get the day.
# Note: .dt is the datetime counterpart of the .str accessor.
dates.dt.day

In [None]:
# Use date methods like strftime().
# Note: %A = weekday name, %d = day of the month, %B = month name, %Y = year.
dates.dt.strftime("%A %d %B %Y")