In [None]:
import numpy as np
import pandas as pd

from utils import list_methods

# DataFrame

- Higher order building block in pandas
- 2 dimensional; rows and columns
- Collection of series

## Construction

In [None]:
# Using matrix and supplying column names
pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    columns=list("xyz")
)

In [None]:
# List of dicts, column names are inferred from the keys.
# Note: missing values are introduced when a dict is missing a key.
pd.DataFrame([
    {"x": 1, "y": 2, "z": 3},
    {"x": 4, "y": 5, "z": 6},
    {"x": 7, "y": 8, "z": 9},
])

In [None]:
# Series-wise using a dict
# Note: all lists must be of equal length.
pd.DataFrame({
    "x": [1, 4, 7],
    "y": [2, 5, 8],
    "z": [3, 6, 9],
})

In [None]:
# Or use one of the read methods
# Lift the column-width limit so long cell values are displayed in full.
pd.set_option("display.max_colwidth", None)
pandas_methods = list_methods(pd)
is_reader = pandas_methods["name"].str.startswith("read")
pandas_methods.loc[is_reader]

## Info and descriptives

In [None]:
# Sample data
# A seeded generator keeps the sample identical on every "Restart & Run All",
# so the outputs shown below stay in sync with the narrative.
n = 10
rng = np.random.default_rng(42)
df = pd.DataFrame({
    "x": rng.integers(0, 100, n),       # random integers in [0, 100)
    "y": rng.normal(0, 1, n),           # draws from a standard normal distribution
    "z": rng.choice(["ja", "nee"], n),  # two-level text column
})
df

In [None]:
# DataFrame shape (# rows, # columns).
# Note: tuple values correspond to axis 0 and 1 of the DataFrame.
df.shape

In [None]:
# Print descriptive DataFrame information.
df.info()

In [None]:
# By default the memory of object columns is only estimated shallowly (note the "+"
# in the reported memory usage), while they often have the biggest footprint.
# Use memory_usage="deep" to measure object columns accurately (slower to compute).
df.info(memory_usage="deep")

In [None]:
# Describe reports only numeric columns by default.
df.describe()

In [None]:
# Use include argument to switch to categorical columns.
# Note: include="all" mixes both numeric and categorical columns.
df.describe(include="object")

In [None]:
# Print top rows; can supply an integer to print more / fewer rows.
df.head() # or .tail()

In [None]:
# Using a random sample is more insightful; rows may differ across the data set.
df.sample(5)

## Indices

### Rows

In [None]:
# Just like series, DataFrames have a row index.
# Note: all columns (Series) in a DataFrame share the same index.
df.index

In [None]:
# Change the index in place (ugly solution!)
df.index = range(11, 21)
df

In [None]:
# Example: function that makes inplace changes to a DataFrame
def change_index(df):
    """Shift every index label of ``df`` up by 10, modifying ``df`` itself.

    The new index is assigned directly onto the frame that was passed in,
    so the caller's object is altered; that same object is returned.
    """
    shifted_labels = df.index + 10
    df.index = shifted_labels
    return df

In [None]:
# Seems fine; index of prep_df is as we would expect...
prep_df = change_index(df)
prep_df.index

In [None]:
# But... the index of the original DataFrame was also changed!
# Users may not expect this kind of behavior...
df.index

In [None]:
# Let's reset the index to its original range.
df = df.reset_index(drop=True)
df.index

In [None]:
# Nicer way to set the index; use set_index.
# Creates a time-based index: one daily timestamp per row, starting at 2020-01-01.
# Note: periods=len(df) keeps the index length in sync with the number of rows,
# so this cell keeps working when the sample size changes.
df_ts = df.set_index(pd.date_range("2020-01-01", periods=len(df)))
df_ts.index

In [None]:
# Note that the original index is left as-is!
df.index

### Columns

In [None]:
# Column names are also an index.
df.columns

In [None]:
# Rename columns (ugly in-place solution!)
df.columns = ["a", "b", "c"]
df.head(3)

In [None]:
# Use set_axis instead
df_renamed = df.set_axis(["x", "y", "z"], axis=1)
df_renamed.head()

In [None]:
# df still has the original column names.
df.columns

In [None]:
# Use rename method to rename specific columns only.
df_renamed = df.rename(columns={
    "a": "A very long name",
    "c": "Categorical Data"}
)
df_renamed

In [None]:
# Create a rename function that processes one name at a time.
def convert_columns(colname):
    """Return ``colname`` lower-cased with every space turned into an underscore."""
    lowered = colname.lower()
    return "_".join(lowered.split(" "))

In [None]:
# Apply the function to each column name
df_renamed.rename(columns=convert_columns)

## Selecting Data

### Rows

In [None]:
# Select rows using a slice (lower limit:upper limit)
df[0:5]

In [None]:
# Slice with step size of 2.
# Note: lower and upper limit can be omitted to select all rows.
df[::2]

In [None]:
# Slice using index values (dates in case of a DatetimeIndex).
df_ts["2020-01-01":"2020-01-05"]

In [None]:
# Create a boolean mask from a conditional expression.
mask = df["a"] > 50
mask

In [None]:
# Use the boolean mask to select rows.
df[mask]

In [None]:
# Or simply in one go.
df[df["a"] > 50]

Conclusion: A **slice** or **boolean mask** is used to select rows!

### Columns

In [None]:
# Use a string to select a single column.
df["a"]

In [None]:
# Note that this returns a Series (not a DataFrame).
type(df["a"])

In [None]:
# Using a list - even with a single element - returns a DataFrame.
type(df[["a"]])

In [None]:
# Obviously, a list can be used to select multiple columns too.
df[["a", "c"]]

Conclusion: Use a **string** or **list of strings** to select columns!

### Rows + columns

Use the `.loc[]` indexer to select both rows and columns.

In [None]:
# Selects index values(!) 0 - 5 and columns a + b.
# Note: index slicing includes the upper limit value.
df.loc[0:5, ["a", "b"]]

In [None]:
# Omit limits to select all rows.
df.loc[:, ["a", "b"]]

In [None]:
# Same goes for columns; empty slice selects all.
df.loc[0:3, :]

In [None]:
# Can also use the .iloc[] indexer, but it only accepts integer positions
# (and, unlike .loc[], a positional slice excludes the upper limit).
# Since we are used to column names, this is often impractical...
df.iloc[0:3, [0, 1]]