# DataFrame: Basics

In [1]:
import pandas as pd

In [None]:
# DataFrame construction using a dict.
# Each key becomes a column name; each list holds that column's values.
pd.DataFrame(
    {
        "col_int":   [1, 2, 3, 4],
        "col_float": [1.1, 2.2, 3.3, 4.4],
        "col_str":   list("ABCD"),
    }
)

In [None]:
# Or using a matrix and column descriptions.
# Each inner tuple is one row; `columns` names the positions within a row.
pd.DataFrame(
    data=(
        (1, 1.1, "A"),
        (2, 2.2, "B"),
        (3, 3.3, "C"),
        (4, 4.4, "D"),
    ),
    columns=["col_int", "col_float", "col_str"],
)

In [None]:
# Or just load data from a file.
# Note: The relative path is resolved against the current working directory.
df = pd.read_csv("../0_data/persons/personal_data.csv")
df

In [None]:
# read_csv() infers a suitable data type for each column while parsing.
# The .dtypes attribute lists the resulting dtype per column.
df.dtypes

## Descriptive Statistics

In [None]:
# Build a small demo frame.
# Explicit string row labels make label-based and positional
# selection easy to tell apart.
df = pd.DataFrame(
    data={
        "col_int": [1, 2, 3, 4],
        "col_float": [1.1, 2.2, 3.3, 4.4],
        "col_str": ["A", "B", "C", "D"],
    },
    index=["alpha", "beta", "gamma", "delta"],
)
df

In [None]:
# DataFrame dimensions as a tuple: (nrows, ncolumns).
# Also referred to as (axis 0, axis 1).
df.shape

In [None]:
# Get column names using the .columns attribute.
# Note: Returns an Index object, not a plain list.
df.columns

In [None]:
# Row index with row labels.
# Note: Defaults to a RangeIndex when no index is given; this frame was
# built with explicit string labels, so those are shown instead.
df.index

In [None]:
# Descriptive statistics are available via .describe().
# Note: Includes only numeric columns by default.
df.describe()

In [None]:
# Specify which data types to summarize using the include parameter.
# Example options: "number", "object", "category".
df.describe(include=["object"])

In [None]:
# The info method prints technical details:
# index, column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# By default, info() does not count the Python objects held in object
# columns, so the reported memory usage is only a lower bound!
# Use memory_usage="deep" to get actual usage.
df.info(memory_usage="deep")

## Selecting Data

### Sampling

In [None]:
# Top rows (head() returns the first 5 by default)
df.head()

In [None]:
# Bottom N rows (here N=2)
df.tail(2)

In [None]:
# Random sample of N rows (here N=2)
# Note: The result differs between runs unless random_state is set.
df.sample(2)

In [None]:
# Sample a fraction of the rows (here 50%)
df.sample(frac=0.5)

### Selection Patterns

In [None]:
# String => Selects a single column as a Series.
df["col_int"]

In [None]:
# List of names => Returns a DataFrame, even for a single column.
df[["col_int"]]

In [None]:
# Select multiple columns.
# Note: Columns are returned in the order given in the list.
df[["col_float", "col_int"]]

In [None]:
# Slice => Select rows using a positional (integer) range.
# Note: The stop position is *excluded*, so this returns rows 0 and 1.
df[0:2]

In [None]:
# Slice => Select rows using index labels.
# Note: Label slices *include* the stop label, so row gamma is returned!
df["alpha":"gamma"]

### Rows and Columns

In [None]:
# Using two separate steps.
# Selects rows first (label slice) and then columns (list of names)
# from the intermediate result.
# Note: Fine for reading; for assignment use a single .loc[] call instead.
df["alpha":"gamma"][["col_int", "col_float"]]

In [None]:
# Simultaneous selection of rows and columns using .loc[].
# Note: Uses labels for both rows and columns!
df.loc["alpha":"gamma", ["col_int", "col_float"]]

In [None]:
# Use .iloc[] for positional selection.
# Note: Rows and columns must be given as integer positions,
# and the slice stop (3) is excluded!
df.iloc[0:3, [0, 1]]

### Conditional Selection

In [None]:
# Select rows using a list of boolean values.
# Note: The list needs one entry per row; rows marked True are kept.
mask = [True, False, True, False]
df[mask]

In [None]:
# A comparison operator creates a Series of boolean values.
df["col_int"] < 3

In [None]:
# Use it to select DataFrame rows (boolean indexing).
df[df["col_int"] < 3]

In [None]:
# Or use the query method instead; the condition is passed as a string.
df.query("col_int < 3")

In [None]:
# Combining multiple conditions is easy using query().
# Here: col_int below 3, or col_str equal to 'A' or 'D'.
df.query("col_int < 3 or col_str in ('A', 'D')")