# DataFrame: Basics

In [None]:
import pandas as pd

In [None]:
# DataFrame construction using a dict.
# Keys become the column names; each list holds that column's values.
# Note: No index is passed, so the rows get the default RangeIndex.
pd.DataFrame(
    {
        "col_int":   [1, 2, 3, 4],
        "col_float": [1.1, 2.2, 3.3, 4.4],
        "col_str":   list("ABCD"),  # => ["A", "B", "C", "D"]
    }
)

In [None]:
# Or using a matrix and column / index descriptions.
# Each inner tuple is one row; `columns` names the columns
# and `index` supplies the row labels.
pd.DataFrame(
    data=(
        (1, 1.1, "A"),
        (2, 2.2, "B"),
        (3, 3.3, "C"),
        (4, 4.4, "D"),
    ),
    columns=["col_int", "col_float", "col_str"],
    index=["a", "b", "c", "d"],
)

In [None]:
# Or just load data from file.
# header=None: the first line of the file is read as data, not as
# column names, so the column names are supplied via `names`.
# Note: The relative path is resolved against the current working directory.
pd.read_csv(
    "../0_data/persons/personal_data.csv",
    header=None,
    names=["name", "gender", "age"],
)

## Descriptive Statistics

In [None]:
# Build the small demo frame used by the remaining cells:
# one integer, one float and one string column, four labelled rows.
df = pd.DataFrame(
    data={
        "col_int": [1, 2, 3, 4],
        "col_float": [1.1, 2.2, 3.3, 4.4],
        "col_str": ["A", "B", "C", "D"],
    },
    index=["alpha", "beta", "gamma", "delta"],
)
# Last expression in the cell => the notebook renders the frame.
df

In [None]:
# DataFrame dimensions as a tuple: (nrows, ncolumns).
# Note: shape is an attribute, not a method (no parentheses).
df.shape

In [None]:
# Get column names using .columns attribute.
# Note: returns an Index object, not a plain list.
df.columns

In [None]:
# Row index with row labels.
# Note: defaults to RangeIndex when no index is passed;
# this frame was built with explicit string labels instead.
df.index

In [None]:
# Column data types, one entry per column.
# Note: returns Series with column name as index.
df.dtypes

In [None]:
# Descriptive statistics are available via .describe().
# Note: Only includes numeric columns by default
# (here: col_int and col_float).
df.describe()

In [None]:
# Specify data types using the include parameter.
# Example options: "number", "object", "category".
# NOTE(review): newer pandas versions may infer string columns with a
# dedicated string dtype instead of "object" - confirm that this
# selection still matches col_str on the pandas version in use.
df.describe(include=["object"])

In [None]:
# The info method provides technical details
# (index, columns, non-null counts, dtypes and memory usage).
# Note: info() prints its report instead of returning it.
df.info()

In [None]:
# By default, info() does not count the real size of object
# (e.g. string) data in its memory usage figure!
# Use memory_usage="deep" to get actual usage.
df.info(memory_usage="deep")

## Selecting Data

### Sampling

In [None]:
# Top rows
# Note: head() returns the first 5 rows unless n is passed.
df.head()

In [None]:
# Bottom N rows (here: N = 2).
df.tail(2)

In [None]:
# Random sample of N rows (here: N = 2).
# Note: The result changes between runs unless random_state is set.
df.sample(2)

In [None]:
# Sample a fraction of the rows (here: 50 %, i.e. 2 of 4 rows).
df.sample(frac=0.5)

### Selection Patterns

In [None]:
# String => Selects a single column as a Series.
df["col_int"]

In [None]:
# List => Selects columns as a DataFrame.
# Note: The outer brackets index, the inner brackets form the list.
df[["col_int", "col_float"]]

In [None]:
# List => A single-element list also returns a DataFrame
# (not a Series as with a plain string key).
df[["col_int"]]

In [None]:
# Slice => Select rows using a numeric (positional) range.
# Note: Selection EXcludes row 2, i.e. returns positions 0 and 1.
df[0:2]

In [None]:
# Slice => Can use index labels instead.
# Note: Selection INcludes "gamma"! Label slices contain both endpoints.
df["alpha":"gamma"]

### Rows and Columns

In [None]:
# Using two separate steps.
# Selects rows first and then columns.
# Note: Such chained indexing is fine for reading, but should not be
# used for assignment; prefer a single .loc[] / .iloc[] call there.
df[0:2][["col_int", "col_float"]]

In [None]:
# Simultaneous selection of rows and columns using .loc[].
# Note: Uses labels for both rows and columns!
# The row slice is label-based, so "gamma" is included.
df.loc[
    "alpha":"gamma", ["col_int", "col_float"]
]

In [None]:
# Use .iloc[] for positional selection.
# Note: Rows and columns must be supplied as integer positions!
# The row slice 0:2 excludes position 2.
df.iloc[0:2, [0, 1]]

### Conditional Selection

In [None]:
# Select rows using boolean values.
# One boolean per row: True keeps the row, False drops it.
mask = [True, False, True, False]
df[mask]

In [None]:
# Comparison operator creates a Series of boolean values
# (one per row, sharing the DataFrame's index).
df["col_int"] < 3

In [None]:
# Use the boolean Series to select DataFrame rows:
# only rows where the condition is True are kept.
df[df["col_int"] < 3]

In [None]:
# Or use the query method instead.
# The condition is passed as a string in which column names
# can be used like variables.
df.query("col_int < 3")

In [None]:
# Combining multiple conditions is easy using query().
# Equivalent boolean-mask form:
#   df[(df["col_int"] < 3) | df["col_str"].isin(["A", "C"])]
df.query("col_int < 3 or col_str in ('A', 'C')")