Pandas documentation:
https://pandas.pydata.org/docs/user_guide/

In [None]:
import numpy as np
import pandas as pd

#### Creating DataFrame

In [None]:
# Build a 6x4 DataFrame of random values from a NumPy array,
# labelling the rows with consecutive days and the columns with letters.

# Row labels: six daily timestamps beginning on 2020-01-01
dates = pd.date_range("2020-01-01", periods=6, freq="D")
# Cell values: 6 rows x 4 columns of random draws
data = np.random.randn(6, 4)
# Column labels
columns = ["A", "B", "C", "D"]

# Show each ingredient before assembling the frame
print(f"dates:\n{dates}\n")
print(f"data:\n{data}\n")
print(f"columns:\n{columns}\n")

df = pd.DataFrame(data, index=dates, columns=columns)

df

In [None]:
# Build a DataFrame from a dictionary: every key becomes a column.
# Scalar entries (A, B, F) are repeated to match the 4-element columns.

df2 = pd.DataFrame(
    {
        # float scalar
        "A": 1.0,
        # timestamp scalar
        "B": pd.Timestamp("20200101"),
        # float32 Series indexed 0..3
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        # int32 array filled with 3
        "D": np.full(4, 3, dtype="int32"),
        # categorical column alternating between two labels
        "E": pd.Categorical(["test", "train"] * 2),
        # string scalar
        "F": "foo",
    }
)

df2

#### Viewing data

In [None]:
# Show the first 3 rows
df.head(3)

In [None]:
# Show the last 3 rows
df.tail(3)

In [None]:
# Row labels of the DataFrame
df.index

In [None]:
# Column labels of the DataFrame
df.columns

In [None]:
# Underlying data as a NumPy array.
# The pandas docs recommend to_numpy() over the older `df.values` attribute.
df.to_numpy()

In [None]:
# Print a concise summary of the DataFrame (index, columns, dtypes, non-null counts)
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

#### Operations

In [None]:
# Mean of each column (no axis given, so the default axis is used)
df.mean()

In [None]:
df.mean(axis='index') # same as axis=0: averages down the rows, one value per column

In [None]:
df.mean(axis='columns') # same as axis=1: averages across the columns, one value per row

In [None]:
# Three ways to store the row-wise sum of columns A-D in a "SUM" column.
# Each step overwrites the previous result so the printed frames can be compared.

# 1) vectorization (preferred): one call handles all rows at once
df["SUM"] = df[['A', 'B', 'C', 'D']].sum(axis='columns')
print(df)

# 2) using apply: calls the Python lambda once per row
df["SUM"] = df[['A', 'B', 'C', 'D']].apply(lambda row: sum(row), axis='columns')
print(df)

# 3) using iterrows (to be avoided: slow, explicit Python loop)
# iterrows() yields a separate Series for each row, so assigning to `row`
# would never reach `df`; write the result back into `df` by row label instead.
for i, row in df[['A', 'B', 'C', 'D']].iterrows():
    df.at[i, "SUM"] = sum(row)
print(df)



#### Slicing DataFrame

In [None]:
# Select a single column by name
df["A"]

In [None]:
# Label-based selection: all rows (":"), columns A and B
df.loc[:, ["A", "B"]]

In [None]:
# Slice rows by date-string labels and pick columns A and B
df.loc["20200101":"20200102", ["A", "B"]] # both endpoints are included

In [None]:
# A single row label with a list of columns drops one dimension
s = df.loc["20200101", ["A", "B"]] # reduction in the dimensions of the returned object: df --> series
print(type(s))
print(s)

In [None]:
# A single row label with a single column label drops both dimensions
v = df.loc["20200101", "A"] # reduction in the dimensions of the returned object: df --> cell value
print(type(v))
print(v)

In [None]:
# Boolean mask: keep only the rows where column A is positive
df[df["A"] > 0]

In [None]:
# Keep the rows where column A lies between 0 and 1
df[df["A"].between(0,1)]

In [None]:
# Two dates spaced two days apart: 2020-01-01 and 2020-01-03
filter_index = pd.date_range("20200101", periods=2, freq='2D')
print(filter_index)

# Keep only the rows whose index label appears in filter_index
df[df.index.isin(filter_index)]

#### Setting values

In [None]:
# Assigning a scalar to a column sets that value on every row
df['E'] = 1

df

In [None]:
# .at addresses a single cell by row label and column label
df.at["2020-01-01", "E"] = 0

df

In [None]:
# Conditional assignment: set E to 1000 on the rows where A is positive
df.loc[df['A']>0, 'E'] = 1000

df

#### Missing data

In [None]:
# Small frame with missing values (NaN) to experiment on:
#   col1 - mixed strings/number with one NaN
#   col2 - numbers with two NaN
#   col3 - nothing but NaN
df = pd.DataFrame(
    {
        "col1": ["a", "b", np.nan, 3],
        "col2": [3, np.nan, np.nan, 5],
        "col3": [np.nan] * 4,
    }
)

df

In [None]:
# The summary printed by info() includes the non-null count of each column
df.info()

In [None]:
# Warning! NaN never compares equal to itself, so this evaluates to False
np.nan == np.nan

In [None]:
# None, by contrast, does compare equal to itself (this evaluates to True)
None == None

In [None]:
# Boolean mask of the same shape as df: True where the value is NaN
pd.isna(df)

In [None]:
# Replace every NaN with the given value; the result is not assigned back to df
df.fillna("AAA")

In [None]:
# Drop rows containing NaN.
# With the frame built in this section col3 is entirely NaN, so every row is dropped.
df.dropna(how="any") # drops rows with at least one NaN

In [None]:
df.dropna(how="all") # drops only the rows in which every value is NaN

#### Merge

In [None]:
# Two small frames whose key columns (A on the left, C on the right)
# overlap only on the values 3 and 4.
df_left = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
print(f'df_left:\n{df_left}\n')

df_right = pd.DataFrame({"C": [3, 4, 5, 6], "D": [8] * 4})
print(f'df_right:\n{df_right}\n')

# Print the result of every join type so they can be compared side by side
for merge_mode in ('inner', 'left', 'right', 'outer'):
    print(f'merge {merge_mode}:')
    merged = pd.merge(df_left, df_right, left_on=['A'], right_on=['C'], how=merge_mode)
    print(merged)
    print()

#### Grouping

In [None]:
# Random table used by the grouping examples: a category code,
# a quality label and an integer value for each of n_rows rows.
n_rows = 6

df = pd.DataFrame(
    {
        "Code": np.random.choice(["C1", "C2", "C3"], size=n_rows),
        "Type": np.random.choice(["good", "bad"], size=n_rows),
        # integers from 1 to 9 (the upper bound 10 is exclusive)
        "Value": np.random.randint(1, 10, size=n_rows),
    }
)

df

In [None]:
# Sum within each (Code, Type) combination
df.groupby(by=["Code", "Type"]).sum()

In [None]:
# Sum within each Code.
# NOTE(review): the string column Type is aggregated as well; how sum() treats it
# may depend on the pandas version - confirm.
df.groupby(by=["Code"]).sum()

In [None]:
# A different aggregation per column: join the Type strings with spaces, sum the Values
df.groupby(by=["Code"]).agg({'Type': ' '.join, 'Value': 'sum'})

#### Plotting

https://pandas.pydata.org/docs/user_guide/visualization.html

In [None]:
import matplotlib.pyplot as plt

In [None]:
# 1000 rows x 4 columns (A-D) of random values, indexed by dates starting 1/1/2000
df = pd.DataFrame(np.random.randn(1000, 4), index=pd.date_range("1/1/2000", periods=1000), columns=list("ABCD"))

In [None]:
# Replace each column with its running (cumulative) sum
df = df.cumsum()

In [None]:
# Plot using the pandas plotting API; the trailing ";" is presumably there to
# suppress the notebook's echo of the returned object
df.plot();

In [None]:
# Same kind of chart drawn directly with matplotlib: one labelled line per column
for col, series in df.items():
    plt.plot(series.index, series, label=f'{col}')

# Let matplotlib format the date tick labels on the x axis
plt.gcf().autofmt_xdate()
plt.legend()
plt.show()

In [None]:
# 3 labelled rows x 4 columns of random values, drawn as stacked vertical bars (one bar per row label)
df2 = pd.DataFrame(np.random.rand(3, 4), index=['dog', 'cat', 'fish'], columns=["a", "b", "c", "d"])
df2.plot.bar(stacked=True);

In [None]:
# Same data as stacked horizontal bars
df2.plot.barh(stacked=True);

In [None]:
# Three samples of 1000 random draws each, shifted by +1, 0 and -1,
# stored as columns a, b and c (the dict's key order fixes the column order).
df4 = pd.DataFrame(
    {
        name: np.random.randn(1000) + shift
        for name, shift in (("a", 1), ("b", 0), ("c", -1))
    }
)

In [None]:
# Histograms of all columns on one axes; alpha < 1 keeps the overlapping bars visible
df4.plot.hist(alpha=0.6, bins=20, edgecolor='black');

In [None]:
# Cumulative histogram of column a, drawn horizontally
df4["a"].plot.hist(orientation="horizontal", cumulative=True, edgecolor='black');

In [None]:
from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots for the columns of df4, with a KDE curve on the diagonal
scatter_matrix(df4, alpha=0.2, figsize=(6, 6), diagonal="kde");


In [None]:
# Overlay the density-normalised histogram of column a with its KDE curve.
#
# In statistics, kernel density estimation (KDE) is a way
# to estimate the probability density function (PDF) of a random variable.
#
# (This note is a comment rather than a bare string literal: as the last
# expression of the cell, a string would be echoed as cell output.)
df4['a'].plot.hist(density=True, edgecolor='black');
df4['a'].plot.kde(color='orange', lw=5);

#### Getting data in/out
https://pandas.pydata.org/docs/reference/io.html