# Setup

This is following [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html) for pandas 1.4.2

In [1]:
import numpy as np
import pandas as pd

# Object creation

In [2]:
# Series - pass a list to pd.Series
s1 = pd.Series([1, 2, 3, np.nan, 5])

In [3]:
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [4]:
# create an index to use in the dataframe
# default frequency is D (day), so creates a DatetimeIndex of 10 days
dates = pd.date_range('20220619', periods=10)
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

just in case you are wondering, here's [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) for Datetime

In [5]:
# Dataframe - pass a Numpy array
# create a 10 row, 4 col random number array, index by dates, give some column names
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2022-06-19,0.025674,0.490985,-0.14758,-1.859907
2022-06-20,0.885626,-0.754184,0.635072,0.652013
2022-06-21,-0.666658,1.091134,-0.765174,0.087058
2022-06-22,1.464172,0.481291,0.119247,-0.295494
2022-06-23,0.37952,0.684078,1.924192,-0.661438
2022-06-24,-0.480685,1.66622,-0.771596,0.56963
2022-06-25,0.841982,1.272869,0.041671,-0.069634
2022-06-26,0.501878,-0.064595,0.912903,-0.496592
2022-06-27,1.796318,-1.392925,1.04685,1.251253
2022-06-28,0.660456,0.173437,-0.884594,-0.211071


In [6]:
# Create a dataframe by passing a dictionary of objects
# where each object can be converted into a series-like structure
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20220619"),
        "C": pd.Series(1, index=list(range(6)), dtype="float32"),
        "D": np.array([3]*6, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train", "test", "train"]),
        "F": list("foofoo"),
        "G": "foo"
    }
)

df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [7]:
# the datatype of each of the columns would be different
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

# Viewing Data

In [8]:
# top of the dataframe
df.head()

Unnamed: 0,A,B,C,D
2022-06-19,0.025674,0.490985,-0.14758,-1.859907
2022-06-20,0.885626,-0.754184,0.635072,0.652013
2022-06-21,-0.666658,1.091134,-0.765174,0.087058
2022-06-22,1.464172,0.481291,0.119247,-0.295494
2022-06-23,0.37952,0.684078,1.924192,-0.661438


In [9]:
# bottom 3 records of the dataframe
df.tail(3)

Unnamed: 0,A,B,C,D
2022-06-26,0.501878,-0.064595,0.912903,-0.496592
2022-06-27,1.796318,-1.392925,1.04685,1.251253
2022-06-28,0.660456,0.173437,-0.884594,-0.211071


In [10]:
# index of the df
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# columns of the df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# .to_numpy gives a NumPy representation of the dataframe
# this is expensive if all columns are of different data type
df.to_numpy()

array([[ 0.02567374,  0.49098474, -0.14757985, -1.85990692],
       [ 0.88562627, -0.75418425,  0.63507186,  0.65201279],
       [-0.66665847,  1.09113427, -0.7651741 ,  0.08705828],
       [ 1.46417241,  0.48129063,  0.11924674, -0.29549383],
       [ 0.37952042,  0.68407798,  1.92419197, -0.66143843],
       [-0.48068513,  1.66622013, -0.77159552,  0.56962988],
       [ 0.84198228,  1.27286929,  0.04167144, -0.06963396],
       [ 0.50187841, -0.06459465,  0.91290347, -0.49659206],
       [ 1.79631762, -1.39292508,  1.04684995,  1.25125289],
       [ 0.66045601,  0.1734374 , -0.88459415, -0.21107121]])

In [13]:
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [14]:
# quick summary stats
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.540828,0.364831,0.211099,-0.103418
std,0.777971,0.927362,0.918367,0.849571
min,-0.666658,-1.392925,-0.884594,-1.859907
25%,0.114135,-0.005087,-0.610776,-0.446318
50%,0.581167,0.486138,0.080459,-0.140353
75%,0.874715,0.98937,0.843446,0.448987
max,1.796318,1.66622,1.924192,1.251253


1. **count** = Count number of non-NA/null observations
1. **max** = Maximum of the values in the object
1. **min** = Minimum of the values in the object
1. **mean** = Mean of the values
1. **std** = Standard deviation of the observations
1. **25%** = Default lower percentile
1. **50%** = 50 percentile - same as the median 
1. **75%** = Default upper percentile

In [15]:
# change the percentiles
df.describe(percentiles=[.1, .5, .9])

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.540828,0.364831,0.211099,-0.103418
std,0.777971,0.927362,0.918367,0.849571
min,-0.666658,-1.392925,-0.884594,-1.859907
10%,-0.499282,-0.818058,-0.782895,-0.781285
50%,0.581167,0.486138,0.080459,-0.140353
90%,1.497387,1.312204,1.134584,0.711937
max,1.796318,1.66622,1.924192,1.251253


In [16]:
# E, F, G in d2 are not numeric, so do not come up in describe()
df2.describe()

Unnamed: 0,A,C,D
count,6.0,6.0,6.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [17]:
# transpose the data
# turn rows to columns and vice versa
df.T

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,0.025674,0.885626,-0.666658,1.464172,0.37952,-0.480685,0.841982,0.501878,1.796318,0.660456
B,0.490985,-0.754184,1.091134,0.481291,0.684078,1.66622,1.272869,-0.064595,-1.392925,0.173437
C,-0.14758,0.635072,-0.765174,0.119247,1.924192,-0.771596,0.041671,0.912903,1.04685,-0.884594
D,-1.859907,0.652013,0.087058,-0.295494,-0.661438,0.56963,-0.069634,-0.496592,1.251253,-0.211071


In [18]:
# sort along the axis - 1 = horizontal
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,-1.859907,-0.14758,0.490985,0.025674
2022-06-20,0.652013,0.635072,-0.754184,0.885626
2022-06-21,0.087058,-0.765174,1.091134,-0.666658
2022-06-22,-0.295494,0.119247,0.481291,1.464172
2022-06-23,-0.661438,1.924192,0.684078,0.37952
2022-06-24,0.56963,-0.771596,1.66622,-0.480685
2022-06-25,-0.069634,0.041671,1.272869,0.841982
2022-06-26,-0.496592,0.912903,-0.064595,0.501878
2022-06-27,1.251253,1.04685,-1.392925,1.796318
2022-06-28,-0.211071,-0.884594,0.173437,0.660456


In [19]:
# sort along the axis - 0 = vertical
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,0.660456,0.173437,-0.884594,-0.211071
2022-06-27,1.796318,-1.392925,1.04685,1.251253
2022-06-26,0.501878,-0.064595,0.912903,-0.496592
2022-06-25,0.841982,1.272869,0.041671,-0.069634
2022-06-24,-0.480685,1.66622,-0.771596,0.56963
2022-06-23,0.37952,0.684078,1.924192,-0.661438
2022-06-22,1.464172,0.481291,0.119247,-0.295494
2022-06-21,-0.666658,1.091134,-0.765174,0.087058
2022-06-20,0.885626,-0.754184,0.635072,0.652013
2022-06-19,0.025674,0.490985,-0.14758,-1.859907


In [20]:
# sort ascending by values in a column
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2022-06-21,-0.666658,1.091134,-0.765174,0.087058
2022-06-24,-0.480685,1.66622,-0.771596,0.56963
2022-06-19,0.025674,0.490985,-0.14758,-1.859907
2022-06-23,0.37952,0.684078,1.924192,-0.661438
2022-06-26,0.501878,-0.064595,0.912903,-0.496592
2022-06-28,0.660456,0.173437,-0.884594,-0.211071
2022-06-25,0.841982,1.272869,0.041671,-0.069634
2022-06-20,0.885626,-0.754184,0.635072,0.652013
2022-06-22,1.464172,0.481291,0.119247,-0.295494
2022-06-27,1.796318,-1.392925,1.04685,1.251253


In [21]:
# sort by non-numerical values
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo


In [22]:
# sort by two or more columns
df2.sort_values(by=["F", "E"])

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


# Selection

For production prefer the following instead of other data access methods (typical python methods like ["col"] or [a:b] slices etc.):

```.at, .iat, .loc and .iloc.```

## Getting

In [23]:
# selecting a single column returns a Series object
df["A"]

2022-06-19    0.025674
2022-06-20    0.885626
2022-06-21   -0.666658
2022-06-22    1.464172
2022-06-23    0.379520
2022-06-24   -0.480685
2022-06-25    0.841982
2022-06-26    0.501878
2022-06-27    1.796318
2022-06-28    0.660456
Freq: D, Name: A, dtype: float64

In [24]:
# selecting a slice
df[1:5]

Unnamed: 0,A,B,C,D
2022-06-20,0.885626,-0.754184,0.635072,0.652013
2022-06-21,-0.666658,1.091134,-0.765174,0.087058
2022-06-22,1.464172,0.481291,0.119247,-0.295494
2022-06-23,0.37952,0.684078,1.924192,-0.661438


## Selection by label

In [25]:
# selecting based on a label
df.loc[dates[0]]

A    0.025674
B    0.490985
C   -0.147580
D   -1.859907
Name: 2022-06-19 00:00:00, dtype: float64

In [26]:
# select on a multi-axis by lable
# I honestly do not know what the comment above means...help!
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2022-06-19,0.025674,0.490985
2022-06-20,0.885626,-0.754184
2022-06-21,-0.666658,1.091134
2022-06-22,1.464172,0.481291
2022-06-23,0.37952,0.684078
2022-06-24,-0.480685,1.66622
2022-06-25,0.841982,1.272869
2022-06-26,0.501878,-0.064595
2022-06-27,1.796318,-1.392925
2022-06-28,0.660456,0.173437


In [27]:
# specific index value results in reduction of dimensions
res = df.loc["2022-06-20"]
print(res)
print("res.shape = ", res.shape, " vs. df.shape = ", df.shape)

A    0.885626
B   -0.754184
C    0.635072
D    0.652013
Name: 2022-06-20 00:00:00, dtype: float64
res.shape =  (4,)  vs. df.shape =  (10, 4)


In [28]:
# get to a specific scalar:
#
# method one
df.loc[dates[0], "A"]

0.02567374008451191

In [29]:
#
# method two (slightly faster than method one)
df.at[dates[0], "A"]

0.02567374008451191

## Selection by position

In [30]:
df.iloc[2]

A   -0.666658
B    1.091134
C   -0.765174
D    0.087058
Name: 2022-06-21 00:00:00, dtype: float64

In [31]:
# slices - similar to NumPy / Python - [row:slice, col:slice]
df.iloc[1:5, 0:2]

Unnamed: 0,A,B
2022-06-20,0.885626,-0.754184
2022-06-21,-0.666658,1.091134
2022-06-22,1.464172,0.481291
2022-06-23,0.37952,0.684078


In [32]:
# by list of locations - similar to NumPy / Python - [[list of rows], [list of cols]]
df.iloc[[0, 1, 2, 6], [0, 2]]

Unnamed: 0,A,C
2022-06-19,0.025674,-0.14758
2022-06-20,0.885626,0.635072
2022-06-21,-0.666658,-0.765174
2022-06-25,0.841982,0.041671


In [33]:
# by list of locations - similar to NumPy / Python - [[list of rows], [list of cols]]
# change the order of columns, repeact a column
df.iloc[[0, 1, 2, 6], [2, 1, 0, 2]]

Unnamed: 0,C,B,A,C.1
2022-06-19,-0.14758,0.490985,0.025674,-0.14758
2022-06-20,0.635072,-0.754184,0.885626,0.635072
2022-06-21,-0.765174,1.091134,-0.666658,-0.765174
2022-06-25,0.041671,1.272869,0.841982,0.041671


In [34]:
# slice rows explicitly, keep all columns
df.iloc[[1, 2], :]

Unnamed: 0,A,B,C,D
2022-06-20,0.885626,-0.754184,0.635072,0.652013
2022-06-21,-0.666658,1.091134,-0.765174,0.087058


In [35]:
# slice columns, keep all rows
df.iloc[:, [2, 3]]

Unnamed: 0,C,D
2022-06-19,-0.14758,-1.859907
2022-06-20,0.635072,0.652013
2022-06-21,-0.765174,0.087058
2022-06-22,0.119247,-0.295494
2022-06-23,1.924192,-0.661438
2022-06-24,-0.771596,0.56963
2022-06-25,0.041671,-0.069634
2022-06-26,0.912903,-0.496592
2022-06-27,1.04685,1.251253
2022-06-28,-0.884594,-0.211071


In [36]:
# everything, because you can
df.iloc[:, :]

Unnamed: 0,A,B,C,D
2022-06-19,0.025674,0.490985,-0.14758,-1.859907
2022-06-20,0.885626,-0.754184,0.635072,0.652013
2022-06-21,-0.666658,1.091134,-0.765174,0.087058
2022-06-22,1.464172,0.481291,0.119247,-0.295494
2022-06-23,0.37952,0.684078,1.924192,-0.661438
2022-06-24,-0.480685,1.66622,-0.771596,0.56963
2022-06-25,0.841982,1.272869,0.041671,-0.069634
2022-06-26,0.501878,-0.064595,0.912903,-0.496592
2022-06-27,1.796318,-1.392925,1.04685,1.251253
2022-06-28,0.660456,0.173437,-0.884594,-0.211071


In [37]:
# get to a scalar (2 methods, just like before)
#
# method one: use iloc
df.iloc[1, 2]

0.6350718607310173

In [38]:
#
# method two: use iat
df.iat[1, 2]

0.6350718607310173

## Boolean Indexing

In [39]:
# use a value found in a single col to get data
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2022-06-19,0.025674,0.490985,-0.14758,-1.859907
2022-06-20,0.885626,-0.754184,0.635072,0.652013
2022-06-22,1.464172,0.481291,0.119247,-0.295494
2022-06-23,0.37952,0.684078,1.924192,-0.661438
2022-06-25,0.841982,1.272869,0.041671,-0.069634
2022-06-26,0.501878,-0.064595,0.912903,-0.496592
2022-06-27,1.796318,-1.392925,1.04685,1.251253
2022-06-28,0.660456,0.173437,-0.884594,-0.211071


In [40]:
# boolean across the entire DF - vals that don't match go NaN
df[df > 0]

Unnamed: 0,A,B,C,D
2022-06-19,0.025674,0.490985,,
2022-06-20,0.885626,,0.635072,0.652013
2022-06-21,,1.091134,,0.087058
2022-06-22,1.464172,0.481291,0.119247,
2022-06-23,0.37952,0.684078,1.924192,
2022-06-24,,1.66622,,0.56963
2022-06-25,0.841982,1.272869,0.041671,
2022-06-26,0.501878,,0.912903,
2022-06-27,1.796318,,1.04685,1.251253
2022-06-28,0.660456,0.173437,,


In [41]:
# add another column
df11 = df.copy()
df11["E"] = ["one", "two", "three", "four",
             "two", "five", "one", "two", "three", "four"]
df11

Unnamed: 0,A,B,C,D,E
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,one
2022-06-20,0.885626,-0.754184,0.635072,0.652013,two
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,three
2022-06-22,1.464172,0.481291,0.119247,-0.295494,four
2022-06-23,0.37952,0.684078,1.924192,-0.661438,two
2022-06-24,-0.480685,1.66622,-0.771596,0.56963,five
2022-06-25,0.841982,1.272869,0.041671,-0.069634,one
2022-06-26,0.501878,-0.064595,0.912903,-0.496592,two
2022-06-27,1.796318,-1.392925,1.04685,1.251253,three
2022-06-28,0.660456,0.173437,-0.884594,-0.211071,four


In [42]:
# the isin() query - basically the in clause
df11[df11["E"].isin(["two", "five"])]

Unnamed: 0,A,B,C,D,E
2022-06-20,0.885626,-0.754184,0.635072,0.652013,two
2022-06-23,0.37952,0.684078,1.924192,-0.661438,two
2022-06-24,-0.480685,1.66622,-0.771596,0.56963,five
2022-06-26,0.501878,-0.064595,0.912903,-0.496592,two


## Setting values

In [43]:
# matching indexes auto-aligns values
s1 = pd.Series(range(11, 21), index=pd.date_range('20220619', periods=10))
s1

2022-06-19    11
2022-06-20    12
2022-06-21    13
2022-06-22    14
2022-06-23    15
2022-06-24    16
2022-06-25    17
2022-06-26    18
2022-06-27    19
2022-06-28    20
Freq: D, dtype: int64

In [44]:
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,11
2022-06-20,0.885626,-0.754184,0.635072,0.652013,12
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14
2022-06-23,0.37952,0.684078,1.924192,-0.661438,15
2022-06-24,-0.480685,1.66622,-0.771596,0.56963,16
2022-06-25,0.841982,1.272869,0.041671,-0.069634,17
2022-06-26,0.501878,-0.064595,0.912903,-0.496592,18
2022-06-27,1.796318,-1.392925,1.04685,1.251253,19
2022-06-28,0.660456,0.173437,-0.884594,-0.211071,20


In [45]:
# setting values by label and position
# first let's make a quick copy
df12 = df.copy()
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,11
2022-06-20,0.885626,-0.754184,0.635072,0.652013,12
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14
2022-06-23,0.37952,0.684078,1.924192,-0.661438,15
2022-06-24,-0.480685,1.66622,-0.771596,0.56963,16
2022-06-25,0.841982,1.272869,0.041671,-0.069634,17
2022-06-26,0.501878,-0.064595,0.912903,-0.496592,18
2022-06-27,1.796318,-1.392925,1.04685,1.251253,19
2022-06-28,0.660456,0.173437,-0.884594,-0.211071,20


In [46]:
# set by label
df12.at[dates[0], "A"] = 0
# set by position
df12.iat[0, 1] = 0
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.14758,-1.859907,11
2022-06-20,0.885626,-0.754184,0.635072,0.652013,12
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14
2022-06-23,0.37952,0.684078,1.924192,-0.661438,15
2022-06-24,-0.480685,1.66622,-0.771596,0.56963,16
2022-06-25,0.841982,1.272869,0.041671,-0.069634,17
2022-06-26,0.501878,-0.064595,0.912903,-0.496592,18
2022-06-27,1.796318,-1.392925,1.04685,1.251253,19
2022-06-28,0.660456,0.173437,-0.884594,-0.211071,20


In [47]:
# kinda bigger replacement
df12.loc[:, "D"] = np.array([5]*len(df))
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.14758,5,11
2022-06-20,0.885626,-0.754184,0.635072,5,12
2022-06-21,-0.666658,1.091134,-0.765174,5,13
2022-06-22,1.464172,0.481291,0.119247,5,14
2022-06-23,0.37952,0.684078,1.924192,5,15
2022-06-24,-0.480685,1.66622,-0.771596,5,16
2022-06-25,0.841982,1.272869,0.041671,5,17
2022-06-26,0.501878,-0.064595,0.912903,5,18
2022-06-27,1.796318,-1.392925,1.04685,5,19
2022-06-28,0.660456,0.173437,-0.884594,5,20


In [48]:
# setting values using a boolean selection (aka where clause)
df12[df12 > 0] = -df12
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.14758,-5,-11
2022-06-20,-0.885626,-0.754184,-0.635072,-5,-12
2022-06-21,-0.666658,-1.091134,-0.765174,-5,-13
2022-06-22,-1.464172,-0.481291,-0.119247,-5,-14
2022-06-23,-0.37952,-0.684078,-1.924192,-5,-15
2022-06-24,-0.480685,-1.66622,-0.771596,-5,-16
2022-06-25,-0.841982,-1.272869,-0.041671,-5,-17
2022-06-26,-0.501878,-0.064595,-0.912903,-5,-18
2022-06-27,-1.796318,-1.392925,-1.04685,-5,-19
2022-06-28,-0.660456,-0.173437,-0.884594,-5,-20


# Missing Data

### _reindex_
change/add/delete index on a specified axis, returns a new dataframe

In [49]:
df13 = df.reindex(index=dates[0:4], columns=list(df.columns)+["G"])
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,11,
2022-06-20,0.885626,-0.754184,0.635072,0.652013,12,
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13,
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14,


### handling missing data
1. _```np.nan```_
1. _```pandas.isna()```_
1. ```df.dropna()```
1. ```df.fillna()```

In [50]:
# missing data in pandas is np.nan
df13.iat[1, 0] = np.nan
df13.iloc[1:, 5] = np.random.randint(1)
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,11,
2022-06-20,,-0.754184,0.635072,0.652013,12,0.0
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13,0.0
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14,0.0


In [51]:
# get a boolean mask where values are NaN
df131 = pd.isna(df13)
df131

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [52]:
# or just
pd.isna(df13)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [53]:
# the original is still there
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,11,
2022-06-20,,-0.754184,0.635072,0.652013,12,0.0
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13,0.0
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14,0.0


In [54]:
# we are going to drop / replace values now, let's make a couple of copies of the dataframe
df132 = df13.copy()
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,11,
2022-06-20,,-0.754184,0.635072,0.652013,12,0.0
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13,0.0
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14,0.0


In [55]:
# drop rows/columns that have missing data
# by default it returns a new dataframe, you may want to specify inplace=True for modifying current dataframe:
df_no_na = df132.dropna(how="any")
# how=‘any’ : If any NA values are present, drop that row or column.
# how=‘all’ : If all values are NA, drop that row or column.

In [56]:
# all rows/cols with missing data stripped
df_no_na

Unnamed: 0,A,B,C,D,F,G
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13,0.0
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14,0.0


In [57]:
# original still intact
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,11,
2022-06-20,,-0.754184,0.635072,0.652013,12,0.0
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13,0.0
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14,0.0


In [58]:
# drop missing data from original
df132.dropna(how="any", inplace=True)
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13,0.0
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14,0.0


In [59]:
# fill missing data
df133 = df13.copy()
df133.fillna(np.pi*1000)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.025674,0.490985,-0.14758,-1.859907,11,3141.592654
2022-06-20,3141.592654,-0.754184,0.635072,0.652013,12,0.0
2022-06-21,-0.666658,1.091134,-0.765174,0.087058,13,0.0
2022-06-22,1.464172,0.481291,0.119247,-0.295494,14,0.0


# Align two Dataframes

There needs to be a bigger notebook for this topic.
You need to know there's "joins" in Pandas just like in the SQL world, 
like join and left join and right join and inner and outer and all that...

## Joins

In [60]:
# create 2 datasets

# date range indexes
idx1 = pd.date_range('2022-01-01',periods = 10)
# 2022-01-01', '2022-01-02' don't exist in idx2
# '2022-01-11', '2022-01-12' don't exist in idx1
idx2 = pd.date_range('2022-01-03',periods = 10)

In [61]:
# dataframes from indexes
d1 = pd.DataFrame(index = idx1, data = 
    {
        'A': np.random.rand(10),
        'B': np.random.randint(1, high=25, size = 10)
    }
)

d2 = pd.DataFrame(index = idx2, data = 
    {
        'A': np.random.rand(10),
        'B': np.random.randint(1, high=25, size = 10)
    }
)

In [62]:
# we gon need to display both data frames side by side, so...
from IPython.display import display_html 
d1_styler = d1.style.set_table_attributes("style='display:inline'")
d2_styler = d2.style.set_table_attributes("style='display:inline'")
display_html(d1_styler._repr_html_()+d2_styler._repr_html_(), raw=True)

Unnamed: 0,A,B
2022-01-01 00:00:00,0.267573,4
2022-01-02 00:00:00,0.715754,4
2022-01-03 00:00:00,0.207174,23
2022-01-04 00:00:00,0.605577,4
2022-01-05 00:00:00,0.970176,1
2022-01-06 00:00:00,0.093801,18
2022-01-07 00:00:00,0.947103,5
2022-01-08 00:00:00,0.132143,13
2022-01-09 00:00:00,0.280362,16
2022-01-10 00:00:00,0.852558,18

Unnamed: 0,A,B
2022-01-03 00:00:00,0.933942,19
2022-01-04 00:00:00,0.135483,14
2022-01-05 00:00:00,0.796242,23
2022-01-06 00:00:00,0.707483,17
2022-01-07 00:00:00,0.477206,19
2022-01-08 00:00:00,0.780888,8
2022-01-09 00:00:00,0.556234,13
2022-01-10 00:00:00,0.33746,7
2022-01-11 00:00:00,0.056972,1
2022-01-12 00:00:00,0.665787,17


The above trick of rendering 2 dataframes side-by-side was from [this](https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side) stackoverflow question.

Table rendering/styling options is [a bigger discussion](https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html#) to be tackled later.

In [63]:
# the + operation is UNION of indexes
d3 = d1+d2
# indexs not present in the other dataframe get a NaN
d3.style.highlight_null(null_color='red')

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,1.141116,42.0
2022-01-04 00:00:00,0.74106,18.0
2022-01-05 00:00:00,1.766418,24.0
2022-01-06 00:00:00,0.801285,35.0
2022-01-07 00:00:00,1.424309,24.0
2022-01-08 00:00:00,0.913031,21.0
2022-01-09 00:00:00,0.836596,29.0
2022-01-10 00:00:00,1.190019,25.0


In [64]:
# do you can obvs remove nan values like before
# the + operation is UNION of indexes
d31 = (d1+d2).dropna()
# indexs not present in the other dataframe get a NaN
d3.style.highlight_null(null_color='red')

Unnamed: 0,A,B
2022-01-01 00:00:00,,
2022-01-02 00:00:00,,
2022-01-03 00:00:00,1.141116,42.0
2022-01-04 00:00:00,0.74106,18.0
2022-01-05 00:00:00,1.766418,24.0
2022-01-06 00:00:00,0.801285,35.0
2022-01-07 00:00:00,1.424309,24.0
2022-01-08 00:00:00,0.913031,21.0
2022-01-09 00:00:00,0.836596,29.0
2022-01-10 00:00:00,1.190019,25.0


# Operations on data

## Stats

Operations in general exclude missing data

In [65]:
# arithmetic mean, for each column (axis = 0)
df.mean()

A     0.540828
B     0.364831
C     0.211099
D    -0.103418
F    15.500000
dtype: float64

In [66]:
# mean across a row (axis = 1)
df.mean(1)

2022-06-19    1.901834
2022-06-20    2.683705
2022-06-21    2.549272
2022-06-22    3.153843
2022-06-23    3.465270
2022-06-24    3.396714
2022-06-25    3.817378
2022-06-26    3.770719
2022-06-27    4.340299
2022-06-28    3.947646
Freq: D, dtype: float64