# Setup

This notebook follows [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html), using pandas 1.4.2.

In [1]:
import numpy as np
import pandas as pd

# Object creation

In [2]:
# A Series built from a plain Python list; np.nan marks the missing entry
s1 = pd.Series(data=[1, 2, 3, np.nan, 5])

In [3]:
# display the Series; the NaN entry forces the integers to be stored as float64
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [4]:
# Index for the dataframe: 10 consecutive calendar days starting 2022-06-19.
# freq='D' (daily) is the default, spelled out here for clarity.
dates = pd.date_range(start='20220619', periods=10, freq='D')
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In case you are wondering, here are [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) that can be used as the frequency of a `DatetimeIndex`.

In [5]:
# Dataframe - pass a NumPy array
# create a 10 row, 4 col array of standard-normal random numbers,
# index it by dates and give the columns names
# a seeded Generator is used so that Restart & Run All always yields the same frame
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((10, 4)),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2022-06-19,-0.670156,0.207205,0.864135,0.632219
2022-06-20,0.140117,-0.125028,-0.037389,1.877511
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347
2022-06-24,1.347373,0.204287,-0.226814,0.144728
2022-06-25,0.605006,0.402891,0.442143,0.96875
2022-06-26,-0.013676,-0.149062,1.530131,1.043508
2022-06-27,0.068748,-1.56131,-0.240403,1.003903
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294


In [6]:
# Build a dataframe from a dictionary: scalar values are broadcast to every row,
# list-like values supply one entry per row
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20220619"),
        "C": pd.Series(1, index=list(range(6)), dtype="float32"),
        "D": np.full(6, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 3),
        "F": ["f", "o", "o", "f", "o", "o"],
        "G": "foo",
    }
)

df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [7]:
# each column keeps its own dtype
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

# Viewing Data

In [8]:
# first rows of the dataframe (5 is head's default row count)
df.head(n=5)

Unnamed: 0,A,B,C,D
2022-06-19,-0.670156,0.207205,0.864135,0.632219
2022-06-20,0.140117,-0.125028,-0.037389,1.877511
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347


In [9]:
# last 3 rows of the dataframe
df.tail(n=3)

Unnamed: 0,A,B,C,D
2022-06-26,-0.013676,-0.149062,1.530131,1.043508
2022-06-27,0.068748,-1.56131,-0.240403,1.003903
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294


In [10]:
# row labels (index) of the df - the DatetimeIndex it was built with
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# column labels of the df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# .to_numpy() gives a NumPy representation of the dataframe
# df holds only float columns, so the result is a plain float array;
# the conversion is expensive when the columns have different data types
df.to_numpy()

array([[-0.67015592,  0.20720529,  0.86413527,  0.63221944],
       [ 0.14011671, -0.12502832, -0.0373886 ,  1.87751083],
       [-0.5785034 , -1.12237241, -0.68684468,  0.35544285],
       [-1.63888367, -0.09360864, -2.03300506, -1.08492148],
       [-0.46983286, -0.4428562 , -1.3378387 ,  0.82734687],
       [ 1.34737282,  0.20428705, -0.22681445,  0.14472787],
       [ 0.60500558,  0.40289077,  0.44214263,  0.96875033],
       [-0.01367608, -0.14906155,  1.53013133,  1.04350758],
       [ 0.06874772, -1.56131032, -0.24040324,  1.00390279],
       [-0.74810268, -0.19295959, -1.43314731, -0.4182937 ]])

In [13]:
# mixed column dtypes: every value ends up in a single dtype=object array
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [14]:
# quick summary statistics, one column of results per dataframe column
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,-0.195791,-0.287281,-0.315903,0.535019
std,0.821433,0.614551,1.097715,0.835418
min,-1.638884,-1.56131,-2.033005,-1.084921
25%,-0.647243,-0.380382,-1.17509,0.197407
50%,-0.241754,-0.137045,-0.233609,0.729783
75%,0.122274,0.129813,0.32226,0.995115
max,1.347373,0.402891,1.530131,1.877511


1. **count** = Count number of non-NA/null observations
1. **max** = Maximum of the values in the object
1. **min** = Minimum of the values in the object
1. **mean** = Mean of the values
1. **std** = Standard deviation of the observations
1. **25%** = Default lower percentile
1. **50%** = 50th percentile - same as the median
1. **75%** = Default upper percentile

In [15]:
# report the 10th, 50th and 90th percentiles instead of the default quartiles
df.describe(percentiles=[0.1, 0.5, 0.9])

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,-0.195791,-0.287281,-0.315903,0.535019
std,0.821433,0.614551,1.097715,0.835418
min,-1.638884,-1.56131,-2.033005,-1.084921
10%,-0.837181,-1.166266,-1.493133,-0.484956
50%,-0.241754,-0.137045,-0.233609,0.729783
90%,0.679242,0.226774,0.930735,1.126908
max,1.347373,0.402891,1.530131,1.877511


In [63]:
# B, E, F and G in df2 are not numeric, so they do not show up in describe()
df2.describe()

Unnamed: 0,A,C,D
count,6.0,6.0,6.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [64]:
# transpose the data
# turn rows into columns and vice versa
# NOTE: on a fresh top-to-bottom run df has only columns A-D at this point; an F row
# shows up only if this cell is executed after the later `df["F"] = s1` cell
df.T

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,-0.670156,0.140117,-0.578503,-1.638884,-0.469833,1.347373,0.605006,-0.013676,0.068748,-0.748103
B,0.207205,-0.125028,-1.122372,-0.093609,-0.442856,0.204287,0.402891,-0.149062,-1.56131,-0.19296
C,0.864135,-0.037389,-0.686845,-2.033005,-1.337839,-0.226814,0.442143,1.530131,-0.240403,-1.433147
D,0.632219,1.877511,0.355443,-1.084921,0.827347,0.144728,0.96875,1.043508,1.003903,-0.418294
F,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0


In [18]:
# sort by the labels along axis 1 (the column names), in descending order
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,0.632219,0.864135,0.207205,-0.670156
2022-06-20,1.877511,-0.037389,-0.125028,0.140117
2022-06-21,0.355443,-0.686845,-1.122372,-0.578503
2022-06-22,-1.084921,-2.033005,-0.093609,-1.638884
2022-06-23,0.827347,-1.337839,-0.442856,-0.469833
2022-06-24,0.144728,-0.226814,0.204287,1.347373
2022-06-25,0.96875,0.442143,0.402891,0.605006
2022-06-26,1.043508,1.530131,-0.149062,-0.013676
2022-06-27,1.003903,-0.240403,-1.56131,0.068748
2022-06-28,-0.418294,-1.433147,-0.19296,-0.748103


In [19]:
# sort by the labels along axis 0 (the row index, here the dates), in descending order
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294
2022-06-27,0.068748,-1.56131,-0.240403,1.003903
2022-06-26,-0.013676,-0.149062,1.530131,1.043508
2022-06-25,0.605006,0.402891,0.442143,0.96875
2022-06-24,1.347373,0.204287,-0.226814,0.144728
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443
2022-06-20,0.140117,-0.125028,-0.037389,1.877511
2022-06-19,-0.670156,0.207205,0.864135,0.632219


In [20]:
# order the rows by the values in column A, smallest first
df.sort_values(by="A", ascending=True)

Unnamed: 0,A,B,C,D
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294
2022-06-19,-0.670156,0.207205,0.864135,0.632219
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347
2022-06-26,-0.013676,-0.149062,1.530131,1.043508
2022-06-27,0.068748,-1.56131,-0.240403,1.003903
2022-06-20,0.140117,-0.125028,-0.037389,1.877511
2022-06-25,0.605006,0.402891,0.442143,0.96875
2022-06-24,1.347373,0.204287,-0.226814,0.144728


In [21]:
# sort by non-numerical (string) values, in descending order
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo


In [22]:
# sort by two or more columns: first by F, ties are then ordered by E
df2.sort_values(by=["F", "E"])

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


# Selection

For production code, prefer the optimized pandas accessors below over plain Python-style access such as `df["col"]` or `df[a:b]` slices:

```.at, .iat, .loc and .iloc.```

## Getting

In [23]:
# selecting a single column by its label returns a Series object
df["A"]

2022-06-19   -0.670156
2022-06-20    0.140117
2022-06-21   -0.578503
2022-06-22   -1.638884
2022-06-23   -0.469833
2022-06-24    1.347373
2022-06-25    0.605006
2022-06-26   -0.013676
2022-06-27    0.068748
2022-06-28   -0.748103
Freq: D, Name: A, dtype: float64

In [24]:
# slicing with [] selects rows by position: rows 1 through 4 (the end is exclusive)
df[1:5]

Unnamed: 0,A,B,C,D
2022-06-20,0.140117,-0.125028,-0.037389,1.877511
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347


## Selection by label

In [25]:
# select the row whose index label is the first date; the result is a Series
df.loc[dates[0]]

A   -0.670156
B    0.207205
C    0.864135
D    0.632219
Name: 2022-06-19 00:00:00, dtype: float64

In [26]:
# select on both axes by label at once:
# ":" keeps every row label, ["A", "B"] picks the two column labels
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2022-06-19,-0.670156,0.207205
2022-06-20,0.140117,-0.125028
2022-06-21,-0.578503,-1.122372
2022-06-22,-1.638884,-0.093609
2022-06-23,-0.469833,-0.442856
2022-06-24,1.347373,0.204287
2022-06-25,0.605006,0.402891
2022-06-26,-0.013676,-0.149062
2022-06-27,0.068748,-1.56131
2022-06-28,-0.748103,-0.19296


In [27]:
# a single index label reduces the dimensions: the 2-D frame becomes a 1-D Series
res = df.loc["2022-06-20"]
print(res)
print("res.shape = ", res.shape, " vs. df.shape = ", df.shape)

A    0.140117
B   -0.125028
C   -0.037389
D    1.877511
Name: 2022-06-20 00:00:00, dtype: float64
res.shape =  (4,)  vs. df.shape =  (10, 4)


In [28]:
# get to a specific scalar:
#
# method one: .loc with a row label and a column label
df.loc[dates[0], "A"]

-0.6701559150514164

In [29]:
#
# method two: .at, the scalar-only label accessor (slightly faster than method one)
df.at[dates[0], "A"]

-0.6701559150514164

## Selection by position

In [30]:
# select the row at integer position 2 (the third row); the result is a Series
df.iloc[2]

A   -0.578503
B   -1.122372
C   -0.686845
D    0.355443
Name: 2022-06-21 00:00:00, dtype: float64

In [31]:
# slices - similar to NumPy / Python - [row_slice, col_slice], end positions are exclusive
df.iloc[1:5, 0:2]

Unnamed: 0,A,B
2022-06-20,0.140117,-0.125028
2022-06-21,-0.578503,-1.122372
2022-06-22,-1.638884,-0.093609
2022-06-23,-0.469833,-0.442856


In [32]:
# by lists of integer positions - [[list of rows], [list of cols]]
df.iloc[[0, 1, 2, 6], [0, 2]]

Unnamed: 0,A,C
2022-06-19,-0.670156,0.864135
2022-06-20,0.140117,-0.037389
2022-06-21,-0.578503,-0.686845
2022-06-25,0.605006,0.442143


In [33]:
# by lists of integer positions - [[list of rows], [list of cols]]
# the column list can change the order of columns and repeat a column
df.iloc[[0, 1, 2, 6], [2, 1, 0, 2]]

Unnamed: 0,C,B,A,C.1
2022-06-19,0.864135,0.207205,-0.670156,0.864135
2022-06-20,-0.037389,-0.125028,0.140117,-0.037389
2022-06-21,-0.686845,-1.122372,-0.578503,-0.686845
2022-06-25,0.442143,0.402891,0.605006,0.442143


In [34]:
# pick rows explicitly by position, keep all columns
df.iloc[[1, 2], :]

Unnamed: 0,A,B,C,D
2022-06-20,0.140117,-0.125028,-0.037389,1.877511
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443


In [35]:
# pick columns by position, keep all rows
df.iloc[:, [2, 3]]

Unnamed: 0,C,D
2022-06-19,0.864135,0.632219
2022-06-20,-0.037389,1.877511
2022-06-21,-0.686845,0.355443
2022-06-22,-2.033005,-1.084921
2022-06-23,-1.337839,0.827347
2022-06-24,-0.226814,0.144728
2022-06-25,0.442143,0.96875
2022-06-26,1.530131,1.043508
2022-06-27,-0.240403,1.003903
2022-06-28,-1.433147,-0.418294


In [36]:
# everything: a full slice on both axes, because you can
df.iloc[:, :]

Unnamed: 0,A,B,C,D
2022-06-19,-0.670156,0.207205,0.864135,0.632219
2022-06-20,0.140117,-0.125028,-0.037389,1.877511
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347
2022-06-24,1.347373,0.204287,-0.226814,0.144728
2022-06-25,0.605006,0.402891,0.442143,0.96875
2022-06-26,-0.013676,-0.149062,1.530131,1.043508
2022-06-27,0.068748,-1.56131,-0.240403,1.003903
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294


In [37]:
# get to a scalar by position (2 methods, just like with labels)
#
# method one: use iloc
df.iloc[1, 2]

-0.03738859765479214

In [38]:
#
# method two: use iat, the scalar-only positional accessor
df.iat[1, 2]

-0.03738859765479214

## Boolean Indexing

In [39]:
# keep only the rows whose value in column A is positive
df.loc[df["A"] > 0]

Unnamed: 0,A,B,C,D
2022-06-20,0.140117,-0.125028,-0.037389,1.877511
2022-06-24,1.347373,0.204287,-0.226814,0.144728
2022-06-25,0.605006,0.402891,0.442143,0.96875
2022-06-27,0.068748,-1.56131,-0.240403,1.003903


In [40]:
# boolean mask across the entire DF - values that fail the condition become NaN
df[df > 0]

Unnamed: 0,A,B,C,D
2022-06-19,,0.207205,0.864135,0.632219
2022-06-20,0.140117,,,1.877511
2022-06-21,,,,0.355443
2022-06-22,,,,
2022-06-23,,,,0.827347
2022-06-24,1.347373,0.204287,,0.144728
2022-06-25,0.605006,0.402891,0.442143,0.96875
2022-06-26,,,1.530131,1.043508
2022-06-27,0.068748,,,1.003903
2022-06-28,,,,


In [41]:
# derive a new frame with an extra column E; df itself is left untouched
df11 = df.assign(
    E=["one", "two", "three", "four", "two",
       "five", "one", "two", "three", "four"]
)
df11

Unnamed: 0,A,B,C,D,E
2022-06-19,-0.670156,0.207205,0.864135,0.632219,one
2022-06-20,0.140117,-0.125028,-0.037389,1.877511,two
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,three
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,four
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347,two
2022-06-24,1.347373,0.204287,-0.226814,0.144728,five
2022-06-25,0.605006,0.402891,0.442143,0.96875,one
2022-06-26,-0.013676,-0.149062,1.530131,1.043508,two
2022-06-27,0.068748,-1.56131,-0.240403,1.003903,three
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294,four


In [42]:
# the isin() filter - basically SQL's IN clause: keep rows whose E value is in the list
df11[df11["E"].isin(["two", "five"])]

Unnamed: 0,A,B,C,D,E
2022-06-20,0.140117,-0.125028,-0.037389,1.877511,two
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347,two
2022-06-24,1.347373,0.204287,-0.226814,0.144728,five
2022-06-26,-0.013676,-0.149062,1.530131,1.043508,two


## Setting values

In [43]:
# a Series of 11..20 carrying its own 10-day date index;
# when assigned to a dataframe, values are aligned on matching index labels
s1 = pd.Series(
    data=range(11, 21),
    index=pd.date_range(start='20220619', periods=10),
)
s1

2022-06-19    11
2022-06-20    12
2022-06-21    13
2022-06-22    14
2022-06-23    15
2022-06-24    16
2022-06-25    17
2022-06-26    18
2022-06-27    19
2022-06-28    20
Freq: D, dtype: int64

In [44]:
# add s1 as a new column F; the values are matched to rows by index label
# NOTE: this modifies df in place, so re-running earlier cells afterwards shows the extra column
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2022-06-19,-0.670156,0.207205,0.864135,0.632219,11
2022-06-20,0.140117,-0.125028,-0.037389,1.877511,12
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347,15
2022-06-24,1.347373,0.204287,-0.226814,0.144728,16
2022-06-25,0.605006,0.402891,0.442143,0.96875,17
2022-06-26,-0.013676,-0.149062,1.530131,1.043508,18
2022-06-27,0.068748,-1.56131,-0.240403,1.003903,19
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294,20


In [45]:
# setting values by label and position
# first work on a copy so that df itself stays unchanged
df12 = df.copy()
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,-0.670156,0.207205,0.864135,0.632219,11
2022-06-20,0.140117,-0.125028,-0.037389,1.877511,12
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347,15
2022-06-24,1.347373,0.204287,-0.226814,0.144728,16
2022-06-25,0.605006,0.402891,0.442143,0.96875,17
2022-06-26,-0.013676,-0.149062,1.530131,1.043508,18
2022-06-27,0.068748,-1.56131,-0.240403,1.003903,19
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294,20


In [46]:
# set a single value by label (row dates[0], column A)
df12.at[dates[0], "A"] = 0
# set a single value by position (row 0, column 1)
df12.iat[0, 1] = 0
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,0.864135,0.632219,11
2022-06-20,0.140117,-0.125028,-0.037389,1.877511,12
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14
2022-06-23,-0.469833,-0.442856,-1.337839,0.827347,15
2022-06-24,1.347373,0.204287,-0.226814,0.144728,16
2022-06-25,0.605006,0.402891,0.442143,0.96875,17
2022-06-26,-0.013676,-0.149062,1.530131,1.043508,18
2022-06-27,0.068748,-1.56131,-0.240403,1.003903,19
2022-06-28,-0.748103,-0.19296,-1.433147,-0.418294,20


In [47]:
# replace an entire column with a constant array
# the array is sized from df12 itself (the frame being modified), not from df,
# so the assignment stays valid even if the two frames ever differ in length
df12.loc[:, "D"] = np.array([5] * len(df12))
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,0.864135,5,11
2022-06-20,0.140117,-0.125028,-0.037389,5,12
2022-06-21,-0.578503,-1.122372,-0.686845,5,13
2022-06-22,-1.638884,-0.093609,-2.033005,5,14
2022-06-23,-0.469833,-0.442856,-1.337839,5,15
2022-06-24,1.347373,0.204287,-0.226814,5,16
2022-06-25,0.605006,0.402891,0.442143,5,17
2022-06-26,-0.013676,-0.149062,1.530131,5,18
2022-06-27,0.068748,-1.56131,-0.240403,5,19
2022-06-28,-0.748103,-0.19296,-1.433147,5,20


In [48]:
# setting values using a boolean selection (aka where clause):
# every positive entry is replaced by its negation, so no value stays positive
df12[df12 > 0] = -df12
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.864135,-5,-11
2022-06-20,-0.140117,-0.125028,-0.037389,-5,-12
2022-06-21,-0.578503,-1.122372,-0.686845,-5,-13
2022-06-22,-1.638884,-0.093609,-2.033005,-5,-14
2022-06-23,-0.469833,-0.442856,-1.337839,-5,-15
2022-06-24,-1.347373,-0.204287,-0.226814,-5,-16
2022-06-25,-0.605006,-0.402891,-0.442143,-5,-17
2022-06-26,-0.013676,-0.149062,-1.530131,-5,-18
2022-06-27,-0.068748,-1.56131,-0.240403,-5,-19
2022-06-28,-0.748103,-0.19296,-1.433147,-5,-20


# Missing Data

### _reindex_
Change, add, or delete index labels on a specified axis; returns a new DataFrame.

In [49]:
# keep only the first four dates and append a new column G (all NaN after reindexing)
df13 = df.reindex(index=dates[:4], columns=[*df.columns, "G"])
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.670156,0.207205,0.864135,0.632219,11,
2022-06-20,0.140117,-0.125028,-0.037389,1.877511,12,
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13,
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14,


### handling missing data
1. _```np.nan```_
1. _```pandas.isna()```_
1. ```df.dropna()```
1. ```df.fillna()```

In [50]:
# missing data in pandas is represented by np.nan
df13.iat[1, 0] = np.nan
# fill column position 5 from the second row onwards, leaving the first row NaN.
# np.random.randint(1) draws from [0, 1) and therefore always returned 0,
# so the constant is assigned directly to make the intent explicit
df13.iloc[1:, 5] = 0
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.670156,0.207205,0.864135,0.632219,11,
2022-06-20,,-0.125028,-0.037389,1.877511,12,0.0
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13,0.0
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14,0.0


In [51]:
# boolean mask: True wherever df13 holds NaN
df131 = df13.isna()
df131

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [52]:
# or just display the mask without storing it
pd.isna(df13)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [53]:
# the original is unchanged: pd.isna() returned a new dataframe
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.670156,0.207205,0.864135,0.632219,11,
2022-06-20,,-0.125028,-0.037389,1.877511,12,0.0
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13,0.0
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14,0.0


In [54]:
# we are going to drop / replace values now, so work on a copy of the dataframe
df132 = df13.copy()
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.670156,0.207205,0.864135,0.632219,11,
2022-06-20,,-0.125028,-0.037389,1.877511,12,0.0
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13,0.0
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14,0.0


In [55]:
# drop rows that have missing data (columns can be dropped instead via the axis argument)
# by default dropna returns a new dataframe; inplace=True modifies the current dataframe instead
df_no_na = df132.dropna(how="any")
# how='any' : If any NA values are present, drop that row or column.
# how='all' : If all values are NA, drop that row or column.

In [56]:
# every row containing missing data has been stripped (columns are all kept)
df_no_na

Unnamed: 0,A,B,C,D,F,G
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13,0.0
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14,0.0


In [57]:
# original still intact, because dropna() returned a new dataframe
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.670156,0.207205,0.864135,0.632219,11,
2022-06-20,,-0.125028,-0.037389,1.877511,12,0.0
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13,0.0
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14,0.0


In [58]:
# drop the rows with missing data from df132 itself (inplace=True modifies the frame)
# NOTE: after this cell df132 no longer matches the output of the earlier cells that displayed it
df132.dropna(how="any", inplace=True)
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13,0.0
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14,0.0


In [59]:
# fill missing data with a recognisable sentinel value (1000 * pi)
# the result of fillna is not assigned, so the filled frame is only displayed
df133 = df13.copy()
df133.fillna(np.pi*1000)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,-0.670156,0.207205,0.864135,0.632219,11,3141.592654
2022-06-20,3141.592654,-0.125028,-0.037389,1.877511,12,0.0
2022-06-21,-0.578503,-1.122372,-0.686845,0.355443,13,0.0
2022-06-22,-1.638884,-0.093609,-2.033005,-1.084921,14,0.0


# Operations on data

## Stats

Operations in general exclude missing data

In [60]:
# arithmetic mean of each column (reduce down the rows, axis=0)
df.mean(axis=0)

A    -0.195791
B    -0.287281
C    -0.315903
D     0.535019
F    15.500000
dtype: float64

In [61]:
# arithmetic mean of each row (reduce across the columns, axis=1)
df.mean(axis=1)

2022-06-19    2.406681
2022-06-20    2.771042
2022-06-21    2.193544
2022-06-22    1.829916
2022-06-23    2.715364
2022-06-24    3.493915
2022-06-25    3.883758
2022-06-26    4.082180
2022-06-27    3.654187
2022-06-28    3.441499
Freq: D, dtype: float64