# Setup

This is following [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html) for pandas 1.4.2

In [1]:
import numpy as np
import pandas as pd

# Object creation

In [2]:
# Series - pass a list to pd.Series
s1 = pd.Series([1, 2, 3, np.nan, 5])

In [3]:
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [4]:
# create an index to use in the dataframe
# default frequency is D (day), so creates a DatetimeIndex of 10 days
dates = pd.date_range('20220619', periods=10)
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

just in case you are wondering, here's [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) for Datetime

In [5]:
# Dataframe - pass a Numpy array
# create a 10 row, 4 col random number array, index by dates, give some column names
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2022-06-19,0.19225,0.517209,-1.52886,-1.079049
2022-06-20,0.389539,-0.812363,1.761802,-0.637107
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741
2022-06-22,0.502105,-0.859073,0.573846,0.031882
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093
2022-06-24,1.110772,0.754143,0.82246,-0.484875
2022-06-25,0.555643,-0.400239,0.137434,0.92336
2022-06-26,-0.170577,0.440875,0.11626,-0.274613
2022-06-27,0.600917,-2.123851,-1.866451,1.003394
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421


In [6]:
# Create a dataframe by passing a dictionary of objects
# where each object can be converted into a series-like structure
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20220619"),
        "C": pd.Series(1, index=list(range(6)), dtype="float32"),
        "D": np.array([3]*6, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train", "test", "train"]),
        "F": list("foofoo"),
        "G": "foo"
    }
)

df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [7]:
# the datatype of each of the columns would be different
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

# Viewing Data

In [8]:
# top of the dataframe
df.head()

Unnamed: 0,A,B,C,D
2022-06-19,0.19225,0.517209,-1.52886,-1.079049
2022-06-20,0.389539,-0.812363,1.761802,-0.637107
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741
2022-06-22,0.502105,-0.859073,0.573846,0.031882
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093


In [9]:
# bottom 3 records of the dataframe
df.tail(3)

Unnamed: 0,A,B,C,D
2022-06-26,-0.170577,0.440875,0.11626,-0.274613
2022-06-27,0.600917,-2.123851,-1.866451,1.003394
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421


In [10]:
# index of the df
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# columns of the df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# .to_numpy gives a NumPy representation of the dataframe
# this is expensive if all columns are of different data type
df.to_numpy()

array([[ 0.19225007,  0.51720943, -1.52885959, -1.0790487 ],
       [ 0.38953922, -0.81236304,  1.76180155, -0.63710746],
       [-1.3439895 ,  0.36173373, -0.22579217, -0.99274139],
       [ 0.5021048 , -0.85907289,  0.5738456 ,  0.0318816 ],
       [ 0.11181575, -0.5624764 , -0.91638601, -0.3660926 ],
       [ 1.1107722 ,  0.75414252,  0.82245997, -0.48487484],
       [ 0.55564318, -0.40023891,  0.1374345 ,  0.92336029],
       [-0.17057699,  0.44087535,  0.11626024, -0.27461263],
       [ 0.60091684, -2.12385103, -1.8664509 ,  1.00339358],
       [ 0.52568098, -1.98763782, -0.65746606, -1.00442133]])

In [13]:
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [14]:
# quick summary stats
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.247416,-0.467168,-0.178315,-0.288026
std,0.654605,1.017347,1.103942,0.748332
min,-1.34399,-2.123851,-1.866451,-1.079049
25%,0.131924,-0.847395,-0.851656,-0.903833
50%,0.445822,-0.481358,-0.054766,-0.425484
75%,0.548153,0.42109,0.464743,-0.044742
max,1.110772,0.754143,1.761802,1.003394


In [15]:
df2.describe()

Unnamed: 0,A,C,D
count,6.0,6.0,6.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [16]:
# transpose the data
# turn rows to columns and vice versa
df.T

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,0.19225,0.389539,-1.34399,0.502105,0.111816,1.110772,0.555643,-0.170577,0.600917,0.525681
B,0.517209,-0.812363,0.361734,-0.859073,-0.562476,0.754143,-0.400239,0.440875,-2.123851,-1.987638
C,-1.52886,1.761802,-0.225792,0.573846,-0.916386,0.82246,0.137434,0.11626,-1.866451,-0.657466
D,-1.079049,-0.637107,-0.992741,0.031882,-0.366093,-0.484875,0.92336,-0.274613,1.003394,-1.004421


In [17]:
# sort along the axis - 1 = horizontal
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,-1.079049,-1.52886,0.517209,0.19225
2022-06-20,-0.637107,1.761802,-0.812363,0.389539
2022-06-21,-0.992741,-0.225792,0.361734,-1.34399
2022-06-22,0.031882,0.573846,-0.859073,0.502105
2022-06-23,-0.366093,-0.916386,-0.562476,0.111816
2022-06-24,-0.484875,0.82246,0.754143,1.110772
2022-06-25,0.92336,0.137434,-0.400239,0.555643
2022-06-26,-0.274613,0.11626,0.440875,-0.170577
2022-06-27,1.003394,-1.866451,-2.123851,0.600917
2022-06-28,-1.004421,-0.657466,-1.987638,0.525681


In [18]:
# sort along the axis - 0 = vertical
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421
2022-06-27,0.600917,-2.123851,-1.866451,1.003394
2022-06-26,-0.170577,0.440875,0.11626,-0.274613
2022-06-25,0.555643,-0.400239,0.137434,0.92336
2022-06-24,1.110772,0.754143,0.82246,-0.484875
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093
2022-06-22,0.502105,-0.859073,0.573846,0.031882
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741
2022-06-20,0.389539,-0.812363,1.761802,-0.637107
2022-06-19,0.19225,0.517209,-1.52886,-1.079049


In [19]:
# sort ascending by values in a column
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741
2022-06-26,-0.170577,0.440875,0.11626,-0.274613
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093
2022-06-19,0.19225,0.517209,-1.52886,-1.079049
2022-06-20,0.389539,-0.812363,1.761802,-0.637107
2022-06-22,0.502105,-0.859073,0.573846,0.031882
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421
2022-06-25,0.555643,-0.400239,0.137434,0.92336
2022-06-27,0.600917,-2.123851,-1.866451,1.003394
2022-06-24,1.110772,0.754143,0.82246,-0.484875


In [20]:
# sort by non-numerical values
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo


In [21]:
# sort by two or more columns
df2.sort_values(by=["F", "E"])

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


# Selection

For production prefer the following instead of other data access methods (typical python methods like ["col"] or [a:b] slices etc.):

```.at, .iat, .loc and .iloc.```

## Getting

In [22]:
# selecting a single column returns a Series object
df["A"]

2022-06-19    0.192250
2022-06-20    0.389539
2022-06-21   -1.343990
2022-06-22    0.502105
2022-06-23    0.111816
2022-06-24    1.110772
2022-06-25    0.555643
2022-06-26   -0.170577
2022-06-27    0.600917
2022-06-28    0.525681
Freq: D, Name: A, dtype: float64

In [23]:
# selecting a slice
df[1:5]

Unnamed: 0,A,B,C,D
2022-06-20,0.389539,-0.812363,1.761802,-0.637107
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741
2022-06-22,0.502105,-0.859073,0.573846,0.031882
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093


## Selection by label

In [24]:
# selecting based on a label
df.loc[dates[0]]

A    0.192250
B    0.517209
C   -1.528860
D   -1.079049
Name: 2022-06-19 00:00:00, dtype: float64

In [25]:
# select on a multi-axis by lable
# I honestly do not know what the comment above means...help!
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2022-06-19,0.19225,0.517209
2022-06-20,0.389539,-0.812363
2022-06-21,-1.34399,0.361734
2022-06-22,0.502105,-0.859073
2022-06-23,0.111816,-0.562476
2022-06-24,1.110772,0.754143
2022-06-25,0.555643,-0.400239
2022-06-26,-0.170577,0.440875
2022-06-27,0.600917,-2.123851
2022-06-28,0.525681,-1.987638


In [26]:
# specific index value results in reduction of dimensions
res = df.loc["2022-06-20"]
print(res)
print("res.shape = ", res.shape, " vs. df.shape = ", df.shape)

A    0.389539
B   -0.812363
C    1.761802
D   -0.637107
Name: 2022-06-20 00:00:00, dtype: float64
res.shape =  (4,)  vs. df.shape =  (10, 4)


In [27]:
# get to a specific scalar:
#
# method one
df.loc[dates[0], "A"]

0.19225006621442584

In [28]:
#
# method two (slightly faster than method one)
df.at[dates[0], "A"]

0.19225006621442584

## Selection by position

In [29]:
df.iloc[2]

A   -1.343990
B    0.361734
C   -0.225792
D   -0.992741
Name: 2022-06-21 00:00:00, dtype: float64

In [30]:
# slices - similar to NumPy / Python - [row:slice, col:slice]
df.iloc[1:5, 0:2]

Unnamed: 0,A,B
2022-06-20,0.389539,-0.812363
2022-06-21,-1.34399,0.361734
2022-06-22,0.502105,-0.859073
2022-06-23,0.111816,-0.562476


In [31]:
# by list of locations - similar to NumPy / Python - [[list of rows], [list of cols]]
df.iloc[[0, 1, 2, 6], [0, 2]]

Unnamed: 0,A,C
2022-06-19,0.19225,-1.52886
2022-06-20,0.389539,1.761802
2022-06-21,-1.34399,-0.225792
2022-06-25,0.555643,0.137434


In [32]:
# by list of locations - similar to NumPy / Python - [[list of rows], [list of cols]]
# change the order of columns, repeact a column
df.iloc[[0, 1, 2, 6], [2, 1, 0, 2]]

Unnamed: 0,C,B,A,C.1
2022-06-19,-1.52886,0.517209,0.19225,-1.52886
2022-06-20,1.761802,-0.812363,0.389539,1.761802
2022-06-21,-0.225792,0.361734,-1.34399,-0.225792
2022-06-25,0.137434,-0.400239,0.555643,0.137434


In [33]:
# slice rows explicitly, keep all columns
df.iloc[[1, 2], :]

Unnamed: 0,A,B,C,D
2022-06-20,0.389539,-0.812363,1.761802,-0.637107
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741


In [34]:
# slice columns, keep all rows
df.iloc[:, [2, 3]]

Unnamed: 0,C,D
2022-06-19,-1.52886,-1.079049
2022-06-20,1.761802,-0.637107
2022-06-21,-0.225792,-0.992741
2022-06-22,0.573846,0.031882
2022-06-23,-0.916386,-0.366093
2022-06-24,0.82246,-0.484875
2022-06-25,0.137434,0.92336
2022-06-26,0.11626,-0.274613
2022-06-27,-1.866451,1.003394
2022-06-28,-0.657466,-1.004421


In [35]:
# everything, because you can
df.iloc[:, :]

Unnamed: 0,A,B,C,D
2022-06-19,0.19225,0.517209,-1.52886,-1.079049
2022-06-20,0.389539,-0.812363,1.761802,-0.637107
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741
2022-06-22,0.502105,-0.859073,0.573846,0.031882
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093
2022-06-24,1.110772,0.754143,0.82246,-0.484875
2022-06-25,0.555643,-0.400239,0.137434,0.92336
2022-06-26,-0.170577,0.440875,0.11626,-0.274613
2022-06-27,0.600917,-2.123851,-1.866451,1.003394
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421


In [36]:
# get to a scalar (2 methods, just like before)
#
# method one: use iloc
df.iloc[1, 2]

1.7618015491288634

In [37]:
#
# method two: use iat
df.iat[1, 2]

1.7618015491288634

## Boolean Indexing

In [38]:
# use a value found in a single col to get data
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2022-06-19,0.19225,0.517209,-1.52886,-1.079049
2022-06-20,0.389539,-0.812363,1.761802,-0.637107
2022-06-22,0.502105,-0.859073,0.573846,0.031882
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093
2022-06-24,1.110772,0.754143,0.82246,-0.484875
2022-06-25,0.555643,-0.400239,0.137434,0.92336
2022-06-27,0.600917,-2.123851,-1.866451,1.003394
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421


In [39]:
# boolean across the entire DF - vals that don't match go NaN
df[df > 0]

Unnamed: 0,A,B,C,D
2022-06-19,0.19225,0.517209,,
2022-06-20,0.389539,,1.761802,
2022-06-21,,0.361734,,
2022-06-22,0.502105,,0.573846,0.031882
2022-06-23,0.111816,,,
2022-06-24,1.110772,0.754143,0.82246,
2022-06-25,0.555643,,0.137434,0.92336
2022-06-26,,0.440875,0.11626,
2022-06-27,0.600917,,,1.003394
2022-06-28,0.525681,,,


In [40]:
# add another column
df11 = df.copy()
df11["E"] = ["one", "two", "three", "four",
             "two", "five", "one", "two", "three", "four"]
df11

Unnamed: 0,A,B,C,D,E
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,one
2022-06-20,0.389539,-0.812363,1.761802,-0.637107,two
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,three
2022-06-22,0.502105,-0.859073,0.573846,0.031882,four
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093,two
2022-06-24,1.110772,0.754143,0.82246,-0.484875,five
2022-06-25,0.555643,-0.400239,0.137434,0.92336,one
2022-06-26,-0.170577,0.440875,0.11626,-0.274613,two
2022-06-27,0.600917,-2.123851,-1.866451,1.003394,three
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421,four


In [41]:
# the isin() query - basically the in clause
df11[df11["E"].isin(["two", "five"])]

Unnamed: 0,A,B,C,D,E
2022-06-20,0.389539,-0.812363,1.761802,-0.637107,two
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093,two
2022-06-24,1.110772,0.754143,0.82246,-0.484875,five
2022-06-26,-0.170577,0.440875,0.11626,-0.274613,two


## Setting values

In [42]:
# matching indexes auto-aligns values
s1 = pd.Series(range(11, 21), index=pd.date_range('20220619', periods=10))
s1

2022-06-19    11
2022-06-20    12
2022-06-21    13
2022-06-22    14
2022-06-23    15
2022-06-24    16
2022-06-25    17
2022-06-26    18
2022-06-27    19
2022-06-28    20
Freq: D, dtype: int64

In [43]:
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,11
2022-06-20,0.389539,-0.812363,1.761802,-0.637107,12
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093,15
2022-06-24,1.110772,0.754143,0.82246,-0.484875,16
2022-06-25,0.555643,-0.400239,0.137434,0.92336,17
2022-06-26,-0.170577,0.440875,0.11626,-0.274613,18
2022-06-27,0.600917,-2.123851,-1.866451,1.003394,19
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421,20


In [44]:
# setting values by label and position
# first let's make a quick copy
df12 = df.copy()
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,11
2022-06-20,0.389539,-0.812363,1.761802,-0.637107,12
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093,15
2022-06-24,1.110772,0.754143,0.82246,-0.484875,16
2022-06-25,0.555643,-0.400239,0.137434,0.92336,17
2022-06-26,-0.170577,0.440875,0.11626,-0.274613,18
2022-06-27,0.600917,-2.123851,-1.866451,1.003394,19
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421,20


In [45]:
# set by label
df12.at[dates[0], "A"] = 0
# set by position
df12.iat[0, 1] = 0
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-1.52886,-1.079049,11
2022-06-20,0.389539,-0.812363,1.761802,-0.637107,12
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14
2022-06-23,0.111816,-0.562476,-0.916386,-0.366093,15
2022-06-24,1.110772,0.754143,0.82246,-0.484875,16
2022-06-25,0.555643,-0.400239,0.137434,0.92336,17
2022-06-26,-0.170577,0.440875,0.11626,-0.274613,18
2022-06-27,0.600917,-2.123851,-1.866451,1.003394,19
2022-06-28,0.525681,-1.987638,-0.657466,-1.004421,20


In [46]:
# kinda bigger replacement
df12.loc[:, "D"] = np.array([5]*len(df))
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-1.52886,5,11
2022-06-20,0.389539,-0.812363,1.761802,5,12
2022-06-21,-1.34399,0.361734,-0.225792,5,13
2022-06-22,0.502105,-0.859073,0.573846,5,14
2022-06-23,0.111816,-0.562476,-0.916386,5,15
2022-06-24,1.110772,0.754143,0.82246,5,16
2022-06-25,0.555643,-0.400239,0.137434,5,17
2022-06-26,-0.170577,0.440875,0.11626,5,18
2022-06-27,0.600917,-2.123851,-1.866451,5,19
2022-06-28,0.525681,-1.987638,-0.657466,5,20


In [47]:
# setting values using a boolean selection (aka where clause)
df12[df12 > 0] = -df12
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-1.52886,-5,-11
2022-06-20,-0.389539,-0.812363,-1.761802,-5,-12
2022-06-21,-1.34399,-0.361734,-0.225792,-5,-13
2022-06-22,-0.502105,-0.859073,-0.573846,-5,-14
2022-06-23,-0.111816,-0.562476,-0.916386,-5,-15
2022-06-24,-1.110772,-0.754143,-0.82246,-5,-16
2022-06-25,-0.555643,-0.400239,-0.137434,-5,-17
2022-06-26,-0.170577,-0.440875,-0.11626,-5,-18
2022-06-27,-0.600917,-2.123851,-1.866451,-5,-19
2022-06-28,-0.525681,-1.987638,-0.657466,-5,-20


# Missing Data

### _reindex_
change/add/delete index on a specified axis, returns a new dataframe

In [55]:
df13 = df.reindex(index=dates[0:4], columns=list(df.columns)+["G"])
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,11,
2022-06-20,0.389539,-0.812363,1.761802,-0.637107,12,
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13,
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14,


In [64]:
# missing data in pandas is np.nan
df13.iat[1, 0] = np.nan
df13.iloc[1:, 5] = np.random.randint(1)
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,11,
2022-06-20,,-0.812363,1.761802,-0.637107,12,0.0
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13,0.0
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14,0.0


In [65]:
# get a boolean mask where values are NaN
df131 = pd.isna(df13)
df131

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [66]:
# or just
pd.isna(df13)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [68]:
# the original is still there
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,11,
2022-06-20,,-0.812363,1.761802,-0.637107,12,0.0
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13,0.0
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14,0.0


In [73]:
# we are going to drop / replace values now, let's make a couple of copies of the dataframe
df132 = df13.copy()
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,11,
2022-06-20,,-0.812363,1.761802,-0.637107,12,0.0
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13,0.0
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14,0.0


In [74]:
# drop rows/columns that have missing data
df132.dropna(how="any")
# how=‘any’ : If any NA values are present, drop that row or column.
# how=‘all’ : If all values are NA, drop that row or column.
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,11,
2022-06-20,,-0.812363,1.761802,-0.637107,12,0.0
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13,0.0
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14,0.0


In [77]:
# fill missing data
df133 = df13.copy()
df133.fillna(np.pi*1000)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.19225,0.517209,-1.52886,-1.079049,11,3141.592654
2022-06-20,3141.592654,-0.812363,1.761802,-0.637107,12,0.0
2022-06-21,-1.34399,0.361734,-0.225792,-0.992741,13,0.0
2022-06-22,0.502105,-0.859073,0.573846,0.031882,14,0.0
