# Setup

This notebook follows the [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html) guide, using pandas 1.4.2.

In [1]:
import numpy as np
import pandas as pd

# Object creation

In [2]:
# Series - build one from a plain Python list; np.nan marks a missing value
s1 = pd.Series(data=[1, 2, 3, np.nan, 5])

In [3]:
# display the Series; the np.nan entry makes the whole Series float64
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [4]:
# create an index to use in the dataframe:
# ten consecutive calendar days starting 2022-06-19
# (freq "D" = daily, which is also the default frequency)
dates = pd.date_range(start='20220619', periods=10, freq='D')
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

Just in case you are wondering, here are [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) that can be used as the frequency of a `DatetimeIndex`.

In [5]:
# DataFrame - pass a NumPy array
# create a 10-row, 4-column array of random numbers, index it by `dates`,
# and name the columns A, B, C, D
# NOTE(review): no random seed is set, so the values differ on every run
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2022-06-19,0.086238,0.168453,-0.925299,-0.632629
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425
2022-06-22,0.175745,-1.825648,2.409012,-0.468868
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584
2022-06-24,1.21304,-0.906392,2.198223,1.081295
2022-06-25,-0.098124,-0.449159,0.80124,1.454664
2022-06-26,0.340825,0.073245,-0.407493,-1.213418
2022-06-27,0.809325,1.373415,0.331549,0.099239
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644


In [6]:
# Create a dataframe from a dictionary: each key becomes a column name and
# each value is anything pandas can turn into a column. Scalars (A, B, G) are
# repeated to match the length of the array-like entries (6 rows here).
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20220619"),
        C=pd.Series(1, index=list(range(6)), dtype="float32"),
        D=np.full(6, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 3),
        F=list("foofoo"),
        G="foo",
    )
)

df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [7]:
# each column has its own dtype, determined by the object it was built from
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

# Viewing Data

In [8]:
# top of the dataframe (head() shows the first 5 rows by default)
df.head()

Unnamed: 0,A,B,C,D
2022-06-19,0.086238,0.168453,-0.925299,-0.632629
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425
2022-06-22,0.175745,-1.825648,2.409012,-0.468868
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584


In [9]:
# bottom of the dataframe: the last 3 rows
df.tail(3)

Unnamed: 0,A,B,C,D
2022-06-26,0.340825,0.073245,-0.407493,-1.213418
2022-06-27,0.809325,1.373415,0.331549,0.099239
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644


In [10]:
# row index of the df (the DatetimeIndex it was created with)
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# column labels of the df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# .to_numpy gives a NumPy representation of the dataframe (index and column labels are not included)
# a NumPy array has a single dtype, so this can be expensive when the columns
# have different dtypes; here every column is float64
df.to_numpy()

array([[ 0.08623819,  0.16845257, -0.92529926, -0.63262905],
       [ 1.9486896 , -0.76421484, -0.52985238, -0.29354083],
       [ 0.52547221, -0.21700211, -0.12626097, -1.98542475],
       [ 0.17574508, -1.82564753,  2.4090117 , -0.46886769],
       [-0.30123284, -1.13633007,  0.1071116 , -1.33058399],
       [ 1.21303994, -0.90639216,  2.19822265,  1.08129459],
       [-0.09812449, -0.44915906,  0.80123962,  1.45466419],
       [ 0.34082542,  0.07324525, -0.40749321, -1.21341842],
       [ 0.80932536,  1.37341479,  0.33154922,  0.09923913],
       [-2.10143108,  2.36243313, -1.50339858, -0.98364408]])

In [13]:
# df2 mixes several column dtypes, so the resulting array has dtype=object
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [14]:
# quick summary stats per column: count, mean, std, min, quartiles, max
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.259855,-0.13212,0.235483,-0.427291
std,1.064352,1.229507,1.265943,1.071232
min,-2.101431,-1.825648,-1.503399,-1.985425
25%,-0.052034,-0.870848,-0.499263,-1.155975
50%,0.258285,-0.333081,-0.009575,-0.550748
75%,0.738362,0.144651,0.683817,0.001044
max,1.94869,2.362433,2.409012,1.454664


In [15]:
# on a frame with mixed dtypes, describe() summarizes only the numeric columns (A, C, D)
df2.describe()

Unnamed: 0,A,C,D
count,6.0,6.0,6.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [16]:
# transpose the data:
# rows become columns and columns become rows
df.T

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,0.086238,1.94869,0.525472,0.175745,-0.301233,1.21304,-0.098124,0.340825,0.809325,-2.101431
B,0.168453,-0.764215,-0.217002,-1.825648,-1.13633,-0.906392,-0.449159,0.073245,1.373415,2.362433
C,-0.925299,-0.529852,-0.126261,2.409012,0.107112,2.198223,0.80124,-0.407493,0.331549,-1.503399
D,-0.632629,-0.293541,-1.985425,-0.468868,-1.330584,1.081295,1.454664,-1.213418,0.099239,-0.983644


In [17]:
# sort by the labels along axis 1 (the column labels), in descending order: D, C, B, A
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,-0.632629,-0.925299,0.168453,0.086238
2022-06-20,-0.293541,-0.529852,-0.764215,1.94869
2022-06-21,-1.985425,-0.126261,-0.217002,0.525472
2022-06-22,-0.468868,2.409012,-1.825648,0.175745
2022-06-23,-1.330584,0.107112,-1.13633,-0.301233
2022-06-24,1.081295,2.198223,-0.906392,1.21304
2022-06-25,1.454664,0.80124,-0.449159,-0.098124
2022-06-26,-1.213418,-0.407493,0.073245,0.340825
2022-06-27,0.099239,0.331549,1.373415,0.809325
2022-06-28,-0.983644,-1.503399,2.362433,-2.101431


In [18]:
# sort by the labels along axis 0 (the row index), in descending order: latest date first
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644
2022-06-27,0.809325,1.373415,0.331549,0.099239
2022-06-26,0.340825,0.073245,-0.407493,-1.213418
2022-06-25,-0.098124,-0.449159,0.80124,1.454664
2022-06-24,1.21304,-0.906392,2.198223,1.081295
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584
2022-06-22,0.175745,-1.825648,2.409012,-0.468868
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541
2022-06-19,0.086238,0.168453,-0.925299,-0.632629


In [19]:
# sort the rows by the values in column A (ascending is the default)
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584
2022-06-25,-0.098124,-0.449159,0.80124,1.454664
2022-06-19,0.086238,0.168453,-0.925299,-0.632629
2022-06-22,0.175745,-1.825648,2.409012,-0.468868
2022-06-26,0.340825,0.073245,-0.407493,-1.213418
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425
2022-06-27,0.809325,1.373415,0.331549,0.099239
2022-06-24,1.21304,-0.906392,2.198223,1.081295
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541


In [20]:
# sort by non-numerical values: string column F in descending order ('o' rows before 'f' rows)
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo


In [21]:
# sort by two or more columns: by F first, then by E among rows with the same F
df2.sort_values(by=["F", "E"])

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


# Selection

For production code, prefer the following pandas accessors over the standard Python indexing shown first below (such as `["col"]` or `[a:b]` slices):

`.at`, `.iat`, `.loc` and `.iloc`.

## Getting

In [22]:
# selecting a single column by name returns a Series object
df["A"]

2022-06-19    0.086238
2022-06-20    1.948690
2022-06-21    0.525472
2022-06-22    0.175745
2022-06-23   -0.301233
2022-06-24    1.213040
2022-06-25   -0.098124
2022-06-26    0.340825
2022-06-27    0.809325
2022-06-28   -2.101431
Freq: D, Name: A, dtype: float64

In [23]:
# slicing with [] selects rows by position: rows 1 through 4 (the end, 5, is excluded)
df[1:5]

Unnamed: 0,A,B,C,D
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425
2022-06-22,0.175745,-1.825648,2.409012,-0.468868
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584


## Selection by label

In [24]:
# selecting a single row by its index label returns that row as a Series
df.loc[dates[0]]

A    0.086238
B    0.168453
C   -0.925299
D   -0.632629
Name: 2022-06-19 00:00:00, dtype: float64

In [25]:
# select on multiple axes by label: .loc takes [rows, columns]
# ':' keeps every row, and the list picks columns A and B
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2022-06-19,0.086238,0.168453
2022-06-20,1.94869,-0.764215
2022-06-21,0.525472,-0.217002
2022-06-22,0.175745,-1.825648
2022-06-23,-0.301233,-1.13633
2022-06-24,1.21304,-0.906392
2022-06-25,-0.098124,-0.449159
2022-06-26,0.340825,0.073245
2022-06-27,0.809325,1.373415
2022-06-28,-2.101431,2.362433


In [26]:
# selecting one specific index value reduces the dimensions:
# the result is a 1-D Series for that row instead of a 2-D dataframe
res = df.loc["2022-06-20"]
print(res)
print("res.shape = ", res.shape, " vs. df.shape = ", df.shape)

A    1.948690
B   -0.764215
C   -0.529852
D   -0.293541
Name: 2022-06-20 00:00:00, dtype: float64
res.shape =  (4,)  vs. df.shape =  (10, 4)


In [27]:
# get to a specific scalar by label:
#
# method one: .loc with a row label and a column label
df.loc[dates[0], "A"]

0.08623818904114744

In [28]:
#
# method two: .at, the scalar accessor (slightly faster than method one)
df.at[dates[0], "A"]

0.08623818904114744

## Selection by position

In [29]:
# select a row by integer position: position 2 is the third row (2022-06-21)
df.iloc[2]

A    0.525472
B   -0.217002
C   -0.126261
D   -1.985425
Name: 2022-06-21 00:00:00, dtype: float64

In [30]:
# integer slices - similar to NumPy / Python - [row slice, col slice]; the end of each slice is excluded
df.iloc[1:5, 0:2]

Unnamed: 0,A,B
2022-06-20,1.94869,-0.764215
2022-06-21,0.525472,-0.217002
2022-06-22,0.175745,-1.825648
2022-06-23,-0.301233,-1.13633


In [31]:
# by lists of integer positions - similar to NumPy - [[list of rows], [list of cols]]
df.iloc[[0, 1, 2, 6], [0, 2]]

Unnamed: 0,A,C
2022-06-19,0.086238,-0.925299
2022-06-20,1.94869,-0.529852
2022-06-21,0.525472,-0.126261
2022-06-25,-0.098124,0.80124


In [32]:
# by lists of integer positions - similar to NumPy - [[list of rows], [list of cols]]
# the column list can change the order of the columns and repeat a column
df.iloc[[0, 1, 2, 6], [2, 1, 0, 2]]

Unnamed: 0,C,B,A,C.1
2022-06-19,-0.925299,0.168453,0.086238,-0.925299
2022-06-20,-0.529852,-0.764215,1.94869,-0.529852
2022-06-21,-0.126261,-0.217002,0.525472,-0.126261
2022-06-25,0.80124,-0.449159,-0.098124,0.80124


In [33]:
# pick rows explicitly by position; ':' keeps all columns
df.iloc[[1, 2], :]

Unnamed: 0,A,B,C,D
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425


In [34]:
# pick columns explicitly by position; ':' keeps all rows
df.iloc[:, [2, 3]]

Unnamed: 0,C,D
2022-06-19,-0.925299,-0.632629
2022-06-20,-0.529852,-0.293541
2022-06-21,-0.126261,-1.985425
2022-06-22,2.409012,-0.468868
2022-06-23,0.107112,-1.330584
2022-06-24,2.198223,1.081295
2022-06-25,0.80124,1.454664
2022-06-26,-0.407493,-1.213418
2022-06-27,0.331549,0.099239
2022-06-28,-1.503399,-0.983644


In [35]:
# everything, because you can: ':' on both axes selects all rows and all columns
df.iloc[:, :]

Unnamed: 0,A,B,C,D
2022-06-19,0.086238,0.168453,-0.925299,-0.632629
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425
2022-06-22,0.175745,-1.825648,2.409012,-0.468868
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584
2022-06-24,1.21304,-0.906392,2.198223,1.081295
2022-06-25,-0.098124,-0.449159,0.80124,1.454664
2022-06-26,0.340825,0.073245,-0.407493,-1.213418
2022-06-27,0.809325,1.373415,0.331549,0.099239
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644


In [36]:
# get to a scalar by position (2 methods, just like with labels)
#
# method one: use iloc with a row position and a column position
df.iloc[1, 2]

-0.5298523790906997

In [37]:
#
# method two: use iat, the positional scalar accessor
df.iat[1, 2]

-0.5298523790906997

## Boolean Indexing

In [38]:
# filter rows with a boolean condition on a single column: keep the rows where A > 0
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2022-06-19,0.086238,0.168453,-0.925299,-0.632629
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425
2022-06-22,0.175745,-1.825648,2.409012,-0.468868
2022-06-24,1.21304,-0.906392,2.198223,1.081295
2022-06-26,0.340825,0.073245,-0.407493,-1.213418
2022-06-27,0.809325,1.373415,0.331549,0.099239


In [39]:
# boolean condition across the entire dataframe: the shape is kept and
# values that don't satisfy the condition become NaN
df[df > 0]

Unnamed: 0,A,B,C,D
2022-06-19,0.086238,0.168453,,
2022-06-20,1.94869,,,
2022-06-21,0.525472,,,
2022-06-22,0.175745,,2.409012,
2022-06-23,,,0.107112,
2022-06-24,1.21304,,2.198223,1.081295
2022-06-25,,,0.80124,1.454664
2022-06-26,0.340825,0.073245,,
2022-06-27,0.809325,1.373415,0.331549,0.099239
2022-06-28,,2.362433,,


In [40]:
# add another column (E); work on a copy so that df itself is not changed
df11 = df.copy()
df11["E"] = ["one", "two", "three", "four",
             "two", "five", "one", "two", "three", "four"]
df11

Unnamed: 0,A,B,C,D,E
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,one
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541,two
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,three
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,four
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584,two
2022-06-24,1.21304,-0.906392,2.198223,1.081295,five
2022-06-25,-0.098124,-0.449159,0.80124,1.454664,one
2022-06-26,0.340825,0.073245,-0.407493,-1.213418,two
2022-06-27,0.809325,1.373415,0.331549,0.099239,three
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644,four


In [41]:
# the isin() query - basically SQL's IN clause: keep the rows whose E value is in the given list
df11[df11["E"].isin(["two", "five"])]

Unnamed: 0,A,B,C,D,E
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541,two
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584,two
2022-06-24,1.21304,-0.906392,2.198223,1.081295,five
2022-06-26,0.340825,0.073245,-0.407493,-1.213418,two


## Setting values

In [42]:
# a Series indexed by the same ten dates as the dataframe: when it is
# assigned as a column, values are lined up by matching index labels
s1 = pd.Series(
    data=range(11, 21),
    index=pd.date_range(start='20220619', periods=10, freq='D'),
)
s1

2022-06-19    11
2022-06-20    12
2022-06-21    13
2022-06-22    14
2022-06-23    15
2022-06-24    16
2022-06-25    17
2022-06-26    18
2022-06-27    19
2022-06-28    20
Freq: D, dtype: int64

In [43]:
# add the Series as a new column F; its values are matched to rows by index label
# note: this modifies df itself
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541,12
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584,15
2022-06-24,1.21304,-0.906392,2.198223,1.081295,16
2022-06-25,-0.098124,-0.449159,0.80124,1.454664,17
2022-06-26,0.340825,0.073245,-0.407493,-1.213418,18
2022-06-27,0.809325,1.373415,0.331549,0.099239,19
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644,20


In [44]:
# setting values by label and position
# first make a copy, so that df itself is not changed by the assignments that follow
df12 = df.copy()
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541,12
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584,15
2022-06-24,1.21304,-0.906392,2.198223,1.081295,16
2022-06-25,-0.098124,-0.449159,0.80124,1.454664,17
2022-06-26,0.340825,0.073245,-0.407493,-1.213418,18
2022-06-27,0.809325,1.373415,0.331549,0.099239,19
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644,20


In [45]:
# set a single value by label (row dates[0], column "A")
df12.at[dates[0], "A"] = 0
# set a single value by position (row 0, column 1, i.e. column B)
df12.iat[0, 1] = 0
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.925299,-0.632629,11
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541,12
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14
2022-06-23,-0.301233,-1.13633,0.107112,-1.330584,15
2022-06-24,1.21304,-0.906392,2.198223,1.081295,16
2022-06-25,-0.098124,-0.449159,0.80124,1.454664,17
2022-06-26,0.340825,0.073245,-0.407493,-1.213418,18
2022-06-27,0.809325,1.373415,0.331549,0.099239,19
2022-06-28,-2.101431,2.362433,-1.503399,-0.983644,20


In [46]:
# kinda bigger replacement: overwrite every value in column D with 5
# the array must have one entry per row of the frame being assigned to,
# so it is sized from df12 itself rather than from another frame
df12.loc[:, "D"] = np.array([5]*len(df12))
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.925299,5,11
2022-06-20,1.94869,-0.764215,-0.529852,5,12
2022-06-21,0.525472,-0.217002,-0.126261,5,13
2022-06-22,0.175745,-1.825648,2.409012,5,14
2022-06-23,-0.301233,-1.13633,0.107112,5,15
2022-06-24,1.21304,-0.906392,2.198223,5,16
2022-06-25,-0.098124,-0.449159,0.80124,5,17
2022-06-26,0.340825,0.073245,-0.407493,5,18
2022-06-27,0.809325,1.373415,0.331549,5,19
2022-06-28,-2.101431,2.362433,-1.503399,5,20


In [47]:
# setting values using a boolean selection (aka where clause):
# every value greater than 0 is replaced by its negation, so no positive value remains
df12[df12 > 0] = -df12
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.925299,-5,-11
2022-06-20,-1.94869,-0.764215,-0.529852,-5,-12
2022-06-21,-0.525472,-0.217002,-0.126261,-5,-13
2022-06-22,-0.175745,-1.825648,-2.409012,-5,-14
2022-06-23,-0.301233,-1.13633,-0.107112,-5,-15
2022-06-24,-1.21304,-0.906392,-2.198223,-5,-16
2022-06-25,-0.098124,-0.449159,-0.80124,-5,-17
2022-06-26,-0.340825,-0.073245,-0.407493,-5,-18
2022-06-27,-0.809325,-1.373415,-0.331549,-5,-19
2022-06-28,-2.101431,-2.362433,-1.503399,-5,-20


# Missing Data

### _reindex_
Change, add, or delete the index labels on a specified axis. `reindex` returns a new dataframe; labels that did not exist before are filled with missing values.

In [48]:
# keep only the first four dates and add a new column G;
# G has no data yet, so it is filled with NaN
df13 = df.reindex(index=dates[0:4], columns=list(df.columns)+["G"])
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11,
2022-06-20,1.94869,-0.764215,-0.529852,-0.293541,12,
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,


In [49]:
# missing data in pandas is represented by np.nan
# blank out one value in column A (row 1, column 0) ...
df13.iat[1, 0] = np.nan
# ... and fill column G (position 5) with 0 from the second row on,
# leaving its first row missing.
# The value was previously np.random.randint(1), which samples from [0, 1)
# and can therefore only ever return 0; the literal makes that explicit.
df13.iloc[1:, 5] = 0
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11,
2022-06-20,,-0.764215,-0.529852,-0.293541,12,0.0
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,0.0
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,0.0


In [50]:
# get a boolean mask of the same shape as df13: True where the value is NaN
df131 = pd.isna(df13)
df131

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [51]:
# or just display the mask directly, without storing it in a variable
pd.isna(df13)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [52]:
# the original is still there: pd.isna() returned a new frame and left df13 unchanged
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11,
2022-06-20,,-0.764215,-0.529852,-0.293541,12,0.0
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,0.0
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,0.0


In [53]:
# we are going to drop values now, so work on a copy of the dataframe
df132 = df13.copy()
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11,
2022-06-20,,-0.764215,-0.529852,-0.293541,12,0.0
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,0.0
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,0.0


In [56]:
# drop rows that have missing data
# dropna() returns a new dataframe by default and leaves df132 untouched;
# pass inplace=True to modify df132 itself instead
nona = df132.dropna(how="any")
# how='any' : if any NA values are present, drop that row or column.
# how='all' : if all values are NA, drop that row or column.

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11,
2022-06-20,,-0.764215,-0.529852,-0.293541,12,0.0
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,0.0
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,0.0


In [57]:
# every row that contained a missing value has been dropped; all columns are kept
nona

Unnamed: 0,A,B,C,D,F,G
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,0.0
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,0.0


In [58]:
# the original is still intact: dropna() without inplace=True did not modify df132
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11,
2022-06-20,,-0.764215,-0.529852,-0.293541,12,0.0
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,0.0
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,0.0


In [60]:
# drop the rows with missing data from df132 itself:
# with inplace=True the frame is modified directly, so it is displayed on a separate line
df132.dropna(how="any", inplace=True)
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,0.0
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,0.0


In [55]:
# fill missing data
df133 = df13.copy()
df133.fillna(np.pi*1000)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.086238,0.168453,-0.925299,-0.632629,11,3141.592654
2022-06-20,3141.592654,-0.764215,-0.529852,-0.293541,12,0.0
2022-06-21,0.525472,-0.217002,-0.126261,-1.985425,13,0.0
2022-06-22,0.175745,-1.825648,2.409012,-0.468868,14,0.0
