# Setup

This notebook follows [10 minutes to pandas](https://pandas.pydata.org/docs/user_guide/10min.html) and was written against pandas 1.4.2.

In [1]:
import numpy as np
import pandas as pd

# Object creation

In [2]:
# A Series is built from any list-like; np.nan marks a missing entry.
s1 = pd.Series(data=[1, 2, 3, np.nan, 5])

In [3]:
# display the Series; with the np.nan entry present the values are stored as float64
s1

0    1.0
1    2.0
2    3.0
3    NaN
4    5.0
dtype: float64

In [4]:
# Row index for the DataFrame below: a DatetimeIndex of 10 consecutive days.
# freq="D" (calendar day) is the default, spelled out here for clarity.
dates = pd.date_range(start="20220619", periods=10, freq="D")
dates

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In case you are wondering, here are [all the "offset aliases"](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) that can be used as the frequency of a `DatetimeIndex`.

In [5]:
# DataFrame - pass a NumPy array.
# Create a 10-row x 4-column array of standard-normal random numbers,
# index it by `dates`, and name the columns A-D.
# A seeded Generator makes the numbers identical on every Restart & Run All;
# re-run the notebook once to refresh the saved outputs to the seeded values.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((10, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2022-06-19,0.308022,1.720912,-0.609598,0.596226
2022-06-20,1.808241,1.848798,-1.481292,0.238267
2022-06-21,-1.061772,-0.14897,0.219503,1.196697
2022-06-22,2.343824,-2.820513,-0.393915,0.004902
2022-06-23,1.466746,1.615641,0.572107,-1.913607
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335
2022-06-25,-0.3171,0.219377,0.056721,-0.801607
2022-06-26,1.231746,-0.096526,0.465648,0.857024
2022-06-27,-0.843303,0.777268,0.086186,1.215642
2022-06-28,1.543246,1.758878,0.755212,0.036198


In [6]:
# Build a DataFrame from a dict: each value is either a scalar (repeated on
# every row) or an object that can be converted into a Series-like column.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20220619"),
        "C": pd.Series(1, index=list(range(6)), dtype="float32"),
        "D": np.full(6, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train"] * 3),
        "F": ["f", "o", "o", "f", "o", "o"],
        "G": "foo",
    }
)

df2

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


In [7]:
# each column has its own dtype, determined by the object it was built from
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

# Viewing Data

In [8]:
# top of the dataframe - with no argument, head() shows the first 5 rows
df.head()

Unnamed: 0,A,B,C,D
2022-06-19,0.308022,1.720912,-0.609598,0.596226
2022-06-20,1.808241,1.848798,-1.481292,0.238267
2022-06-21,-1.061772,-0.14897,0.219503,1.196697
2022-06-22,2.343824,-2.820513,-0.393915,0.004902
2022-06-23,1.466746,1.615641,0.572107,-1.913607


In [9]:
# last three rows of the dataframe
df.tail(n=3)

Unnamed: 0,A,B,C,D
2022-06-26,1.231746,-0.096526,0.465648,0.857024
2022-06-27,-0.843303,0.777268,0.086186,1.215642
2022-06-28,1.543246,1.758878,0.755212,0.036198


In [10]:
# row labels (the index) of the df - here the DatetimeIndex built earlier
df.index

DatetimeIndex(['2022-06-19', '2022-06-20', '2022-06-21', '2022-06-22',
               '2022-06-23', '2022-06-24', '2022-06-25', '2022-06-26',
               '2022-06-27', '2022-06-28'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# column labels of the df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
# .to_numpy() gives a NumPy representation of the dataframe's values
# (the index and column labels are not part of the returned array).
# This can be expensive when the columns have different data types.
df.to_numpy()

array([[ 0.30802163,  1.72091154, -0.60959827,  0.59622553],
       [ 1.80824086,  1.84879823, -1.48129211,  0.23826687],
       [-1.06177168, -0.14896953,  0.21950324,  1.19669675],
       [ 2.34382363, -2.82051251, -0.39391537,  0.00490181],
       [ 1.46674623,  1.61564118,  0.57210693, -1.91360678],
       [-1.25668328,  0.28845598, -1.36613358, -1.20335018],
       [-0.31709994,  0.21937666,  0.05672068, -0.80160706],
       [ 1.23174574, -0.09652625,  0.46564767,  0.85702375],
       [-0.8433027 ,  0.77726825,  0.08618594,  1.21564183],
       [ 1.54324621,  1.75887791,  0.75521241,  0.03619751]])

In [13]:
# with mixed column dtypes the result is a single array of dtype=object
df2.to_numpy()

array([[1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'f',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'test', 'o',
        'foo'],
       [1.0, Timestamp('2022-06-19 00:00:00'), 1.0, 3, 'train', 'o',
        'foo']], dtype=object)

In [14]:
# quick summary statistics, computed per column
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.522297,0.516332,-0.169556,0.022639
std,1.320718,1.419696,0.779825,1.043783
min,-1.256683,-2.820513,-1.481292,-1.913607
25%,-0.711752,-0.017551,-0.555678,-0.59998
50%,0.769884,0.532862,0.071453,0.137232
75%,1.524121,1.694594,0.404112,0.791824
max,2.343824,1.848798,0.755212,1.215642


1. **count** = Count number of non-NA/null observations
1. **max** = Maximum of the values in the object
1. **min** = Minimum of the values in the object
1. **mean** = Mean of the values
1. **std** = Standard deviation of the observations
1. **25%** = 25th percentile (the default lower percentile)
1. **50%** = 50th percentile - same as the median
1. **75%** = 75th percentile (the default upper percentile)

In [15]:
# report the 10th / 50th / 90th percentiles instead of the defaults
df.describe(percentiles=[0.1, 0.5, 0.9])

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.522297,0.516332,-0.169556,0.022639
std,1.320718,1.419696,0.779825,1.043783
min,-1.256683,-2.820513,-1.481292,-1.913607
10%,-1.081263,-0.416124,-1.377649,-1.274376
50%,0.769884,0.532862,0.071453,0.137232
90%,1.861799,1.76787,0.590417,1.198591
max,2.343824,1.848798,0.755212,1.215642


In [16]:
# in this run (pandas 1.4.2) only the numeric columns A, C and D are summarised
df2.describe()

Unnamed: 0,A,C,D
count,6.0,6.0,6.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [17]:
# transpose: rows become columns and columns become rows
df.transpose()

Unnamed: 0,2022-06-19,2022-06-20,2022-06-21,2022-06-22,2022-06-23,2022-06-24,2022-06-25,2022-06-26,2022-06-27,2022-06-28
A,0.308022,1.808241,-1.061772,2.343824,1.466746,-1.256683,-0.3171,1.231746,-0.843303,1.543246
B,1.720912,1.848798,-0.14897,-2.820513,1.615641,0.288456,0.219377,-0.096526,0.777268,1.758878
C,-0.609598,-1.481292,0.219503,-0.393915,0.572107,-1.366134,0.056721,0.465648,0.086186,0.755212
D,0.596226,0.238267,1.196697,0.004902,-1.913607,-1.20335,-0.801607,0.857024,1.215642,0.036198


In [18]:
# sort by the column labels (axis 1 / "columns"), in descending order
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2022-06-19,0.596226,-0.609598,1.720912,0.308022
2022-06-20,0.238267,-1.481292,1.848798,1.808241
2022-06-21,1.196697,0.219503,-0.14897,-1.061772
2022-06-22,0.004902,-0.393915,-2.820513,2.343824
2022-06-23,-1.913607,0.572107,1.615641,1.466746
2022-06-24,-1.20335,-1.366134,0.288456,-1.256683
2022-06-25,-0.801607,0.056721,0.219377,-0.3171
2022-06-26,0.857024,0.465648,-0.096526,1.231746
2022-06-27,1.215642,0.086186,0.777268,-0.843303
2022-06-28,0.036198,0.755212,1.758878,1.543246


In [19]:
# sort by the row labels (axis 0 / "index"), in descending order
df.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D
2022-06-28,1.543246,1.758878,0.755212,0.036198
2022-06-27,-0.843303,0.777268,0.086186,1.215642
2022-06-26,1.231746,-0.096526,0.465648,0.857024
2022-06-25,-0.3171,0.219377,0.056721,-0.801607
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335
2022-06-23,1.466746,1.615641,0.572107,-1.913607
2022-06-22,2.343824,-2.820513,-0.393915,0.004902
2022-06-21,-1.061772,-0.14897,0.219503,1.196697
2022-06-20,1.808241,1.848798,-1.481292,0.238267
2022-06-19,0.308022,1.720912,-0.609598,0.596226


In [20]:
# order the rows by the values in column A, smallest first
df.sort_values(by="A", ascending=True)

Unnamed: 0,A,B,C,D
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335
2022-06-21,-1.061772,-0.14897,0.219503,1.196697
2022-06-27,-0.843303,0.777268,0.086186,1.215642
2022-06-25,-0.3171,0.219377,0.056721,-0.801607
2022-06-19,0.308022,1.720912,-0.609598,0.596226
2022-06-26,1.231746,-0.096526,0.465648,0.857024
2022-06-23,1.466746,1.615641,0.572107,-1.913607
2022-06-28,1.543246,1.758878,0.755212,0.036198
2022-06-20,1.808241,1.848798,-1.481292,0.238267
2022-06-22,2.343824,-2.820513,-0.393915,0.004902


In [21]:
# sort by a non-numerical (string) column, in descending order
df2.sort_values(by="F", ascending=False)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2022-06-19,1.0,3,train,o,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo


In [22]:
# sort by two or more columns: rows are ordered by F first, ties are ordered by E
df2.sort_values(by=["F", "E"])

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2022-06-19,1.0,3,test,f,foo
3,1.0,2022-06-19,1.0,3,train,f,foo
2,1.0,2022-06-19,1.0,3,test,o,foo
4,1.0,2022-06-19,1.0,3,test,o,foo
1,1.0,2022-06-19,1.0,3,train,o,foo
5,1.0,2022-06-19,1.0,3,train,o,foo


# Selection

For production code, prefer the optimized pandas data access methods listed below over standard Python / NumPy-style indexing (such as `df["col"]` or `df[a:b]` slices):

`.at`, `.iat`, `.loc` and `.iloc`

## Getting

In [23]:
# selecting a single column by name returns a Series (indexed like the dataframe)
df["A"]

2022-06-19    0.308022
2022-06-20    1.808241
2022-06-21   -1.061772
2022-06-22    2.343824
2022-06-23    1.466746
2022-06-24   -1.256683
2022-06-25   -0.317100
2022-06-26    1.231746
2022-06-27   -0.843303
2022-06-28    1.543246
Freq: D, Name: A, dtype: float64

In [24]:
# slicing with [] selects rows by position: rows 1 to 4 (the end of the slice is excluded)
df[1:5]

Unnamed: 0,A,B,C,D
2022-06-20,1.808241,1.848798,-1.481292,0.238267
2022-06-21,-1.061772,-0.14897,0.219503,1.196697
2022-06-22,2.343824,-2.820513,-0.393915,0.004902
2022-06-23,1.466746,1.615641,0.572107,-1.913607


## Selection by label

In [25]:
# selecting a single row by its index label returns that row as a Series
df.loc[dates[0]]

A    0.308022
B    1.720912
C   -0.609598
D    0.596226
Name: 2022-06-19 00:00:00, dtype: float64

In [26]:
# select on multiple axes by label: .loc takes [row_selector, column_selector].
# ":" keeps every row (axis 0) and ["A", "B"] picks those two columns (axis 1).
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2022-06-19,0.308022,1.720912
2022-06-20,1.808241,1.848798
2022-06-21,-1.061772,-0.14897
2022-06-22,2.343824,-2.820513
2022-06-23,1.466746,1.615641
2022-06-24,-1.256683,0.288456
2022-06-25,-0.3171,0.219377
2022-06-26,1.231746,-0.096526
2022-06-27,-0.843303,0.777268
2022-06-28,1.543246,1.758878


In [27]:
# selecting one specific index value reduces the dimensions:
# the 2-D dataframe row comes back as a 1-D Series (compare the two shapes printed below)
res = df.loc["2022-06-20"]
print(res)
print("res.shape = ", res.shape, " vs. df.shape = ", df.shape)

A    1.808241
B    1.848798
C   -1.481292
D    0.238267
Name: 2022-06-20 00:00:00, dtype: float64
res.shape =  (4,)  vs. df.shape =  (10, 4)


In [28]:
# get a specific scalar value by label
#
# method one: .loc with a row label and a column label
df.loc[dates[0], "A"]

0.3080216302131427

In [29]:
# get a specific scalar value by label
# method two: .at with a row label and a column label (slightly faster than method one)
df.at[dates[0], "A"]

0.3080216302131427

## Selection by position

In [30]:
# select a single row by integer position (position 2 = third row); returns a Series
df.iloc[2]

A   -1.061772
B   -0.148970
C    0.219503
D    1.196697
Name: 2022-06-21 00:00:00, dtype: float64

In [31]:
# integer slices - similar to NumPy / Python - [row_slice, col_slice]
# the end of each slice is excluded: rows 1-4, columns 0-1
df.iloc[1:5, 0:2]

Unnamed: 0,A,B
2022-06-20,1.808241,1.848798
2022-06-21,-1.061772,-0.14897
2022-06-22,2.343824,-2.820513
2022-06-23,1.466746,1.615641


In [32]:
# by lists of integer positions - similar to NumPy / Python - [[list of rows], [list of cols]]
df.iloc[[0, 1, 2, 6], [0, 2]]

Unnamed: 0,A,C
2022-06-19,0.308022,-0.609598
2022-06-20,1.808241,-1.481292
2022-06-21,-1.061772,0.219503
2022-06-25,-0.3171,0.056721


In [33]:
# by lists of integer positions - similar to NumPy / Python - [[list of rows], [list of cols]]
# the column list may change the order of the columns and repeat a column
df.iloc[[0, 1, 2, 6], [2, 1, 0, 2]]

Unnamed: 0,C,B,A,C.1
2022-06-19,-0.609598,1.720912,0.308022,-0.609598
2022-06-20,-1.481292,1.848798,1.808241,-1.481292
2022-06-21,0.219503,-0.14897,-1.061772,0.219503
2022-06-25,0.056721,0.219377,-0.3171,0.056721


In [34]:
# pick rows explicitly (positions 1 and 2), keep all columns
df.iloc[[1, 2], :]

Unnamed: 0,A,B,C,D
2022-06-20,1.808241,1.848798,-1.481292,0.238267
2022-06-21,-1.061772,-0.14897,0.219503,1.196697


In [35]:
# pick columns explicitly (positions 2 and 3), keep all rows
df.iloc[:, [2, 3]]

Unnamed: 0,C,D
2022-06-19,-0.609598,0.596226
2022-06-20,-1.481292,0.238267
2022-06-21,0.219503,1.196697
2022-06-22,-0.393915,0.004902
2022-06-23,0.572107,-1.913607
2022-06-24,-1.366134,-1.20335
2022-06-25,0.056721,-0.801607
2022-06-26,0.465648,0.857024
2022-06-27,0.086186,1.215642
2022-06-28,0.755212,0.036198


In [36]:
# everything, because you can: ":" on both axes keeps all rows and all columns
df.iloc[:, :]

Unnamed: 0,A,B,C,D
2022-06-19,0.308022,1.720912,-0.609598,0.596226
2022-06-20,1.808241,1.848798,-1.481292,0.238267
2022-06-21,-1.061772,-0.14897,0.219503,1.196697
2022-06-22,2.343824,-2.820513,-0.393915,0.004902
2022-06-23,1.466746,1.615641,0.572107,-1.913607
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335
2022-06-25,-0.3171,0.219377,0.056721,-0.801607
2022-06-26,1.231746,-0.096526,0.465648,0.857024
2022-06-27,-0.843303,0.777268,0.086186,1.215642
2022-06-28,1.543246,1.758878,0.755212,0.036198


In [37]:
# get a scalar by integer position (2 methods, just like the label-based case)
#
# method one: use iloc with [row position, column position]
df.iloc[1, 2]

-1.4812921121087055

In [38]:
# get a scalar by integer position
# method two: use iat with [row position, column position]
df.iat[1, 2]

-1.4812921121087055

## Boolean Indexing

In [39]:
# keep only the rows whose value in column A is positive
df.loc[df["A"].gt(0)]

Unnamed: 0,A,B,C,D
2022-06-19,0.308022,1.720912,-0.609598,0.596226
2022-06-20,1.808241,1.848798,-1.481292,0.238267
2022-06-22,2.343824,-2.820513,-0.393915,0.004902
2022-06-23,1.466746,1.615641,0.572107,-1.913607
2022-06-26,1.231746,-0.096526,0.465648,0.857024
2022-06-28,1.543246,1.758878,0.755212,0.036198


In [40]:
# element-wise condition over the whole DF - values that fail it become NaN
df.where(df > 0)

Unnamed: 0,A,B,C,D
2022-06-19,0.308022,1.720912,,0.596226
2022-06-20,1.808241,1.848798,,0.238267
2022-06-21,,,0.219503,1.196697
2022-06-22,2.343824,,,0.004902
2022-06-23,1.466746,1.615641,0.572107,
2022-06-24,,0.288456,,
2022-06-25,,0.219377,0.056721,
2022-06-26,1.231746,,0.465648,0.857024
2022-06-27,,0.777268,0.086186,1.215642
2022-06-28,1.543246,1.758878,0.755212,0.036198


In [41]:
# build a new frame with an extra string column E (df itself is left untouched)
df11 = df.assign(
    E=["one", "two", "three", "four", "two",
       "five", "one", "two", "three", "four"]
)
df11

Unnamed: 0,A,B,C,D,E
2022-06-19,0.308022,1.720912,-0.609598,0.596226,one
2022-06-20,1.808241,1.848798,-1.481292,0.238267,two
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,three
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,four
2022-06-23,1.466746,1.615641,0.572107,-1.913607,two
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335,five
2022-06-25,-0.3171,0.219377,0.056721,-0.801607,one
2022-06-26,1.231746,-0.096526,0.465648,0.857024,two
2022-06-27,-0.843303,0.777268,0.086186,1.215642,three
2022-06-28,1.543246,1.758878,0.755212,0.036198,four


In [42]:
# isin() works like an SQL "IN" clause: keep rows whose E is one of the listed values
df11.loc[df11["E"].isin(["two", "five"])]

Unnamed: 0,A,B,C,D,E
2022-06-20,1.808241,1.848798,-1.481292,0.238267,two
2022-06-23,1.466746,1.615641,0.572107,-1.913607,two
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335,five
2022-06-26,1.231746,-0.096526,0.465648,0.857024,two


## Setting values

In [43]:
# a Series whose index matches the dataframe's dates, so that values
# auto-align on the index when it is assigned as a column
s1 = pd.Series(
    data=range(11, 21),
    index=pd.date_range(start="20220619", periods=10, freq="D"),
)
s1

2022-06-19    11
2022-06-20    12
2022-06-21    13
2022-06-22    14
2022-06-23    15
2022-06-24    16
2022-06-25    17
2022-06-26    18
2022-06-27    19
2022-06-28    20
Freq: D, dtype: int64

In [44]:
# assigning a Series as a new column aligns its values to the dataframe by index label;
# note this adds column F to df itself, so df has five columns from here on
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2022-06-19,0.308022,1.720912,-0.609598,0.596226,11
2022-06-20,1.808241,1.848798,-1.481292,0.238267,12
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14
2022-06-23,1.466746,1.615641,0.572107,-1.913607,15
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335,16
2022-06-25,-0.3171,0.219377,0.056721,-0.801607,17
2022-06-26,1.231746,-0.096526,0.465648,0.857024,18
2022-06-27,-0.843303,0.777268,0.086186,1.215642,19
2022-06-28,1.543246,1.758878,0.755212,0.036198,20


In [45]:
# setting values by label and position
# first make a copy, so the assignments in the next cells do not modify df
df12 = df.copy()
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.308022,1.720912,-0.609598,0.596226,11
2022-06-20,1.808241,1.848798,-1.481292,0.238267,12
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14
2022-06-23,1.466746,1.615641,0.572107,-1.913607,15
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335,16
2022-06-25,-0.3171,0.219377,0.056721,-0.801607,17
2022-06-26,1.231746,-0.096526,0.465648,0.857024,18
2022-06-27,-0.843303,0.777268,0.086186,1.215642,19
2022-06-28,1.543246,1.758878,0.755212,0.036198,20


In [46]:
# set a single value by label: row dates[0], column "A"
df12.at[dates[0], "A"] = 0
# set a single value by position: row 0, column 1 (column "B")
df12.iat[0, 1] = 0
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.609598,0.596226,11
2022-06-20,1.808241,1.848798,-1.481292,0.238267,12
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14
2022-06-23,1.466746,1.615641,0.572107,-1.913607,15
2022-06-24,-1.256683,0.288456,-1.366134,-1.20335,16
2022-06-25,-0.3171,0.219377,0.056721,-0.801607,17
2022-06-26,1.231746,-0.096526,0.465648,0.857024,18
2022-06-27,-0.843303,0.777268,0.086186,1.215642,19
2022-06-28,1.543246,1.758878,0.755212,0.036198,20


In [47]:
# Replace a whole column by label with a NumPy array.
# The array is sized from df12 itself (the frame being assigned into), so its
# length always matches the target even if df12 and df differ in row count.
df12.loc[:, "D"] = np.array([5] * len(df12))
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.609598,5,11
2022-06-20,1.808241,1.848798,-1.481292,5,12
2022-06-21,-1.061772,-0.14897,0.219503,5,13
2022-06-22,2.343824,-2.820513,-0.393915,5,14
2022-06-23,1.466746,1.615641,0.572107,5,15
2022-06-24,-1.256683,0.288456,-1.366134,5,16
2022-06-25,-0.3171,0.219377,0.056721,5,17
2022-06-26,1.231746,-0.096526,0.465648,5,18
2022-06-27,-0.843303,0.777268,0.086186,5,19
2022-06-28,1.543246,1.758878,0.755212,5,20


In [48]:
# setting values using a boolean selection (aka a "where" clause):
# every positive entry is replaced by the matching entry of -df12, i.e. its negation
df12[df12 > 0] = -df12
df12

Unnamed: 0,A,B,C,D,F
2022-06-19,0.0,0.0,-0.609598,-5,-11
2022-06-20,-1.808241,-1.848798,-1.481292,-5,-12
2022-06-21,-1.061772,-0.14897,-0.219503,-5,-13
2022-06-22,-2.343824,-2.820513,-0.393915,-5,-14
2022-06-23,-1.466746,-1.615641,-0.572107,-5,-15
2022-06-24,-1.256683,-0.288456,-1.366134,-5,-16
2022-06-25,-0.3171,-0.219377,-0.056721,-5,-17
2022-06-26,-1.231746,-0.096526,-0.465648,-5,-18
2022-06-27,-0.843303,-0.777268,-0.086186,-5,-19
2022-06-28,-1.543246,-1.758878,-0.755212,-5,-20


# Missing Data

### _reindex_
`reindex` changes, adds, or deletes the labels on a specified axis and returns a new dataframe; labels that did not exist before are filled with missing values.

In [49]:
# keep only the first four dates and append a new, initially empty column G
new_columns = [*df.columns, "G"]
df13 = df.reindex(index=dates[:4], columns=new_columns)
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.308022,1.720912,-0.609598,0.596226,11,
2022-06-20,1.808241,1.848798,-1.481292,0.238267,12,
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13,
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14,


### handling missing data
1. _```np.nan```_
1. _```pandas.isna()```_
1. ```df.dropna()```
1. ```df.fillna()```

In [50]:
# Missing data in pandas is represented by np.nan.
# Blank out one value in column A (row position 1) ...
df13.iat[1, 0] = np.nan
# ... and fill column G (position 5) with 0 from row 1 onwards, leaving row 0 missing.
# A literal 0 is used on purpose: np.random.randint(1) samples from [0, 1),
# so it can only ever produce 0 and merely looks random.
df13.iloc[1:, 5] = 0
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.308022,1.720912,-0.609598,0.596226,11,
2022-06-20,,1.848798,-1.481292,0.238267,12,0.0
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13,0.0
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14,0.0


In [51]:
# boolean mask of the same shape as df13: True wherever a value is missing
df131 = df13.isna()
df131

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [52]:
# or just display the mask directly, without storing it in a variable
pd.isna(df13)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,False,False,False,False,False,True
2022-06-20,True,False,False,False,False,False
2022-06-21,False,False,False,False,False,False
2022-06-22,False,False,False,False,False,False


In [53]:
# the original is still there: building the mask did not modify df13
df13

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.308022,1.720912,-0.609598,0.596226,11,
2022-06-20,,1.848798,-1.481292,0.238267,12,0.0
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13,0.0
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14,0.0


In [54]:
# we are going to drop / replace values now, so work on a copy and keep df13 intact
df132 = df13.copy()
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.308022,1.720912,-0.609598,0.596226,11,
2022-06-20,,1.848798,-1.481292,0.238267,12,0.0
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13,0.0
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14,0.0


In [60]:
# drop the rows that have missing data
# how="any" : drop the row (or column) if any NA value is present
# how="all" : drop the row (or column) only if all of its values are NA
# by default dropna returns a new dataframe; inplace=True modifies the current dataframe instead
df_no_na = df132.dropna(how="any")

In [61]:
# every row that contained at least one missing value has been stripped
df_no_na

Unnamed: 0,A,B,C,D,F,G
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13,0.0
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14,0.0


In [57]:
# original still intact: dropna without inplace=True left df132 unchanged
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.308022,1.720912,-0.609598,0.596226,11,
2022-06-20,,1.848798,-1.481292,0.238267,12,0.0
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13,0.0
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14,0.0


In [58]:
# drop missing data from the original: inplace=True modifies df132 itself,
# so the rows with missing values are gone from df132 after this cell
df132.dropna(how="any", inplace=True)
df132

Unnamed: 0,A,B,C,D,F,G
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13,0.0
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14,0.0


In [59]:
# fill missing data: every NaN is replaced by the given value (here pi * 1000)
# the filled frame is only displayed, not assigned back to df133
df133 = df13.copy()
df133.fillna(np.pi*1000)

Unnamed: 0,A,B,C,D,F,G
2022-06-19,0.308022,1.720912,-0.609598,0.596226,11,3141.592654
2022-06-20,3141.592654,1.848798,-1.481292,0.238267,12,0.0
2022-06-21,-1.061772,-0.14897,0.219503,1.196697,13,0.0
2022-06-22,2.343824,-2.820513,-0.393915,0.004902,14,0.0


# Operations on data

## Stats

Operations in general exclude missing data, i.e. NaN values are skipped when a statistic is computed.

In [63]:
# arithmetic mean of each column (reduce along axis 0)
df.mean(axis=0)

A     0.522297
B     0.516332
C    -0.169556
D     0.022639
F    15.500000
dtype: float64

In [64]:
# arithmetic mean of each row (reduce along axis 1)
df.mean(axis=1)

2022-06-19    2.603112
2022-06-20    2.882803
2022-06-21    2.641092
2022-06-22    2.626860
2022-06-23    3.348178
2022-06-24    2.492458
2022-06-25    3.231478
2022-06-26    4.091578
2022-06-27    4.047159
2022-06-28    4.818707
Freq: D, dtype: float64