In [1]:
import numpy as np
import pandas as pd

### Object Creation

Create a *Series* by passeing a list of values, letting pandas create a default integer index

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Create a *DataFrame* by passing a numpy array, with datetime index and columns

In [3]:
dates = pd.date_range("20190101",periods=6)
print(dates)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))
df

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,A,B,C,D
2019-01-01,1.253128,1.378071,1.699459,0.646954
2019-01-02,0.502272,0.776909,0.646698,0.098734
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921
2019-01-04,-1.524455,0.319465,-0.592027,1.015569
2019-01-05,-0.211925,-0.106003,-0.272426,-0.045095
2019-01-06,1.839047,1.829464,-0.302522,1.089911


Create a *DataFrame* by passing a dict

In [4]:
df_ = pd.DataFrame({"A":1.0,
                    "B":pd.Timestamp("20190920"),
                    "C":pd.Series(1,index=list(range(4)),dtype="float32"),
                    "D":np.array([3]*4,dtype="int32"),
                    "E":pd.Categorical(["test","train","test","train"]),
                    "F":"foo"
                   })
print(df_.dtypes)
df_

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


Unnamed: 0,A,B,C,D,E,F
0,1.0,2019-09-20,1.0,3,test,foo
1,1.0,2019-09-20,1.0,3,train,foo
2,1.0,2019-09-20,1.0,3,test,foo
3,1.0,2019-09-20,1.0,3,train,foo


In [5]:
print(df_.A)
print("---------------------------------")
print(df_.columns)
print("---------------------------------")
print(df_.compound)
print("---------------------------------")

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64
---------------------------------
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
---------------------------------
<bound method NDFrame._add_numeric_operations.<locals>.compound of      A          B    C  D      E    F
0  1.0 2019-09-20  1.0  3   test  foo
1  1.0 2019-09-20  1.0  3  train  foo
2  1.0 2019-09-20  1.0  3   test  foo
3  1.0 2019-09-20  1.0  3  train  foo>
---------------------------------


### Viewing Data

In [6]:
df

Unnamed: 0,A,B,C,D
2019-01-01,1.253128,1.378071,1.699459,0.646954
2019-01-02,0.502272,0.776909,0.646698,0.098734
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921
2019-01-04,-1.524455,0.319465,-0.592027,1.015569
2019-01-05,-0.211925,-0.106003,-0.272426,-0.045095
2019-01-06,1.839047,1.829464,-0.302522,1.089911


In [7]:
df.head(2)

Unnamed: 0,A,B,C,D
2019-01-01,1.253128,1.378071,1.699459,0.646954
2019-01-02,0.502272,0.776909,0.646698,0.098734


In [8]:
df.tail(3)

Unnamed: 0,A,B,C,D
2019-01-04,-1.524455,0.319465,-0.592027,1.015569
2019-01-05,-0.211925,-0.106003,-0.272426,-0.045095
2019-01-06,1.839047,1.829464,-0.302522,1.089911


In [9]:
df.index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

DataFrame.to_numpy() creates a numpy representation of the pandas data

In [11]:
df.to_numpy()

array([[ 1.25312782,  1.37807137,  1.69945863,  0.64695353],
       [ 0.50227197,  0.77690884,  0.64669828,  0.09873419],
       [ 0.19493712, -1.12259377, -0.61538643, -0.35792108],
       [-1.52445534,  0.31946454, -0.59202728,  1.0155694 ],
       [-0.21192532, -0.10600342, -0.27242587, -0.04509469],
       [ 1.83904709,  1.82946373, -0.30252224,  1.08991107]])

DataFrame.describe() shows the statistic summary of the data

In [12]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.342167,0.512552,0.093966,0.408025
std,1.175509,1.062719,0.91086,0.596459
min,-1.524455,-1.122594,-0.615386,-0.357921
25%,-0.11021,0.000364,-0.519651,-0.009137
50%,0.348605,0.548187,-0.287474,0.372844
75%,1.065414,1.227781,0.416917,0.923415
max,1.839047,1.829464,1.699459,1.089911


Transposing data is easy as matrix

In [13]:
df.T

Unnamed: 0,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06
A,1.253128,0.502272,0.194937,-1.524455,-0.211925,1.839047
B,1.378071,0.776909,-1.122594,0.319465,-0.106003,1.829464
C,1.699459,0.646698,-0.615386,-0.592027,-0.272426,-0.302522
D,0.646954,0.098734,-0.357921,1.015569,-0.045095,1.089911


Sorting data by axis

In [14]:
df.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D
2019-01-06,1.839047,1.829464,-0.302522,1.089911
2019-01-05,-0.211925,-0.106003,-0.272426,-0.045095
2019-01-04,-1.524455,0.319465,-0.592027,1.015569
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921
2019-01-02,0.502272,0.776909,0.646698,0.098734
2019-01-01,1.253128,1.378071,1.699459,0.646954


In [15]:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2019-01-01,0.646954,1.699459,1.378071,1.253128
2019-01-02,0.098734,0.646698,0.776909,0.502272
2019-01-03,-0.357921,-0.615386,-1.122594,0.194937
2019-01-04,1.015569,-0.592027,0.319465,-1.524455
2019-01-05,-0.045095,-0.272426,-0.106003,-0.211925
2019-01-06,1.089911,-0.302522,1.829464,1.839047


Sorting by values

In [16]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921
2019-01-05,-0.211925,-0.106003,-0.272426,-0.045095
2019-01-04,-1.524455,0.319465,-0.592027,1.015569
2019-01-02,0.502272,0.776909,0.646698,0.098734
2019-01-01,1.253128,1.378071,1.699459,0.646954
2019-01-06,1.839047,1.829464,-0.302522,1.089911


### Selection

- **Getting**

In [17]:
df["A"]
# select a single column

2019-01-01    1.253128
2019-01-02    0.502272
2019-01-03    0.194937
2019-01-04   -1.524455
2019-01-05   -0.211925
2019-01-06    1.839047
Freq: D, Name: A, dtype: float64

In [18]:
df[0:3]
# select rows

Unnamed: 0,A,B,C,D
2019-01-01,1.253128,1.378071,1.699459,0.646954
2019-01-02,0.502272,0.776909,0.646698,0.098734
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921


- **Selection by label**

In [19]:
df.loc[dates[0]]

A    1.253128
B    1.378071
C    1.699459
D    0.646954
Name: 2019-01-01 00:00:00, dtype: float64

In [20]:
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2019-01-01,1.253128,1.378071
2019-01-02,0.502272,0.776909
2019-01-03,0.194937,-1.122594
2019-01-04,-1.524455,0.319465
2019-01-05,-0.211925,-0.106003
2019-01-06,1.839047,1.829464


In [21]:
df.loc["20190102":"20190104",["B","C"]]

Unnamed: 0,B,C
2019-01-02,0.776909,0.646698
2019-01-03,-1.122594,-0.615386
2019-01-04,0.319465,-0.592027


- **Selection by position**

In [22]:
df.iloc[3]

A   -1.524455
B    0.319465
C   -0.592027
D    1.015569
Name: 2019-01-04 00:00:00, dtype: float64

In [23]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2019-01-04,-1.524455,0.319465
2019-01-05,-0.211925,-0.106003


In [24]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2019-01-02,0.502272,0.646698
2019-01-03,0.194937,-0.615386
2019-01-05,-0.211925,-0.272426


In [25]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2019-01-02,0.502272,0.776909,0.646698,0.098734
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921


In [26]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2019-01-01,1.378071,1.699459
2019-01-02,0.776909,0.646698
2019-01-03,-1.122594,-0.615386
2019-01-04,0.319465,-0.592027
2019-01-05,-0.106003,-0.272426
2019-01-06,1.829464,-0.302522


In [27]:
df.iloc[1,1]

0.7769088407072803

In [28]:
# fast access to a sccalar, equivalent to the prior method
df.iat[1,1]

0.7769088407072803

- **Boolean indexing**

In [29]:
# Using a single column's values to select data
df[df.A>0]

Unnamed: 0,A,B,C,D
2019-01-01,1.253128,1.378071,1.699459,0.646954
2019-01-02,0.502272,0.776909,0.646698,0.098734
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921
2019-01-06,1.839047,1.829464,-0.302522,1.089911


In [30]:
# Selecting values from a DataFrame where a boolean condition is met
df[df>0]

Unnamed: 0,A,B,C,D
2019-01-01,1.253128,1.378071,1.699459,0.646954
2019-01-02,0.502272,0.776909,0.646698,0.098734
2019-01-03,0.194937,,,
2019-01-04,,0.319465,,1.015569
2019-01-05,,,,
2019-01-06,1.839047,1.829464,,1.089911


In [31]:
# Using isin() method for filtering
df2 = df.copy()
df2["E"] = ["one","one","two","three","four","three"]
df2

Unnamed: 0,A,B,C,D,E
2019-01-01,1.253128,1.378071,1.699459,0.646954,one
2019-01-02,0.502272,0.776909,0.646698,0.098734,one
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921,two
2019-01-04,-1.524455,0.319465,-0.592027,1.015569,three
2019-01-05,-0.211925,-0.106003,-0.272426,-0.045095,four
2019-01-06,1.839047,1.829464,-0.302522,1.089911,three


In [32]:
df2[df2["E"].isin(["two","four"])]

Unnamed: 0,A,B,C,D,E
2019-01-03,0.194937,-1.122594,-0.615386,-0.357921,two
2019-01-05,-0.211925,-0.106003,-0.272426,-0.045095,four


- **Setting**

In [40]:
# Setting a new column automatically aligns the data by the indexes
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range("20190102",periods=6))
df["F"]=s1
s1

2019-01-02    1
2019-01-03    2
2019-01-04    3
2019-01-05    4
2019-01-06    5
2019-01-07    6
Freq: D, dtype: int64

In [41]:
# Setting values by label
df.at[dates[0],"A"] = 0

# Setting values by position
df.iat[0,1] = 0

# Setting by assigning with a NumPy array
df.loc[:,"D"] = np.array([5]*len(df))

df

Unnamed: 0,A,B,C,D,F
2019-01-01,0.0,0.0,1.699459,5,
2019-01-02,0.502272,0.776909,0.646698,5,1.0
2019-01-03,0.194937,-1.122594,-0.615386,5,2.0
2019-01-04,-1.524455,0.319465,-0.592027,5,3.0
2019-01-05,-0.211925,-0.106003,-0.272426,5,4.0
2019-01-06,1.839047,1.829464,-0.302522,5,5.0


### Missing Data

In [42]:
# pandas uses np.nan to represent missing data
df1 = df.reindex(index = dates[0:4],columns = list(df.columns)+["E"])
df1.loc[dates[0]:dates[1],"E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2019-01-01,0.0,0.0,1.699459,5,,1.0
2019-01-02,0.502272,0.776909,0.646698,5,1.0,1.0
2019-01-03,0.194937,-1.122594,-0.615386,5,2.0,
2019-01-04,-1.524455,0.319465,-0.592027,5,3.0,


In [43]:
# To drop any row w/ missing data
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2019-01-02,0.502272,0.776909,0.646698,5,1.0,1.0


In [44]:
# Filling missing data
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2019-01-01,0.0,0.0,1.699459,5,5.0,1.0
2019-01-02,0.502272,0.776909,0.646698,5,1.0,1.0
2019-01-03,0.194937,-1.122594,-0.615386,5,2.0,5.0
2019-01-04,-1.524455,0.319465,-0.592027,5,3.0,5.0


In [45]:
# To get the boolean mask where values are nan
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2019-01-01,False,False,False,False,True,False
2019-01-02,False,False,False,False,False,False
2019-01-03,False,False,False,False,False,True
2019-01-04,False,False,False,False,False,True


### Operations

- **Stats**

In [46]:
df.mean()

A    0.133313
B    0.282873
C    0.093966
D    5.000000
F    3.000000
dtype: float64

In [47]:
# same operation on the other axis
df.mean(1)

2019-01-01    1.674865
2019-01-02    1.585176
2019-01-03    1.091391
2019-01-04    1.240596
2019-01-05    1.681929
2019-01-06    2.673198
Freq: D, dtype: float64

In [48]:
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
s

2019-01-01    NaN
2019-01-02    NaN
2019-01-03    1.0
2019-01-04    3.0
2019-01-05    5.0
2019-01-06    NaN
Freq: D, dtype: float64

In [49]:
df.sub(s,axis="index")

Unnamed: 0,A,B,C,D,F
2019-01-01,,,,,
2019-01-02,,,,,
2019-01-03,-0.805063,-2.122594,-1.615386,4.0,1.0
2019-01-04,-4.524455,-2.680535,-3.592027,2.0,0.0
2019-01-05,-5.211925,-5.106003,-5.272426,0.0,-1.0
2019-01-06,,,,,


Operating with objects that have different dimensionality and need alignment. Also, pandas automatically broadcasts along the secific dimension