In [1]:
import numpy as np
import pandas as pd



## object creation

In [3]:
# Create a Series from a list; np.nan marks a missing entry, and pandas
# assigns a default integer index (0..5).
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [9]:
# Create a DataFrame of random values, one row per day for six consecutive days.
dates = pd.date_range(start='20191023', periods=6)
print(dates)

# pd.DataFrame(data, index=<row labels>, columns=<column labels>)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df)

DatetimeIndex(['2019-10-23', '2019-10-24', '2019-10-25', '2019-10-26',
               '2019-10-27', '2019-10-28'],
              dtype='datetime64[ns]', freq='D')
                   a         b         c         d
2019-10-23 -0.417845  1.608733 -0.814029 -0.382326
2019-10-24  0.275593  0.044249  0.342023 -0.072100
2019-10-25 -0.365925  0.183008  1.783170 -0.341942
2019-10-26 -0.863842 -2.388911 -0.202457 -0.841896
2019-10-27 -0.179464 -1.068517  1.434838 -0.992652
2019-10-28  0.449325  0.686399  1.125770  1.873668


In [13]:
# Create a DataFrame from a dict: every key becomes a column. Scalar values
# ('a', 'b', 'f') are repeated for each of the four rows supplied by the
# array-like values ('c', 'd', 'e').
df2 = pd.DataFrame(
    {
        'a': 1,
        'b': pd.Timestamp('20191025'),
        'c': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'd': np.full(4, 3, dtype='int32'),
        'e': pd.Categorical(["test", "train"] * 2),
        'f': 'foo',
    }
)

print(df2)

   a          b    c  d      e    f
0  1 2019-10-25  1.0  3   test  foo
1  1 2019-10-25  1.0  3  train  foo
2  1 2019-10-25  1.0  3   test  foo
3  1 2019-10-25  1.0  3  train  foo


In [16]:
# [3]*4 repeats the value four times; dtype='int32' forces 32-bit integers
np.array([3]*4, dtype='int32')

array([3, 3, 3, 3], dtype=int32)

## viewing data

In [17]:
# first rows of the frame (head() shows five rows when no count is given)
df.head()

Unnamed: 0,a,b,c,d
2019-10-23,-0.417845,1.608733,-0.814029,-0.382326
2019-10-24,0.275593,0.044249,0.342023,-0.0721
2019-10-25,-0.365925,0.183008,1.78317,-0.341942
2019-10-26,-0.863842,-2.388911,-0.202457,-0.841896
2019-10-27,-0.179464,-1.068517,1.434838,-0.992652


In [20]:
# last three rows of the frame
df.tail(3)

Unnamed: 0,a,b,c,d
2019-10-26,-0.863842,-2.388911,-0.202457,-0.841896
2019-10-27,-0.179464,-1.068517,1.434838,-0.992652
2019-10-28,0.449325,0.686399,1.12577,1.873668


In [22]:
# row labels: the DatetimeIndex that was passed as `index=dates`
df.index

DatetimeIndex(['2019-10-23', '2019-10-24', '2019-10-25', '2019-10-26',
               '2019-10-27', '2019-10-28'],
              dtype='datetime64[ns]', freq='D')

In [23]:
# column labels
df.columns

Index(['a', 'b', 'c', 'd'], dtype='object')

In [25]:
# the frame's values as a NumPy array; every column is float, so the array is a plain float array
df.to_numpy()

array([[-0.4178448 ,  1.60873258, -0.81402923, -0.38232638],
       [ 0.27559321,  0.04424864,  0.34202265, -0.07210047],
       [-0.36592485,  0.18300844,  1.78317044, -0.34194197],
       [-0.86384225, -2.38891148, -0.2024566 , -0.84189566],
       [-0.17946364, -1.0685172 ,  1.43483809, -0.99265234],
       [ 0.44932517,  0.68639919,  1.12577004,  1.87366752]])

In [26]:
# df2 mixes column dtypes, so the only common array dtype is object
df2.to_numpy()

array([[1, Timestamp('2019-10-25 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1, Timestamp('2019-10-25 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1, Timestamp('2019-10-25 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1, Timestamp('2019-10-25 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

***Note:*** DataFrame.to_numpy() does not include the index or column labels in the output.

### describe() shows a quick statistical summary of the data


In [28]:
# per-column count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,a,b,c,d
count,6.0,6.0,6.0,6.0
mean,-0.183693,-0.15584,0.611553,-0.126208
std,0.482144,1.399823,1.008211,1.037063
min,-0.863842,-2.388911,-0.814029,-0.992652
25%,-0.404865,-0.790326,-0.066337,-0.727003
50%,-0.272694,0.113629,0.733896,-0.362134
75%,0.161829,0.560552,1.357571,-0.139561
max,0.449325,1.608733,1.78317,1.873668


### df.T transposing data

In [30]:
# transpose: the columns become rows and the dates become column labels
df.T

Unnamed: 0,2019-10-23 00:00:00,2019-10-24 00:00:00,2019-10-25 00:00:00,2019-10-26 00:00:00,2019-10-27 00:00:00,2019-10-28 00:00:00
a,-0.417845,0.275593,-0.365925,-0.863842,-0.179464,0.449325
b,1.608733,0.044249,0.183008,-2.388911,-1.068517,0.686399
c,-0.814029,0.342023,1.78317,-0.202457,1.434838,1.12577
d,-0.382326,-0.0721,-0.341942,-0.841896,-0.992652,1.873668


### sorting by values

In [32]:
# reorder the rows by the values in column 'c' (ascending by default)
df.sort_values(by='c')

Unnamed: 0,a,b,c,d
2019-10-23,-0.417845,1.608733,-0.814029,-0.382326
2019-10-26,-0.863842,-2.388911,-0.202457,-0.841896
2019-10-24,0.275593,0.044249,0.342023,-0.0721
2019-10-28,0.449325,0.686399,1.12577,1.873668
2019-10-27,-0.179464,-1.068517,1.434838,-0.992652
2019-10-25,-0.365925,0.183008,1.78317,-0.341942


## Selection

### getting

In [36]:
# selecting a single column yields a Series
print(df['a'])

2019-10-23   -0.417845
2019-10-24    0.275593
2019-10-25   -0.365925
2019-10-26   -0.863842
2019-10-27   -0.179464
2019-10-28    0.449325
Freq: D, Name: a, dtype: float64


In [39]:
# passing a slice to [] selects rows (a single label would select a column)
# positional slice: rows at positions 0, 1 and 2 (the end position is excluded)
print(df[0:3])
# label slice on the DatetimeIndex: both endpoint dates are included
print(df['20191024':'20191027'])

                   a         b         c         d
2019-10-23 -0.417845  1.608733 -0.814029 -0.382326
2019-10-24  0.275593  0.044249  0.342023 -0.072100
2019-10-25 -0.365925  0.183008  1.783170 -0.341942
                   a         b         c         d
2019-10-24  0.275593  0.044249  0.342023 -0.072100
2019-10-25 -0.365925  0.183008  1.783170 -0.341942
2019-10-26 -0.863842 -2.388911 -0.202457 -0.841896
2019-10-27 -0.179464 -1.068517  1.434838 -0.992652


### selection by label
`loc`: selecting rows and columns by label

In [57]:
# print the whole frame for reference before selecting from it
print(df)

                   a         b         c         d
2019-10-23 -0.417845  1.608733 -0.814029 -0.382326
2019-10-24  0.275593  0.044249  0.342023 -0.072100
2019-10-25 -0.365925  0.183008  1.783170 -0.341942
2019-10-26 -0.863842 -2.388911 -0.202457 -0.841896
2019-10-27 -0.179464 -1.068517  1.434838 -0.992652
2019-10-28  0.449325  0.686399  1.125770  1.873668


In [58]:
# the row labelled with the first date, returned as a Series indexed by column name
df.loc[dates[0]]


a   -0.417845
b    1.608733
c   -0.814029
d   -0.382326
Name: 2019-10-23 00:00:00, dtype: float64

In [59]:
# selecting on both axes by label: all rows (:), columns 'a' and 'b'
df.loc[:, ['a','b']]

Unnamed: 0,a,b
2019-10-23,-0.417845,1.608733
2019-10-24,0.275593,0.044249
2019-10-25,-0.365925,0.183008
2019-10-26,-0.863842,-2.388911
2019-10-27,-0.179464,-1.068517
2019-10-28,0.449325,0.686399


In [61]:
# note: with label slices both endpoints are included (unlike positional slices)
df.loc['20191024':'20191027', ['a', 'b']]

Unnamed: 0,a,b
2019-10-24,0.275593,0.044249
2019-10-25,-0.365925,0.183008
2019-10-26,-0.863842,-2.388911
2019-10-27,-0.179464,-1.068517


### getting a scalar value


In [63]:
# one row label plus one column label yields a single scalar
df.loc[dates[0], 'a']


-0.41784480220906567

In [65]:
# .at is the dedicated label-based accessor for one scalar; same value as the .loc lookup
df.at[dates[0], 'a']

-0.41784480220906567

### selection by position
`iloc`: selecting by integer position (0-based), not by index label

In [67]:
# the row at position 3 (positions are 0-based, so this is the fourth row)
df.iloc[3]

a   -0.863842
b   -2.388911
c   -0.202457
d   -0.841896
Name: 2019-10-26 00:00:00, dtype: float64

In [69]:
# rows at positions 3-4 and columns at positions 0-1 (slice ends are excluded)
df.iloc[3:5,0:2]

Unnamed: 0,a,b
2019-10-26,-0.863842,-2.388911
2019-10-27,-0.179464,-1.068517


In [70]:
# lists of integer positions pick arbitrary rows and columns
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,a,c
2019-10-24,0.275593,0.342023
2019-10-25,-0.365925,1.78317
2019-10-27,-0.179464,1.434838


In [72]:
df.iloc[1:3, :]
# the end position is not included: only the rows at positions 1 and 2 are returned

Unnamed: 0,a,b,c,d
2019-10-24,0.275593,0.044249,0.342023,-0.0721
2019-10-25,-0.365925,0.183008,1.78317,-0.341942


slicing columns explicitly


In [74]:
# all rows, columns at positions 2 and 3
df.iloc[:, 2:4]

Unnamed: 0,c,d
2019-10-23,-0.814029,-0.382326
2019-10-24,0.342023,-0.0721
2019-10-25,1.78317,-0.341942
2019-10-26,-0.202457,-0.841896
2019-10-27,1.434838,-0.992652
2019-10-28,1.12577,1.873668


## boolean indexing

In [76]:
df.a # equivalently df['a']

2019-10-23   -0.417845
2019-10-24    0.275593
2019-10-25   -0.365925
2019-10-26   -0.863842
2019-10-27   -0.179464
2019-10-28    0.449325
Freq: D, Name: a, dtype: float64

In [87]:
# element-wise comparison produces a boolean Series with the same index
df['a'] > 0

2019-10-23    False
2019-10-24     True
2019-10-25    False
2019-10-26    False
2019-10-27    False
2019-10-28     True
Freq: D, Name: a, dtype: bool

In [89]:
# keep only the rows where column 'a' is positive
df[df['a']>0]

Unnamed: 0,a,b,c,d
2019-10-24,0.275593,0.044249,0.342023,-0.0721
2019-10-28,0.449325,0.686399,1.12577,1.873668


In [90]:
# a boolean frame used as a mask: values that fail the condition become NaN
df[df>0]

Unnamed: 0,a,b,c,d
2019-10-23,,1.608733,,
2019-10-24,0.275593,0.044249,0.342023,
2019-10-25,,0.183008,1.78317,
2019-10-26,,,,
2019-10-27,,,1.434838,
2019-10-28,0.449325,0.686399,1.12577,1.873668


### using the isin() method for filtering 

In [96]:
# NOTE: this rebinds df2 (earlier it held the frame built from a dict)
# work on a copy so that adding a column leaves df untouched
df2 = df.copy()
# new column 'e': one string per row
df2['e'] = ['one', 'two', 'three', 'four', 'five', 'six']
df2

Unnamed: 0,a,b,c,d,e
2019-10-23,-0.417845,1.608733,-0.814029,-0.382326,one
2019-10-24,0.275593,0.044249,0.342023,-0.0721,two
2019-10-25,-0.365925,0.183008,1.78317,-0.341942,three
2019-10-26,-0.863842,-2.388911,-0.202457,-0.841896,four
2019-10-27,-0.179464,-1.068517,1.434838,-0.992652,five
2019-10-28,0.449325,0.686399,1.12577,1.873668,six


In [99]:
# keep the rows whose 'e' value is one of the listed values
df2[df2['e'].isin(['two', 'four'])]

Unnamed: 0,a,b,c,d,e
2019-10-24,0.275593,0.044249,0.342023,-0.0721,two
2019-10-26,-0.863842,-2.388911,-0.202457,-0.841896,four


## setting

In [102]:
# An integer Series (1..6) labelled by six consecutive days starting 2019-11-11.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20191111', periods=6),
)
print(s1)

2019-11-11    1
2019-11-12    2
2019-11-13    3
2019-11-14    4
2019-11-15    5
2019-11-16    6
Freq: D, dtype: int64


### setting values by label


In [104]:
# set a single cell, addressed by row label and column label (modifies df in place)
df.at[dates[0], 'a'] = 0

In [105]:
# display df to confirm the assignment
df

Unnamed: 0,a,b,c,d
2019-10-23,0.0,1.608733,-0.814029,-0.382326
2019-10-24,0.275593,0.044249,0.342023,-0.0721
2019-10-25,-0.365925,0.183008,1.78317,-0.341942
2019-10-26,-0.863842,-2.388911,-0.202457,-0.841896
2019-10-27,-0.179464,-1.068517,1.434838,-0.992652
2019-10-28,0.449325,0.686399,1.12577,1.873668


### setting values by position

In [106]:
# set a single cell by integer position: row 0, column 1 (column 'b')
df.iat[0,1] = 0

In [107]:
# display df to confirm the assignment
df

Unnamed: 0,a,b,c,d
2019-10-23,0.0,0.0,-0.814029,-0.382326
2019-10-24,0.275593,0.044249,0.342023,-0.0721
2019-10-25,-0.365925,0.183008,1.78317,-0.341942
2019-10-26,-0.863842,-2.388911,-0.202457,-0.841896
2019-10-27,-0.179464,-1.068517,1.434838,-0.992652
2019-10-28,0.449325,0.686399,1.12577,1.873668


### setting by assigning with a numpy array

In [113]:
# overwrite every row of column 'd' with an array of 5s (one value per row)
# NOTE(review): the resulting dtype depends on the pandas version. pandas >= 2.0
# documents that `df.loc[:, col] = values` writes into the existing column in
# place, so 'd' would stay float there; `df['d'] = ...` replaces the column
# instead - confirm which behavior is intended.
df.loc[:, 'd'] = np.array([5] * len(df))

In [114]:
# display df to confirm the assignment
df

Unnamed: 0,a,b,c,d
2019-10-23,0.0,0.0,-0.814029,5
2019-10-24,0.275593,0.044249,0.342023,5
2019-10-25,-0.365925,0.183008,1.78317,5
2019-10-26,-0.863842,-2.388911,-0.202457,5
2019-10-27,-0.179464,-1.068517,1.434838,5
2019-10-28,0.449325,0.686399,1.12577,5


## missing data
### df.reindex()

In [118]:
# reindex returns a new frame: keep the first four dates and add a column 'e'
# that does not exist in df, so it starts out entirely missing (NaN)
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'e'])
# fill 'e' for the first two dates only (label slices include both endpoints)
df1.loc[dates[0]:dates[1], 'e'] = 1
df1

Unnamed: 0,a,b,c,d,e
2019-10-23,0.0,0.0,-0.814029,5,1.0
2019-10-24,0.275593,0.044249,0.342023,5,1.0
2019-10-25,-0.365925,0.183008,1.78317,5,
2019-10-26,-0.863842,-2.388911,-0.202457,5,


In [119]:
# drop every row that contains at least one missing value (returns a new frame)
df1.dropna(how='any')

Unnamed: 0,a,b,c,d,e
2019-10-23,0.0,0.0,-0.814029,5,1.0
2019-10-24,0.275593,0.044249,0.342023,5,1.0


In [122]:
# drop only the rows in which ALL values are missing; no row of df1 is
# entirely missing, so every row is kept here
df1.dropna(how='all')

Unnamed: 0,a,b,c,d,e
2019-10-23,0.0,0.0,-0.814029,5,1.0
2019-10-24,0.275593,0.044249,0.342023,5,1.0
2019-10-25,-0.365925,0.183008,1.78317,5,
2019-10-26,-0.863842,-2.388911,-0.202457,5,


In [127]:
# filling missing data: replace every NaN with the given value (returns a new frame)
# NOTE: this rebinds df2; the fill value is a string, so column 'e' ends up
# holding a mix of numbers and the text 'missing'
df2 = df1.fillna(value='missing')
print(df2)
# to inspect only the filled column: df2.loc[:,'e']

                   a         b         c  d        e
2019-10-23  0.000000  0.000000 -0.814029  5        1
2019-10-24  0.275593  0.044249  0.342023  5        1
2019-10-25 -0.365925  0.183008  1.783170  5  missing
2019-10-26 -0.863842 -2.388911 -0.202457  5  missing


In [128]:
# boolean mask with the same shape as df1: True wherever a value is missing (NaN)
pd.isna(df1)

Unnamed: 0,a,b,c,d,e
2019-10-23,False,False,False,False,False
2019-10-24,False,False,False,False,False
2019-10-25,False,False,False,False,True
2019-10-26,False,False,False,False,True


## operations

### stats

In [135]:
# Descriptive statistics: the full summary, then the mean of each column
# (default axis), the mean of each row (axis=1), and the mean of each column
# again with the axis spelled out (axis=0). A line of 50 asterisks separates
# the four printouts.
print(
    df.describe(),
    df.mean(),
    df.mean(axis=1),
    df.mean(axis=0),
    sep='\n' + '*' * 50 + '\n',
)



              a         b         c    d
count  6.000000  6.000000  6.000000  6.0
mean  -0.114052 -0.423962  0.611553  5.0
std    0.471620  1.120425  1.008211  0.0
min   -0.863842 -2.388911 -0.814029  5.0
25%   -0.319310 -0.801388 -0.066337  5.0
50%   -0.089732  0.022124  0.733896  5.0
75%    0.206695  0.148318  1.357571  5.0
max    0.449325  0.686399  1.783170  5.0
**************************************************
a   -0.114052
b   -0.423962
c    0.611553
d    5.000000
dtype: float64
**************************************************
2019-10-23    1.046493
2019-10-24    1.415466
2019-10-25    1.650064
2019-10-26    0.386197
2019-10-27    1.296714
2019-10-28    1.815374
Freq: D, dtype: float64
**************************************************
a   -0.114052
b   -0.423962
c    0.611553
d    5.000000
dtype: float64


In [147]:
# NOTE: this rebinds s; the new Series shares df's row labels (dates) and has
# one missing value
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8], index=dates)
s



2019-10-23    1.0
2019-10-24    3.0
2019-10-25    5.0
2019-10-26    NaN
2019-10-27    6.0
2019-10-28    8.0
Freq: D, dtype: float64

In [151]:
# subtract s from every column, matching on the row index; the date where s
# is NaN produces an all-NaN row
print(df.sub(s, axis='index'))
# sub() returned a new frame: df itself is unchanged
df

                   a         b         c    d
2019-10-23 -1.000000 -1.000000 -1.814029  4.0
2019-10-24 -2.724407 -2.955751 -2.657977  2.0
2019-10-25 -5.365925 -4.816992 -3.216830  0.0
2019-10-26       NaN       NaN       NaN  NaN
2019-10-27 -6.179464 -7.068517 -4.565162 -1.0
2019-10-28 -7.550675 -7.313601 -6.874230 -3.0


Unnamed: 0,a,b,c,d
2019-10-23,0.0,0.0,-0.814029,5
2019-10-24,0.275593,0.044249,0.342023,5
2019-10-25,-0.365925,0.183008,1.78317,5
2019-10-26,-0.863842,-2.388911,-0.202457,5
2019-10-27,-0.179464,-1.068517,1.434838,5
2019-10-28,0.449325,0.686399,1.12577,5


### apply
applying functions to the data

In [153]:
# apply the function to each column: running total down the rows
df.apply(np.cumsum)


Unnamed: 0,a,b,c,d
2019-10-23,0.0,0.0,-0.814029,5
2019-10-24,0.275593,0.044249,-0.472007,10
2019-10-25,-0.090332,0.227257,1.311164,15
2019-10-26,-0.954174,-2.161654,1.108707,20
2019-10-27,-1.133638,-3.230172,2.543545,25
2019-10-28,-0.684312,-2.543772,3.669315,30


In [154]:
# each column is passed to the lambda as x; the result is that column's range (max - min)
df.apply(lambda x:x.max() - x.min())

a    1.313167
b    3.075311
c    2.597200
d    0.000000
dtype: float64

### histogramming

In [155]:
# Ten random integers from 0 to 6 (randint's upper bound, 7, is exclusive).
# NOTE: this rebinds s once more.
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    6
1    2
2    4
3    1
4    3
5    6
6    4
7    0
8    1
9    3
dtype: int64

In [156]:
# how often each distinct value occurs, most frequent first
s.value_counts()

6    2
4    2
3    2
1    2
2    1
0    1
dtype: int64