In [1]:
# The pandas library also provides a set of common mathematical and statistical methods
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [3]:
# Demo frame used throughout: four labelled rows, two numeric columns, and one
# row ('b') that is entirely NaN so missing-value handling shows up in each example.
frame_obj = DataFrame(
    data=[
        [2, 4.3],
        [np.nan, np.nan],
        [-3.1, 5],
        [11, 5.6],
    ],
    index=list("abcd"),
    columns=["one", "two"],
)
frame_obj

Unnamed: 0,one,two
a,2.0,4.3
b,,
c,-3.1,5.0
d,11.0,5.6


In [18]:
# Summary statistics / reduction methods
# Summation: sum
frame_obj.sum()  # one total per column (axis=0 is the default); NaN entries are skipped

one     9.9
two    14.9
dtype: float64

In [5]:
# By default the axis parameter is 0 (one result per column);
# passing axis=1 sums across each row instead.
frame_obj.sum(axis = 1)

a     6.3
b     0.0
c     1.9
d    16.6
dtype: float64

In [6]:
# NaN values are skipped by default, which is why the all-NaN row 'b' summed to 0.0 above.
# To disable this behaviour we have the skipna argument: with skipna=False a NaN propagates to the result.
frame_obj.sum(axis=1, skipna=False)

a     6.3
b     NaN
c     1.9
d    16.6
dtype: float64

In [7]:
# Arithmetic mean of each column (NaN entries skipped): mean
frame_obj.mean()

one    3.300000
two    4.966667
dtype: float64

In [9]:
# Row-wise mean; the all-NaN row 'b' has no values to average, so its result is NaN.
frame_obj.mean(axis=1)

a    3.15
b     NaN
c    0.95
d    8.30
dtype: float64

In [25]:
# Number of non-NaN values: count (axis=1 counts within each row)
frame_obj.count(axis=1)

a    2
b    0
c    2
d    2
dtype: int64

In [38]:
# Maximum value: max (row-wise here; skipna=False lets a NaN in a row propagate to its result)
frame_obj.max(axis = 1, skipna = False)

a     4.3
b     NaN
c     5.0
d    11.0
dtype: float64

In [39]:
# Minimum value of each column (NaN entries skipped): min
frame_obj.min()

one   -3.1
two    4.3
dtype: float64

In [26]:
# Median of each column (NaN entries skipped): median
frame_obj.median()

one    2.0
two    5.0
dtype: float64

In [27]:
# Mean absolute deviation (around the column mean).
# DataFrame.mad() was deprecated in pandas 1.5 and removed in pandas 2.0, so the
# statistic is spelled out explicitly; the values are the same as the old mad().
(frame_obj - frame_obj.mean()).abs().mean()

one    5.133333
two    0.444444
dtype: float64

In [28]:
# Sample variance: var (axis=1 computes it across the values of each row)
frame_obj.var(axis =1)

a     2.645
b       NaN
c    32.805
d    14.580
dtype: float64

In [29]:
# Sample standard deviation: std (axis=1 computes it across the values of each row)
frame_obj.std(axis=1)

a    1.626346
b         NaN
c    5.727565
d    3.818377
dtype: float64

In [30]:
# Sample skewness of each column: skew
frame_obj.skew()

one    0.792236
two   -0.229937
dtype: float64

In [32]:
# Kurtosis: kurt
# Every result is NaN here: with axis=1 each row contributes at most two values,
# which is too few observations for a kurtosis estimate.
frame_obj.kurt(axis=1)

a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64

In [19]:
# Indirect statistics (report where a value is located rather than the value itself)
# Index of maximum: idxmax
frame_obj.idxmax()  # for each column, returns the row label of the maximum rather than the maximum value

one    d
two    d
dtype: object

In [20]:
# Row-wise: the column label holding each row's maximum.
# Row 'b' is entirely NaN; newer pandas releases deprecate (and are slated to
# reject) idxmax on all-NaN input, so such rows are dropped before the lookup
# and then restored as NaN by reindexing against the original row labels.
frame_obj.dropna(how='all').idxmax(axis=1).reindex(frame_obj.index)

a    two
b    NaN
c    two
d    one
dtype: object

In [21]:
# Index of minimum: idxmin (row-wise: the column label holding each row's minimum,
# not the minimum value itself).
# Row 'b' is entirely NaN; newer pandas releases deprecate (and are slated to
# reject) idxmin on NaN-containing input when skipna=False, so all-NaN rows are
# dropped before the lookup and then restored as NaN by reindexing.
frame_obj.dropna(how='all').idxmin(axis=1, skipna=False).reindex(frame_obj.index)

a    one
b    NaN
c    one
d    two
dtype: object

In [35]:
# Integer position of maximum: argmax (called on the underlying NumPy array, flattened)
# Caveat: NumPy's argmax does not skip NaN, so the result below (2) is the flat
# position of the first NaN, not of the largest number; np.nanargmax ignores NaN.
frame_obj.values.argmax()

2

In [36]:
# Integer position of minimum: argmin (axis=1 gives one column position per row)
# Caveat: NumPy's argmin does not skip NaN, so the entry for the all-NaN row 'b'
# is the position of a NaN rather than of a real minimum.
frame_obj.values.argmin(axis=1)

array([0, 0, 0, 1], dtype=int32)

In [22]:
# Accumulation methods
# Cumulative sum: cumsum (NaN cells stay NaN in the output, but the running total carries on past them)
frame_obj.cumsum()

Unnamed: 0,one,two
a,2.0,4.3
b,,
c,-1.1,9.3
d,9.9,14.9


In [41]:
# Quantile value: quantile (with no argument it returns the 0.5 quantile, i.e. the median, of each column)
frame_obj.quantile()

one    2.0
two    5.0
Name: 0.5, dtype: float64

In [42]:
# Cumulative minimum: cummin (smallest value seen so far, going down each column)
frame_obj.cummin()

Unnamed: 0,one,two
a,2.0,4.3
b,,
c,-3.1,4.3
d,-3.1,4.3


In [43]:
# Cumulative maximum: cummax (largest value seen so far, going down each column)
frame_obj.cummax()

Unnamed: 0,one,two
a,2.0,4.3
b,,
c,2.0,5.0
d,11.0,5.6


In [44]:
# Cumulative product: cumprod (running product going down each column)
frame_obj.cumprod()

Unnamed: 0,one,two
a,2.0,4.3
b,,
c,-6.2,21.5
d,-68.2,120.4


In [45]:
# First-order arithmetic difference: diff (each row minus the previous row)
# A result is NaN whenever either of the two rows involved is NaN, so only row 'd' has values here.
frame_obj.diff()

Unnamed: 0,one,two
a,,
b,,
c,,
d,14.1,0.6


In [46]:
# Percentage change between consecutive rows: pct_change
# Missing values are forward-filled explicitly (so row 'b' borrows row 'a' and
# shows a 0.0 change) and fill_method=None is passed, because relying on
# pct_change's implicit forward fill is deprecated in recent pandas releases.
frame_obj.ffill().pct_change(fill_method=None)

Unnamed: 0,one,two
a,,
b,0.0,0.0
c,-2.55,0.162791
d,-4.548387,0.12


In [47]:
# Neither reductions nor accumulations: multi-summary statistical methods
# describe produces several summary statistics per column in one call
# (count, mean, std, min, quartiles and max for numeric data)
frame_obj.describe()

Unnamed: 0,one,two
count,3.0,3.0
mean,3.3,4.966667
std,7.139328,0.650641
min,-3.1,4.3
25%,-0.55,4.65
50%,2.0,5.0
75%,6.5,5.3
max,11.0,5.6


In [50]:
# For non-numeric data, describe() reports a different set of summary statistics.
# Build a small Series of letters (the pattern a, d, b, c repeated three times) to show it.
series_obj = Series(['a', 'd', 'b', 'c'] * 3)
series_obj

0     a
1     d
2     b
3     c
4     a
5     d
6     b
7     c
8     a
9     d
10    b
11    c
dtype: object

In [52]:
# For this Series of letters, describe() reports count, number of unique values,
# the most frequent value (top) and how often it occurs (freq).
# Every letter occurs three times here, so 'top' is just one of the tied values.
series_obj.describe()

count     12
unique     4
top        d
freq       3
dtype: object

In [56]:
# Correlation: corr
# Available on both DataFrame and Series; the frame is displayed again for reference.
frame_obj

Unnamed: 0,one,two
a,2.0,4.3
b,,
c,-3.1,5.0
d,11.0,5.6


In [58]:
frame_obj['one'].corr(frame_obj['two'])  # correlation between the two columns, returned as a single number

0.5952461371449761

In [59]:
# Called on the whole DataFrame, corr returns the pairwise correlation matrix of its columns.
frame_obj.corr()

Unnamed: 0,one,two
one,1.0,0.595246
two,0.595246,1.0


In [60]:
# Covariance: cov
# Available on both DataFrame and Series; the frame is displayed again for reference.
frame_obj

Unnamed: 0,one,two
a,2.0,4.3
b,,
c,-3.1,5.0
d,11.0,5.6


In [62]:
# Covariance between the two columns, returned as a single number.
frame_obj['one'].cov(frame_obj['two'])

2.764999999999999

In [63]:
# Called on the whole DataFrame, cov returns the pairwise covariance matrix of its columns
# (the diagonal holds each column's variance).
frame_obj.cov()

Unnamed: 0,one,two
one,50.97,2.765
two,2.765,0.423333


In [67]:
# We can also compute correlations between a DataFrame and a Series, or between two DataFrames, with corrwith

# DataFrame and Series: each column of the frame is correlated with the given Series
frame_obj.corrwith(frame_obj['one'])

one    1.000000
two    0.595246
dtype: float64

In [69]:
# DataFrame and DataFrame: columns with matching names are correlated with each other
# (the frame is compared with itself here, hence 1.0 for every column)
frame_obj.corrwith(frame_obj)

one    1.0
two    1.0
dtype: float64

In [71]:
# The axis argument selects the direction of the computation. axis=0 (used here, and also
# the default) correlates each column with the Series; passing axis=1 would operate row-wise instead.
frame_obj.corrwith(frame_obj['two'], axis = 0)

one    0.595246
two    1.000000
dtype: float64