# Useful APIs of pandas

@author glin

In [113]:
import pandas as pd
import numpy as np

In [151]:
%config IPCompleter.greedy=True

In [152]:
# mapping between sql and pandas
# https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html
# https://medium.com/jbennetcodes/how-to-rewrite-your-sql-queries-in-pandas-and-more-149d341fc53e

In [212]:
# create dataframe
df = pd.DataFrame([['1','2','6'], ['3', '4','6'], [np.NaN, '5',np.NaN], ['1', 9,'3']], columns = ['a','b','c'])
df

Unnamed: 0,a,b,c
0,1.0,2,6.0
1,3.0,4,6.0
2,,5,
3,1.0,9,3.0


In [198]:
# group by a and count number of rows of all columns except 'a' 
# even if they are duplicate
# only for non-NaN 
df.groupby('a').count()

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
3,1,1


In [200]:
# group by a and count number of rows for 'a' 
# even if they are duplicate
# NaN not counted (same as sql)
df.groupby('a').size()

a
1    2
3    1
dtype: int64

In [132]:
# group by a and count number of rows for 'a'
# NaN not counted (same as sql)
df.a.value_counts()

1    2
3    1
Name: a, dtype: int64

In [165]:
# count distinct a, NaN not counted (same as sql)
df.a.value_counts().count()

2

In [193]:
# count distinct a, NaN not counted (same as sql)
df.groupby('a').size().count()

2

In [201]:
# count distinct a, NaN counted
len(df.a.unique())

3

###############

In [142]:
# convert series to dataframe
s = pd.Series(['1', '2'])
s

0    1
1    2
dtype: object

In [147]:
# count(), sum() are both regarding series
s.count()

2

In [192]:
df1 = pd.DataFrame(s, columns = ['a'])
df1

Unnamed: 0,a
0,1
1,2


In [189]:
df1 = s.to_frame('a')
df1

Unnamed: 0,a
0,1
1,2


In [None]:
# convert string to float
df1.a = df1.a.astype(float)

In [None]:
# check type
type(df1.a)

In [None]:
df1.a.dtype

In [105]:
# convert string with $ to float
dollarizer = lambda x: float(x[1:-1])
df = pd.DataFrame([['@1.2','2'], ['$3.1', '4']], columns = ['a','b'])
df = df.a.apply(dollarizer)
df

0    1.0
1    3.0
Name: a, dtype: float64

In [213]:
# groupby a column and then sum, this will sum the rows of all columns
# if the values are string, the sum will concat string
df.groupby('a').sum()

Unnamed: 0_level_0,c
a,Unnamed: 1_level_1
1,63
3,6


In [214]:
# sum() does not like different data types, it does not return that column if different types exist in that column

In [205]:
# sort
df.sort_values(['b'], ascending=False)

Unnamed: 0,a,b,c
2,,5.0,
1,3.0,4.0,6.0
0,1.0,2.0,6.0
3,1.0,,3.0


In [184]:
# groupby sorts in asc order automatically
df.groupby(['c','a']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,b
c,a,Unnamed: 2_level_1
3,1,0
6,1,1
6,3,1


In [187]:
# the above equals to this
df.groupby(['c','a']).count().sort_values(['c', 'a'], ascending=[True, True])

Unnamed: 0_level_0,Unnamed: 1_level_0,b
c,a,Unnamed: 2_level_1
3,1,0
6,1,1
6,3,1


In [182]:
df.groupby(['c','a']).size()

c  a
3  1    1
6  1    1
   3    1
dtype: int64

In [183]:
# after groupby on multiple columns, reset_index() adds the index that are empty
df.groupby(['c','a']).size().reset_index()

Unnamed: 0,c,a,0
0,3,1,1
1,6,1,1
2,6,3,1


In [178]:
df.groupby(['c','a']).size().to_frame('size').reset_index()

Unnamed: 0,c,a,size
0,3,1,1
1,6,1,1
2,6,3,1


In [215]:
df.groupby(['c','a']).size().to_frame('size').reset_index().sort_values(['c', 'size'], ascending=[True, True])

Unnamed: 0,c,a,size
0,3,1,1
1,6,1,1
2,6,3,1


 
 
##############
 
 

In [62]:
# select where condition
# filter by value
df[df['b'] == '2']

Unnamed: 0,a,b
0,1,2


In [65]:
df[[True, False, False, False]]

Unnamed: 0,a,b
0,1,2


In [None]:
# use in clause
df[df['b'].isin(['2','5'])]

In [69]:
# use not in clause
df[~df['b'].isin(['2','5'])]

Unnamed: 0,a,b
1,3,4
3,1,4


In [93]:
df['b'].isin(['2','5'])

0     True
1    False
2     True
3    False
Name: b, dtype: bool

In [94]:
df['a']=='1'

0     True
1    False
2     True
3     True
Name: a, dtype: bool

In [100]:
df['b'].isin(['2','5']) & df['a']=='1'

0    False
1    False
2    False
3    False
dtype: bool

In [99]:
# Have to use () when using &
(df['b'].isin(['2','5'])) & (df['a']=='1')

0     True
1    False
2     True
3    False
dtype: bool

In [101]:
# sort
df.b.sort_values(ascending=False)

2    5
3    4
1    4
0    2
Name: b, dtype: object

In [103]:
# sum specific columns
df.groupby('a')['b'].sum(axis = 1, skipna = True)

a
1    254
3      4
Name: b, dtype: object

In [104]:
df.groupby('a')['b'].sum()

a
1    254
3      4
Name: b, dtype: object

In [134]:
# is null
df[df['b'].isna()]

Unnamed: 0,a,b,c
3,1,,3


In [135]:
# not null
df[df['b'].notna()]

Unnamed: 0,a,b,c
0,,2,6.0
1,3.0,4,4.0
2,1.0,5,


##############

In [238]:
# create dataframe
df = pd.DataFrame([['1','2','6'], ['3', '4','6'], [np.NaN, '5',np.NaN], ['1', '9','3']], columns = ['a','b','c'])
df

Unnamed: 0,a,b,c
0,1.0,2,6.0
1,3.0,4,6.0
2,,5,
3,1.0,9,3.0


In [178]:
df.groupby(['c','a']).size().to_frame('size').reset_index()

Unnamed: 0,c,a,size
0,3,1,1
1,6,1,1
2,6,3,1


In [178]:
df.groupby(['c','a']).size().to_frame('size').reset_index()

Unnamed: 0,c,a,size
0,3,1,1
1,6,1,1
2,6,3,1


In [235]:
# group by a having count(*) > 1
df.groupby('a').filter(lambda g: len(g) > 1).groupby('a')#.size()#.sort_values(ascending=False)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11ea94668>

In [242]:
# select count(*) from df group by a having count(*) > 0 order by count(*)
df.groupby('a').filter(lambda g: len(g) > 0).groupby('a').size().sort_values(ascending=False)

a
1    2
3    1
dtype: int64

###########

In [250]:
# create dataframe
df = pd.DataFrame([['1','2','6'], ['3', '4','6'], [np.NaN, '5',np.NaN], ['1', '9','3']], columns = ['a','b','c'])
df = df.astype(float)
df

Unnamed: 0,a,b,c
0,1.0,2.0,6.0
1,3.0,4.0,6.0
2,,5.0,
3,1.0,9.0,3.0


In [248]:
# agg
df.agg({'b': ['min', 'max', 'mean', 'median']})

Unnamed: 0,b
min,2.0
max,9.0
mean,5.0
median,4.5


In [251]:
df.agg({'b': ['min', 'max', 'mean', 'median']}).T

Unnamed: 0,min,max,mean,median
b,2.0,9.0,5.0,4.5
