## Pandas DataFrames Followup

In [19]:
import pandas as pd
import numpy as np

In [2]:
# Column-oriented input: every key becomes a column, every list its values.
my_dict = {
    'name': list('abcdefg'),
    'age': [20, 27, 35, 55, 18, 21, 35],
    'designation': ['VP', 'CEO', 'CFO', 'VP', 'VP', 'CEO', 'MD'],
}
df = pd.DataFrame(data=my_dict)
df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [3]:
df.to_csv('csv_example')

In [4]:
df_csv = pd.read_csv('csv_example')

In [5]:
df_csv

Unnamed: 0.1,Unnamed: 0,name,age,designation
0,0,a,20,VP
1,1,b,27,CEO
2,2,c,35,CFO
3,3,d,55,VP
4,4,e,18,VP
5,5,f,21,CEO
6,6,g,35,MD


In [6]:
df.to_csv('csv_example', index=False)
df_csv = pd.read_csv('csv_example')
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


It’s possible to have more than one row as column headers by passing a list of row numbers to the `header` parameter of the `read_csv(...)` function, e.g. `header=[0, 1, 2]`; a single `header=<integer>` selects just one header row.

In [7]:
df_csv = pd.read_csv('csv_example', header=[0,1,2])
df_csv

Unnamed: 0_level_0,name,age,designation
Unnamed: 0_level_1,a,20,VP
Unnamed: 0_level_2,b,27,CEO
0,c,35,CFO
1,d,55,VP
2,e,18,VP
3,f,21,CEO
4,g,35,MD


In [8]:
df_csv = pd.read_csv('csv_example', header=5)
df_csv
# Drawback: truncates all previous data

Unnamed: 0,e,18,VP
0,f,21,CEO
1,g,35,MD


In [9]:
#Even in the case of having multiple rows as header, actual DataFrame data shall start only with rows after the last header rows.
df_csv = pd.read_csv('csv_example', header=[1,2,5])
df_csv

Unnamed: 0_level_0,a,20,VP
Unnamed: 0_level_1,b,27,CEO
Unnamed: 0_level_2,e,18,VP
0,f,21,CEO
1,g,35,MD


In [10]:
# The "names" parameter lets you specify a custom header, and the "header" parameter specifies the row number of a pre-existing header in the CSV.
df_csv = pd.read_csv('csv_example', names=['a', 'b', 'c'], header=1)
df_csv

Unnamed: 0,a,b,c
0,b,27,CEO
1,c,35,CFO
2,d,55,VP
3,e,18,VP
4,f,21,CEO
5,g,35,MD


You can also skip a pre-existing header when writing the CSV file: `df.to_csv('csv_example', index=False, header = False)`

In [11]:
df.to_csv('csv_example', index=False, sep=":")
# This will create a file where the colon (‘:’) instead of comma (‘,’) shall be used as a separator.

df_csv = pd.read_csv('csv_example', sep=":")

In [12]:
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [15]:
# By default pandas generates a row index automatically; `set_index` promotes
# any column to be the index instead.
# The result is bound to a new name so `df_csv` keeps its 'age' column and this
# cell can be re-run without raising KeyError: 'age'.
df_by_age = df_csv.set_index('age')
df_by_age

Unnamed: 0_level_0,name,designation
age,Unnamed: 1_level_1,Unnamed: 2_level_1
20,a,VP
27,b,CEO
35,c,CFO
55,d,VP
18,e,VP
21,f,CEO
35,g,MD


In [17]:
df_csv = pd.read_csv('csv_example', sep=":", index_col=[0,2])
df_csv

Unnamed: 0_level_0,Unnamed: 1_level_0,age
name,designation,Unnamed: 2_level_1
a,VP,20
b,CEO,27
c,CFO,35
d,VP,55
e,VP,18
f,CEO,21
g,MD,35


In [18]:
# Load Only 3 Rows
df_csv = pd.read_csv('csv_example', sep=":", nrows=3)
df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO


In [None]:
#Don't skip empty lines in CSV file
df_csv = pd.read_csv('csv_example', skip_blank_lines=False, sep=":")

## Pandas Basics II

#### Boolean comparisons

In [20]:
# Three float columns whose row labels only partly overlap: the DataFrame is
# built on the union of the labels and the missing cells become NaN.
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=list('abc')),
    'two': pd.Series(np.random.randn(4), index=list('abcd')),
    'three': pd.Series(np.random.randn(3), index=list('bcd')),
})

In [21]:
df2 = df.copy()

Equality operators:

* eq: Test whether a field is equal to a constant value
* ne: Test whether a field is not equal to a constant value

Range operators:

* gt: Test whether a field is greater than a constant value
* lt: Test whether a field is less than a constant value
* ge: Test whether a field is greater than or equal to a constant value
* le: Test whether a field is less than or equal to a constant value

In [24]:
df2

Unnamed: 0,one,two,three
a,-0.067564,-1.985749,
b,1.018586,0.677054,0.322755
c,-0.184969,0.007482,0.322124
d,,-1.105022,-0.664936


In [22]:
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [23]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


Note `np.nan == np.nan` returns `False`, which is why `ne` reports `True` in the cells where both frames hold `NaN`.

In [25]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [26]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [27]:
(df > 0).any().any()

True

To evaluate single-element pandas objects in a boolean context, use the method `bool()` (deprecated since pandas 2.1; on a Series prefer `.item()`):

`pd.Series([True]).bool()`

`True`

#### Objects Comparison

In [28]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [29]:
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])
# (series lengths must be the same)

0     True
1     True
2    False
dtype: bool

In [30]:
# To test that these two computations produce the same result
(df + df == df * 2).all().all()

False

In [31]:
# False since np.nan != np.nan in Pandas,
# but you can use a special Pandas method to get around this:

In [32]:
(df + df).equals(df * 2)

True

#### Descriptive statistics

In [33]:
df

Unnamed: 0,one,two,three
a,-0.067564,-1.985749,
b,1.018586,0.677054,0.322755
c,-0.184969,0.007482,0.322124
d,,-1.105022,-0.664936


In [34]:
# Aggregation for each column
df.mean(0)

one      0.255351
two     -0.601559
three   -0.006685
dtype: float64

In [35]:
# Aggregation for each row (axis 1: one mean per index label, taken across the columns)
df.mean(1)

a   -1.026656
b    0.672798
c    0.048212
d   -0.884979
dtype: float64

#### Describe

In [36]:
series = pd.Series(np.random.randn(1000))

In [37]:
series[::2] = np.nan

In [38]:
series.describe()

count    500.000000
mean      -0.000475
std        0.951125
min       -3.014166
25%       -0.645502
50%        0.015385
75%        0.631538
max        2.979796
dtype: float64

In [41]:
frame = pd.DataFrame(np.random.randn(1000, 5),
       columns=['a', 'b', 'c', 'd', 'e'])
frame.iloc[::2] = np.nan
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.036632,-0.050179,0.02996,0.059382,-0.020363
std,0.980185,1.017377,1.052205,0.979849,0.976271
min,-2.728229,-2.930925,-3.541612,-3.245106,-3.54005
25%,-0.606806,-0.749325,-0.677141,-0.580328,-0.660693
50%,0.080765,0.033522,0.022058,0.031928,-0.056959
75%,0.729936,0.605722,0.691627,0.749767,0.576354
max,2.798584,3.325519,3.336571,3.389574,2.973554


In [42]:
s1 = pd.Series(np.random.randn(5))
s1

0   -0.690717
1    1.958262
2   -0.450652
3    0.788300
4    0.981523
dtype: float64

In [43]:
s1.idxmin(), s1.idxmax()

(0, 1)

In [44]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df1

Unnamed: 0,A,B,C
0,-0.489995,-0.007454,-0.606371
1,-0.145454,-0.634294,0.420386
2,-0.258626,0.167511,-0.569086
3,1.909118,-0.887317,0.687882
4,1.227489,-1.845285,0.308496


In [45]:
df1.idxmin(axis=0)

A    0
B    4
C    0
dtype: int64

In [46]:
df1.idxmax(axis=1)

0    B
1    C
2    B
3    A
4    A
dtype: object

#### Iterations

In [47]:
df = pd.DataFrame({'col1': np.random.randn(3),
                     'col2': np.random.randn(3)}, index=['a', 'b', 'c'])

In [48]:
for col in df:
        print(col)

col1
col2


items:

In [49]:
# `items()` yields one (column label, column Series) pair per column.
df = pd.DataFrame({'a': [1, 2, 3], 'b': list('abc')})
for column_label, column in df.items():
    print(column_label, column, sep='\n')

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


iterrows:

In [50]:
for row_index, row in df.iterrows():
        print(row_index, row, sep='\n')

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


itertuples:

In [51]:
for row in df.itertuples():
        print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')


## Pandas Viewing

#### Object creation

In [52]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [53]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [58]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.833272,-0.842254,0.39987,-1.929491
2013-01-02,-0.49004,-0.097082,2.128146,-1.137714
2013-01-03,0.956021,0.823399,-1.047131,-0.085482
2013-01-04,-1.380245,0.194913,1.578114,-0.736668
2013-01-05,-1.715609,1.885543,0.472406,-0.101344
2013-01-06,1.766563,-0.492605,-1.029834,1.060679


In [54]:
df2 = pd.DataFrame({'A': 1.,
                       'B': pd.Timestamp('20130102'),
                       'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                       'D': np.array([3] * 4, dtype='int32'),
                       'E': pd.Categorical(["test", "train", "test", "train"]),
                       'F': 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


#### Viewing data

In [59]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.833272,-0.842254,0.39987,-1.929491
2013-01-02,-0.49004,-0.097082,2.128146,-1.137714
2013-01-03,0.956021,0.823399,-1.047131,-0.085482
2013-01-04,-1.380245,0.194913,1.578114,-0.736668
2013-01-05,-1.715609,1.885543,0.472406,-0.101344


In [61]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.380245,0.194913,1.578114,-0.736668
2013-01-05,-1.715609,1.885543,0.472406,-0.101344
2013-01-06,1.766563,-0.492605,-1.029834,1.060679


In [62]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [63]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [64]:
df.to_numpy()

array([[-0.83327189, -0.84225431,  0.39986971, -1.92949057],
       [-0.49003957, -0.09708229,  2.12814638, -1.1377136 ],
       [ 0.95602097,  0.82339938, -1.04713101, -0.0854817 ],
       [-1.38024513,  0.19491295,  1.57811389, -0.7366679 ],
       [-1.71560895,  1.88554334,  0.47240647, -0.10134439],
       [ 1.76656304, -0.49260487, -1.02983392,  1.06067852]])

In [65]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.282764,0.245319,0.416929,-0.488337
std,1.366591,0.987081,1.305111,1.026819
min,-1.715609,-0.842254,-1.047131,-1.929491
25%,-1.243502,-0.393724,-0.672408,-1.037452
50%,-0.661656,0.048915,0.436138,-0.419006
75%,0.594506,0.666278,1.301687,-0.089447
max,1.766563,1.885543,2.128146,1.060679


In [66]:
# Transpose
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.833272,-0.49004,0.956021,-1.380245,-1.715609,1.766563
B,-0.842254,-0.097082,0.823399,0.194913,1.885543,-0.492605
C,0.39987,2.128146,-1.047131,1.578114,0.472406,-1.029834
D,-1.929491,-1.137714,-0.085482,-0.736668,-0.101344,1.060679


In [67]:
# Sort by column name
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-1.929491,0.39987,-0.842254,-0.833272
2013-01-02,-1.137714,2.128146,-0.097082,-0.49004
2013-01-03,-0.085482,-1.047131,0.823399,0.956021
2013-01-04,-0.736668,1.578114,0.194913,-1.380245
2013-01-05,-0.101344,0.472406,1.885543,-1.715609
2013-01-06,1.060679,-1.029834,-0.492605,1.766563


In [68]:
# Sort by specified column
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-01,-0.833272,-0.842254,0.39987,-1.929491
2013-01-06,1.766563,-0.492605,-1.029834,1.060679
2013-01-02,-0.49004,-0.097082,2.128146,-1.137714
2013-01-04,-1.380245,0.194913,1.578114,-0.736668
2013-01-03,0.956021,0.823399,-1.047131,-0.085482
2013-01-05,-1.715609,1.885543,0.472406,-0.101344


## Pandas Filtering

In [71]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.833272,-0.842254,0.39987,-1.929491
2013-01-02,-0.49004,-0.097082,2.128146,-1.137714
2013-01-03,0.956021,0.823399,-1.047131,-0.085482
2013-01-04,-1.380245,0.194913,1.578114,-0.736668
2013-01-05,-1.715609,1.885543,0.472406,-0.101344
2013-01-06,1.766563,-0.492605,-1.029834,1.060679


#### Selection

In [69]:
df['A']

2013-01-01   -0.833272
2013-01-02   -0.490040
2013-01-03    0.956021
2013-01-04   -1.380245
2013-01-05   -1.715609
2013-01-06    1.766563
Freq: D, Name: A, dtype: float64

In [70]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.833272,-0.842254,0.39987,-1.929491
2013-01-02,-0.49004,-0.097082,2.128146,-1.137714
2013-01-03,0.956021,0.823399,-1.047131,-0.085482


In [72]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.49004,-0.097082,2.128146,-1.137714
2013-01-03,0.956021,0.823399,-1.047131,-0.085482
2013-01-04,-1.380245,0.194913,1.578114,-0.736668


In [73]:
df.loc["2013-01-01"]

A   -0.833272
B   -0.842254
C    0.399870
D   -1.929491
Name: 2013-01-01 00:00:00, dtype: float64

In [74]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.833272,-0.842254
2013-01-02,-0.49004,-0.097082
2013-01-03,0.956021,0.823399
2013-01-04,-1.380245,0.194913
2013-01-05,-1.715609,1.885543
2013-01-06,1.766563,-0.492605


In [75]:
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.49004,-0.097082
2013-01-03,0.956021,0.823399
2013-01-04,-1.380245,0.194913


In [76]:
df.loc['20130102', ['A', 'B']]

A   -0.490040
B   -0.097082
Name: 2013-01-02 00:00:00, dtype: float64

In [77]:
df.loc[dates[0], 'A']

-0.833271886569826

In [78]:
df.iloc[3]

A   -1.380245
B    0.194913
C    1.578114
D   -0.736668
Name: 2013-01-04 00:00:00, dtype: float64

#### Selection by dtype

In [79]:
df = pd.DataFrame({'string': list('abc'),
                       'int64': list(range(1, 4)),
                       'uint8': np.arange(3, 6).astype('u1'),
                       'float64': np.arange(4.0, 7.0),
                       'bool1': [True, False, True],
                       'bool2': [False, True, False],
                       'dates': pd.date_range('now', periods=3),
                       'category': pd.Series(list("ABC")).astype('category')})

In [80]:
df.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


#### Boolean indexing

In [84]:
df['float64']

0    4.0
1    5.0
2    6.0
Name: float64, dtype: float64

In [81]:
df[df['float64'] >= 5]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
1,b,2,4,5.0,False,True,2021-07-28 15:49:39.649765,B
2,c,3,5,6.0,True,False,2021-07-29 15:49:39.649765,C


In [85]:
df2 = df.copy()
df2['E'] = ['one', 'two', 'three']

In [86]:
# Also keep in mind the "isin()" function
df2[df2['E'].isin(['one','two'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2021-07-27 15:49:39.649765,A,one
1,b,2,4,5.0,False,True,2021-07-28 15:49:39.649765,B,two


#### Setting values

By position:

In [88]:
df.iat[0, 1] = -1
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,-1,3,4.0,True,False,2021-07-27 15:49:39.649765,A
1,b,2,4,5.0,False,True,2021-07-28 15:49:39.649765,B
2,c,3,5,6.0,True,False,2021-07-29 15:49:39.649765,C


In [89]:
df.iloc[0, 1] = 2
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,4.0,True,False,2021-07-27 15:49:39.649765,A
1,b,2,4,5.0,False,True,2021-07-28 15:49:39.649765,B
2,c,3,5,6.0,True,False,2021-07-29 15:49:39.649765,C


By label:

In [90]:
df.at[0, 'float64'] = -10
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,-10.0,True,False,2021-07-27 15:49:39.649765,A
1,b,2,4,5.0,False,True,2021-07-28 15:49:39.649765,B
2,c,3,5,6.0,True,False,2021-07-29 15:49:39.649765,C


In [91]:
df.loc[0, 'float64'] = -20
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,-20.0,True,False,2021-07-27 15:49:39.649765,A
1,b,2,4,5.0,False,True,2021-07-28 15:49:39.649765,B
2,c,3,5,6.0,True,False,2021-07-29 15:49:39.649765,C


By assigning with a NumPy array:

In [92]:
df.loc[:, 'uint8'] = np.array([50] * len(df))
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,50,-20.0,True,False,2021-07-27 15:49:39.649765,A
1,b,2,50,5.0,False,True,2021-07-28 15:49:39.649765,B
2,c,3,50,6.0,True,False,2021-07-29 15:49:39.649765,C


## Pandas Merge and GroupBy

In [94]:
df = pd.DataFrame(np.random.randn(10, 4))

In [95]:
df

Unnamed: 0,0,1,2,3
0,-0.861258,-0.141494,-1.945556,0.883121
1,2.200086,-0.324974,-0.54571,1.097388
2,-1.153871,-0.053885,-0.102897,1.881248
3,-0.976569,-0.503201,-0.719655,0.335012
4,-0.046094,-0.898358,-0.552782,0.265985
5,2.064766,0.294196,0.731989,0.064205
6,-2.320093,-0.995963,-0.878379,-0.363436
7,-0.102406,-1.564402,-0.80322,-1.075716
8,-1.07612,-0.700521,-0.156769,-0.930605
9,-0.610936,0.287967,0.312841,1.045443


In [None]:
pieces = [df[:3], df[3:7], df[7:]]

In [97]:
pieces

[          0         1         2         3
 0 -0.861258 -0.141494 -1.945556  0.883121
 1  2.200086 -0.324974 -0.545710  1.097388
 2 -1.153871 -0.053885 -0.102897  1.881248,
           0         1         2         3
 3 -0.976569 -0.503201 -0.719655  0.335012
 4 -0.046094 -0.898358 -0.552782  0.265985
 5  2.064766  0.294196  0.731989  0.064205
 6 -2.320093 -0.995963 -0.878379 -0.363436,
           0         1         2         3
 7 -0.102406 -1.564402 -0.803220 -1.075716
 8 -1.076120 -0.700521 -0.156769 -0.930605
 9 -0.610936  0.287967  0.312841  1.045443]

In [96]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.861258,-0.141494,-1.945556,0.883121
1,2.200086,-0.324974,-0.54571,1.097388
2,-1.153871,-0.053885,-0.102897,1.881248
3,-0.976569,-0.503201,-0.719655,0.335012
4,-0.046094,-0.898358,-0.552782,0.265985
5,2.064766,0.294196,0.731989,0.064205
6,-2.320093,-0.995963,-0.878379,-0.363436
7,-0.102406,-1.564402,-0.80322,-1.075716
8,-1.07612,-0.700521,-0.156769,-0.930605
9,-0.610936,0.287967,0.312841,1.045443


In [98]:
# Join two dataframes

In [99]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

In [100]:
pd.merge(left, right, on='key')  # default: inner join

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [101]:
pd.merge(left, right, on='key', how='outer')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## Grouping

In [102]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                             'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                       'C': np.random.randn(8),
                       'D': np.random.randn(8)})

In [104]:
df

Unnamed: 0,A,B,C,D
0,foo,one,1.431182,-1.144329
1,bar,one,0.159358,-0.275041
2,foo,two,1.059482,0.586725
3,bar,three,0.590564,1.137198
4,foo,two,-0.073096,-0.40193
5,bar,two,0.341454,-1.871526
6,foo,one,1.075132,-0.879983
7,foo,three,-0.290931,-0.100376


In [103]:
# Sum the numeric columns within each group of 'A'.
# `numeric_only=True` keeps the string column 'B' out of the result; from
# pandas 2.0 on, the default would concatenate its strings per group instead.
df.groupby('A').sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.091376,-1.009369
foo,3.201769,-1.939893


We can also group by multiple columns. This operation will create a new DataFrame with Multilevel Index.

In [105]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.159358,-0.275041
bar,three,0.590564,1.137198
bar,two,0.341454,-1.871526
foo,one,2.506314,-2.024312
foo,three,-0.290931,-0.100376
foo,two,0.986386,0.184795


A single aggregation method such as `.sum()` applies the same function to every column. To apply different aggregation functions to different columns in one `groupby` statement, use the `.agg()` method.

In [106]:
# Per-column aggregation: total of 'C' and maximum of 'D' within each group.
# String aliases select pandas' own groupby reductions; passing np.sum / np.max
# here is deprecated and emits a FutureWarning on pandas >= 2.1.
df.groupby('A').agg({'C': 'sum', 'D': 'max'})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,1.091376,1.137198
foo,3.201769,0.586725


## Pandas Reshaping

#### Stack

In [108]:
# Pair each first-level label with the second-level label at the same position.
first_level = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
second_level = ['one', 'two'] * 4
tuples = list(zip(first_level, second_level))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [109]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [111]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.828481,0.339499
bar,two,0.598141,0.946744
baz,one,-0.398822,1.337687
baz,two,-0.134828,0.106328
foo,one,-1.268149,-0.615487
foo,two,0.78332,-1.271159
qux,one,1.774088,1.08429
qux,two,-0.126247,-0.513608


In [113]:
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.828481,0.339499
bar,two,0.598141,0.946744
baz,one,-0.398822,1.337687
baz,two,-0.134828,0.106328


In [118]:
stacked = df2.stack()

In [119]:
stacked

first  second   
bar    one     A   -0.828481
               B    0.339499
       two     A    0.598141
               B    0.946744
baz    one     A   -0.398822
               B    1.337687
       two     A   -0.134828
               B    0.106328
dtype: float64

In [120]:
type(stacked)

pandas.core.series.Series

In [121]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.828481,0.339499
bar,two,0.598141,0.946744
baz,one,-0.398822,1.337687
baz,two,-0.134828,0.106328


In [122]:
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.828481,0.598141
bar,B,0.339499,0.946744
baz,A,-0.398822,-0.134828
baz,B,1.337687,0.106328


In [123]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.828481,-0.398822
one,B,0.339499,1.337687
two,A,0.598141,-0.134828
two,B,0.946744,0.106328


#### Pivot Tables

In [124]:
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                       'B': ['A', 'B', 'C'] * 4,
                       'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                       'D': np.random.randn(12),
                       'E': np.random.randn(12)})

In [126]:
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.27196,-0.942394
1,one,B,foo,-0.893638,-0.369745
2,two,C,foo,1.098605,0.638785
3,three,A,bar,-0.402905,-0.978484
4,one,B,bar,-0.428954,0.042836
5,one,C,bar,0.27024,0.09233
6,two,A,foo,-0.843803,0.070458
7,three,B,foo,-0.143307,-0.947687
8,one,C,foo,-1.113711,-2.752105
9,one,A,bar,-0.351405,0.20982


In [125]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.351405,-0.27196
one,B,-0.428954,-0.893638
one,C,0.27024,-1.113711
three,A,-0.402905,
three,B,,-0.143307
three,C,0.197952,
two,A,,-0.843803
two,B,-1.51948,
two,C,,1.098605


## Pandas Apply Functions

#### Tablewise function application

In [133]:
def extract_city_name(df):
    """
    Add a ``city_name`` column holding the text of ``city_and_code`` before the first comma.

    Example: 'Chicago, IL' -> 'Chicago'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``city_and_code``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the additional ``city_name`` column. ``df`` itself is
        not modified, so the function has no hidden side effects when used in
        a ``.pipe`` chain or when the cell is re-run.
    """
    city_name = df['city_and_code'].str.split(",").str.get(0)
    # assign() returns a copy with the new column instead of mutating the input.
    return df.assign(city_name=city_name)

def add_country_name(df, country_name=None):
    """
    Add a ``city_and_country`` column: ``city_name`` with ``country_name`` appended.

    The two strings are concatenated without a separator, e.g.
    'Chicago' + 'US' -> 'ChicagoUS'. The ``city_name`` column is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``city_name``.
    country_name : str
        Text appended to every city name. The ``None`` default is only a
        placeholder; a value must be supplied.

    Returns
    -------
    pandas.DataFrame
        A new frame with the additional ``city_and_country`` column; ``df``
        itself is not modified.

    Raises
    ------
    TypeError
        If ``country_name`` is None.
    """
    if country_name is None:
        # Fail with a clear message instead of an opaque string-concatenation error.
        raise TypeError("country_name must be a string, not None")
    col = 'city_name'
    # assign() returns a copy with the new column instead of mutating the input.
    return df.assign(city_and_country=df[col] + country_name)

df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']})

add_country_name(extract_city_name(df_p), country_name='US')

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [134]:
(df_p.pipe(extract_city_name)
         .pipe(add_country_name, country_name="US"))

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


#### Row or column-wise function application

In [147]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

In [148]:
df

Unnamed: 0,one,two,three
a,-0.570868,0.437771,
b,2.928962,1.399179,-0.061453
c,1.250161,1.0151,-1.686467
d,,1.270631,0.302602


In [149]:
# apply pre-built numpy function
df.apply(np.mean)

one      1.202751
two      1.030670
three   -0.481772
dtype: float64

In [150]:
# apply pre-built numpy function
df.apply(np.mean, axis=1)

a   -0.066549
b    1.422229
c    0.192931
d    0.786617
dtype: float64

In [151]:
# apply own lambda function
df.apply(lambda x: x.max() - x.min())

one      3.499830
two      0.961409
three    1.989069
dtype: float64

In [152]:
# apply pre-built numpy function
df.apply(np.cumsum)

Unnamed: 0,one,two,three
a,-0.570868,0.437771,
b,2.358094,1.83695,-0.061453
c,3.608254,2.85205,-1.74792
d,,4.122681,-1.445317


In [153]:
# apply pre-built numpy function
df.apply(np.exp)

Unnamed: 0,one,two,three
a,0.565035,1.54925,
b,18.708195,4.051874,0.940397
c,3.490904,2.759639,0.185173
d,,3.563099,1.353376


In [154]:
def own_function(x):
    """Return ``x`` multiplied by itself (its square)."""
    squared = x * x
    return squared

In [155]:
df.apply(own_function)

Unnamed: 0,one,two,three
a,0.32589,0.191643,
b,8.578817,1.957703,0.003776
c,1.562902,1.030428,2.844169
d,,1.614502,0.091568


In [156]:
def subtract_and_divide(x, sub, divide=1):
    """Subtract ``sub`` from ``x``, then divide the difference by ``divide`` (default 1)."""
    difference = x - sub
    return difference / divide

In [157]:
df.apply(subtract_and_divide, args=(5,3))

Unnamed: 0,one,two,three
a,-1.856956,-1.520743,
b,-0.690346,-1.200274,-1.687151
c,-1.249946,-1.3283,-2.228822
d,,-1.243123,-1.565799


Note: `args` has to be a tuple. Therefore, even if you pass only 1 argument, you have to pass it as a one-element tuple with a trailing comma (`(5)` without the comma is just the integer `5`):

`args=(5,)`

In [158]:
def subtract(x, sub):
    """Return ``x`` with ``sub`` subtracted from it."""
    difference = x - sub
    return difference

df.apply(subtract, args=(5,))

Unnamed: 0,one,two,three
a,-5.570868,-4.562229,
b,-2.071038,-3.600821,-5.061453
c,-3.749839,-3.9849,-6.686467
d,,-3.729369,-4.697398
