# Pandas: Reading and Manipulating CSV Files

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.DataFrame({
  'name' : ["a", "b", "c", "d", "e","f", "g"],
  'age' : [20,27, 35, 55, 18, 21, 35],
  'designation': ["VP", "CEO", "CFO", "VP", "VP", "CEO", "MD"]
})

df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [4]:
df.to_csv('./_data/example.csv')

In [5]:
df_read = pd.read_csv('./_data/example.csv', index_col=0, header=0)
df_read

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [6]:
df_read.set_index('name')

Unnamed: 0_level_0,age,designation
name,Unnamed: 1_level_1,Unnamed: 2_level_1
a,20,VP
b,27,CEO
c,35,CFO
d,55,VP
e,18,VP
f,21,CEO
g,35,MD


# Pandas Basics 2

## Boolean Comparisons

In [7]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

df2 = df.copy()

In [8]:
df

Unnamed: 0,one,two,three
a,0.024984,-0.91019,
b,-0.483034,-0.245843,0.567855
c,-0.818315,-0.251364,-0.455023
d,,1.616068,-2.359375


In [9]:
df.gt(df2) # greater than

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


Note that `np.nan == np.nan` returns `False`.

In [10]:
df2.ne(df) # not equal

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [11]:
df2.eq(df) # equal

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [12]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [13]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [14]:
(df > 0).any().all()

True

In [15]:
(df > 0).any().any()

True

## Objects Comparisons

In [16]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [17]:
# element-wise comparison between arrays
# arrays must be of equal length to compare, otherwise it will result in an error
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [18]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [19]:
(df + df == df * 2).all().all()

False

In [20]:
# to evaluate nans as equals, use the equals method
(df + df).equals(df * 2)

True

## Descriptive Statistics

In [21]:
# aggregate each column
df.mean(0)

one     -0.425455
two      0.052168
three   -0.748848
dtype: float64

In [22]:
# aggregate each index
df.mean(1)

a   -0.442603
b   -0.053674
c   -0.508234
d   -0.371653
dtype: float64

By applying vectorized operations, various statistical procedures can be described.

In [23]:
ts_stand = (df - df.mean()) / df.std()
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

For convenience, the `describe()` method can be called to create a table of the most used statistical descriptors.

In [24]:
df.describe()

Unnamed: 0,one,two,three
count,3.0,4.0,3.0
mean,-0.425455,0.052168,-0.748848
std,0.424588,1.088249,1.48557
min,-0.818315,-0.91019,-2.359375
25%,-0.650674,-0.416071,-1.407199
50%,-0.483034,-0.248604,-0.455023
75%,-0.229025,0.219635,0.056416
max,0.024984,1.616068,0.567855


`describe()` can also be used to summarize non-numerical data.

In [25]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

## Indices of Minimum and Maximum Values

In [26]:
s1 = pd.Series(np.random.randn(5))
s1

0    0.154934
1   -1.518838
2    1.537280
3   -0.689408
4    0.602638
dtype: float64

In [27]:
s1.idxmin(), s1.idxmax()

(1, 2)

In [28]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df1

Unnamed: 0,A,B,C
0,0.302816,-0.474942,0.204456
1,-1.777821,-2.212828,-0.833255
2,-1.014529,0.265533,-0.148668
3,-0.954154,-0.947179,0.069555
4,0.880384,0.475365,-1.576511


In [29]:
df1.idxmin(axis=0)

A    1
B    1
C    4
dtype: int64

In [30]:
df1.idxmax(axis=1)

0    A
1    C
2    B
3    C
4    A
dtype: object

## Iterations
The behaviour of basic iterations over `pandas` objects depends on the type. WHen iterating over a `Series`, it is regarded as array-like and basic iterations produces the values. `DataFrames` follow the dict-like convention of iterating over the `keys` of the objects.

In short:
- `Series` produces values
- `DataFrame` produces column labels
To iterate over the rows of a `DataFrame`:
- `items()` to iterate over the (`key`, `value`) pairs
- `iterrows()` to iterate over the rows of a `DataFrame` as (`Index`, `Series`) pairs. This converts the rows to `Series` objects, which can change the `dtypes` and has some performance implications
- `itertuples()` to iterate over the rows of a `DataFrame` as `namedtuples` of the values. This is faster than `iterrows()` and is preferable to use to iterate over the values of a `DataFrame`

Iterating through `Pandas` objects is generally slow. In many cases, iterating manually over the rows is not needed and can be avoided.

### `items()`
Consistent with the dict-like interface, `items()` iterates through `key`/`value` pairs.
- `Series`produces (`index`, `scalar`-value) pairs
- `DataFrame` produces (`column`, `Series`) pairs

In [31]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


In [32]:
for label, ser in df.items():
  print(label)
  print(ser)

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


### `iterrows()`
`iterrows()` iterates through rows of a `DataFrame` as `Series` objects. It returns an iterator yielding each `index` value along with a `Series` containing the data in each row.

In [33]:
for row_index, row in df.iterrows():
  print(row_index, row, sep='\n')

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


### `itertuple()`
`itertuples()` will return an iterator yielding a `namedtuple` for each row in the `DataFrame`. The first element of the `tuple` will be the row's corresponding `index` value, while the remaining values are the row values.

In [34]:
for row in df.itertuples():
  print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')


# Pandas Viewing

In [35]:
import numpy as np
import pandas as pd

## Object Creation

In [36]:
s = pd.Series([1, 3, 5, np.nan, 6 ,8 ])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [37]:
dates = pd.date_range('20130101', periods=6)
dates


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [38]:
df2 = pd.DataFrame({
  'A': 1.0,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'})

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


## Viewing Data

In [39]:
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [40]:
df2.tail(3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [41]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [42]:
df.columns

Index(['a', 'b'], dtype='object')

In [43]:
df.to_numpy()

array([[1, 'a'],
       [2, 'b'],
       [3, 'c']], dtype=object)

`NumPy` arrays have one `dtype` for the entire array, while `DataFrames` allow one `dtype` per column. When `to_numpy()` is called on a `DataFrame`, `Pandas` will find the `NumPy` `dtype` that can hold all of the `dtypes` in the `DataFrame`. Given a dataset of different `dtypes`, this conversion results in `Numpy` interpreting the entire set as an `object`, which can be expensive (time- and memory-consuming) to navigate.

In [44]:
df.describe()

Unnamed: 0,a
count,3.0
mean,2.0
std,1.0
min,1.0
25%,1.5
50%,2.0
75%,2.5
max,3.0


In [45]:
df.T

Unnamed: 0,0,1,2
a,1,2,3
b,a,b,c


In [46]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,b,a
0,a,1
1,b,2
2,c,3


In [47]:
df.sort_values(by='b')

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


# Pandas Accessing

In [48]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df2 = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'})

## Getting

In [49]:
df['A']

2013-01-01    0.858009
2013-01-02   -0.099477
2013-01-03    0.632284
2013-01-04    0.365969
2013-01-05   -0.557254
2013-01-06    1.130587
Freq: D, Name: A, dtype: float64

In [50]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.858009,-0.011586,-0.834347,2.593003
2013-01-02,-0.099477,-1.653918,1.052052,0.304499
2013-01-03,0.632284,-0.006523,-0.056099,0.986901


In [51]:
df['20130102': '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.099477,-1.653918,1.052052,0.304499
2013-01-03,0.632284,-0.006523,-0.056099,0.986901
2013-01-04,0.365969,1.448735,1.281085,1.013499


In [52]:
df.loc['2013-01-01']

A    0.858009
B   -0.011586
C   -0.834347
D    2.593003
Name: 2013-01-01 00:00:00, dtype: float64

In [53]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.858009,-0.011586
2013-01-02,-0.099477,-1.653918
2013-01-03,0.632284,-0.006523
2013-01-04,0.365969,1.448735
2013-01-05,-0.557254,-1.489697
2013-01-06,1.130587,-0.406779


In [54]:
df.loc['20130102': '20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.099477,-1.653918
2013-01-03,0.632284,-0.006523
2013-01-04,0.365969,1.448735


The data type of the returned object is automatically changed based on the dimension of the object.

In [55]:
df.loc['20130102', ['A', 'B']] # returns a series

A   -0.099477
B   -1.653918
Name: 2013-01-02 00:00:00, dtype: float64

In [56]:
df.loc[dates[0], 'A'] # returns a scalar value

0.8580094679124004

## Selection by `dtype`

In [57]:
df = pd.DataFrame({
  'string': list('abc'),
  'int64': list(range(1, 4)),
  'uint8': np.arange(3, 6).astype('u1'),
  'float64': np.arange(4.0, 7.0),
  'bool1': [True, False, True],
  'bool2': [False, True, False],
  'dates': pd.date_range('now', periods=3),
  'category': pd.Series(list("ABC")).astype('category')})
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2023-09-07 21:48:14.805293,A
1,b,2,4,5.0,False,True,2023-09-08 21:48:14.805293,B
2,c,3,5,6.0,True,False,2023-09-09 21:48:14.805293,C


In [58]:
df.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


## Boolean indexing

In [59]:
df2[df2['A'] >= 0]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [60]:
df2 = df.copy()
df2['E'] = ['one', 'two', 'three']

In [61]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
1,b,2,4,5.0,False,True,2023-09-08 21:48:14.805293,B,two


In [62]:
# setting values by position
df.iat[0, 1] = -1
df.iloc[0, 1] = 2

# setting values by label
df.at['2013-01-01', 'A'] = -10
df.loc['2013-01-02', 'B'] = -20

# setting by assigning with a `NumPy` array
df.loc[:, 'C'] = np.array([50] * len(df))

In [63]:
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,A,B,C
0,a,2.0,3.0,4.0,True,False,2023-09-07 21:48:14.805293,A,,,50
1,b,2.0,4.0,5.0,False,True,2023-09-08 21:48:14.805293,B,,,50
2,c,3.0,5.0,6.0,True,False,2023-09-09 21:48:14.805293,C,,,50
2013-01-01,,,,,,,NaT,,-10.0,,50
2013-01-02,,,,,,,NaT,,,-20.0,50


# Pandas `merge` and `groupby`

## `merge`

`pandas` provides various facilities for easily combining `Series` and `DataFrame` objects with various kinds of set logic for the indexes and relational algebra functionality in the case of `join`/`merge`-type operations.

In [64]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.831596,-1.807662,0.834911,-0.105749
1,-1.276411,1.527573,-0.173399,-2.045156
2,-0.058334,-0.967119,0.385717,0.324793
3,-0.38915,-1.150992,-0.005913,-0.217964
4,-1.357391,0.690398,-0.157963,-0.130747
5,0.310983,1.871563,-0.015564,-0.767736
6,-1.347052,-0.315735,-0.162345,1.087668
7,-0.540738,-2.018482,0.396066,-0.641917
8,-2.046139,-0.391993,-2.103878,0.051924
9,0.456483,-1.559211,-0.738133,0.783015


In [65]:
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.831596,-1.807662,0.834911,-0.105749
1,-1.276411,1.527573,-0.173399,-2.045156
2,-0.058334,-0.967119,0.385717,0.324793
3,-0.38915,-1.150992,-0.005913,-0.217964
4,-1.357391,0.690398,-0.157963,-0.130747
5,0.310983,1.871563,-0.015564,-0.767736
6,-1.347052,-0.315735,-0.162345,1.087668
7,-0.540738,-2.018482,0.396066,-0.641917
8,-2.046139,-0.391993,-2.103878,0.051924
9,0.456483,-1.559211,-0.738133,0.783015


In [66]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

# make an inner join between tables created above on column 'key'
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [67]:
# make an outer join between tables created above on column 'key'
pd.merge(left, right, on='key', how='outer')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## `groupby`
This process involves the following steps:
- splitting the data into groups based on some criteria
- applying a function to each group independently
- combining the results into a data structure

In [68]:
df = pd.DataFrame({
  'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
  'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
  'C': np.random.randn(8),
  'D': np.random.randn(8)
})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.173404,-0.601427
1,bar,one,1.368951,0.47376
2,foo,two,-1.595779,-0.135189
3,bar,three,0.894911,-0.860241
4,foo,two,-0.694474,-1.939723
5,bar,two,0.143305,1.034717
6,foo,one,-1.933586,1.061538
7,foo,three,-0.514218,0.101734


In [69]:
df.groupby('A').sum()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,onethreetwo,2.407168,0.648236
foo,onetwotwoonethree,-4.564654,-1.513067


In [70]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.368951,0.47376
bar,three,0.894911,-0.860241
bar,two,0.143305,1.034717
foo,one,-1.760182,0.460112
foo,three,-0.514218,0.101734
foo,two,-2.290253,-2.074912


In [71]:
df.groupby('A').agg({'C': np.sum, 'D': np.max})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,2.407168,1.034717
foo,-4.564654,1.061538


## Pandas Reshaping

## `stack`
The `stack()` method 'compresses' a level in the DataFrame's columns.

In [72]:
tuples = list(zip(*[
  ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
  ]))

tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [73]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [74]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.675202,-1.405056
bar,two,0.299552,-1.426453
baz,one,-0.893039,-1.142935
baz,two,1.839393,-0.249749
foo,one,-0.024802,-1.110505
foo,two,1.17584,1.548019
qux,one,1.862764,0.395311
qux,two,-1.157785,-0.338647


In [75]:
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.675202,-1.405056
bar,two,0.299552,-1.426453
baz,one,-0.893039,-1.142935
baz,two,1.839393,-0.249749


In [76]:
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.675202
               B   -1.405056
       two     A    0.299552
               B   -1.426453
baz    one     A   -0.893039
               B   -1.142935
       two     A    1.839393
               B   -0.249749
dtype: float64

In [77]:
stacked.dtype

dtype('float64')

With a `stacked` `DataFrame` or `Series` with multiple indices, the inverse operation `unstack()` will unstack the dataset at the **last level**.

In [78]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.675202,-1.405056
bar,two,0.299552,-1.426453
baz,one,-0.893039,-1.142935
baz,two,1.839393,-0.249749


## Pivot Tables

In [79]:
df = pd.DataFrame({
  'A': ['one', 'one', 'two', 'three'] * 3,
  'B': ['A', 'B', 'C'] * 4,
  'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
  'D': np.random.randn(12),
  'E': np.random.randn(12)})
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-1.019536,0.918883
1,one,B,foo,-0.2585,-0.500168
2,two,C,foo,-0.200054,0.223771
3,three,A,bar,0.291704,-0.851079
4,one,B,bar,0.011999,2.028179
5,one,C,bar,-1.337644,0.527949
6,two,A,foo,-1.997603,-0.229837
7,three,B,foo,1.495962,-1.351752
8,one,C,foo,-0.252545,0.327137
9,one,A,bar,-0.41108,-2.007238


In [80]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.41108,-1.019536
one,B,0.011999,-0.2585
one,C,-1.337644,-0.252545
three,A,0.291704,
three,B,,1.495962
three,C,0.282538,
two,A,,-1.997603
two,B,-0.491903,
two,C,,-0.200054


# Pandas Apply Functions
To apply another function to `Pandas` objects, there are two methods:
- tablewise function application: `pipe()`
- row or column-se function application: `apply()`

## Tablewise Function, `pipe()`

In [81]:
def extract_city_name(df):
  """
  Chicago, IL -> Chicago for `city_name` column
  """
  
  df['city_name'] = df['city_and_code'].str.split(',').str.get(0)
  return df

In [82]:
def add_country_name(df, country_name=None):
  """
  Chicago -> Chicago-US for `city_name` column 
  """
  col = 'city_name'
  df['city_and_country'] = df[col] + country_name
  return df

In [83]:
df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']})
add_country_name(extract_city_name(df_p), country_name='US')

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [84]:
df_p.pipe(
  extract_city_name
  ).pipe(
    add_country_name,
    country_name='US')

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


## Row or Column-wise Function Application

Arbitrary functions can be applied along the axes of a `DataFrame` using the `apply()` method, which takes an optional axis argument.

In [85]:
df = pd.DataFrame({
  'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
  'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
  'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])
})
df

Unnamed: 0,one,two,three
a,1.423483,0.104587,
b,-1.553681,0.024286,0.404061
c,-1.106997,-1.347762,0.085884
d,,-1.104183,0.748294


In [86]:
df.apply(np.mean)

one     -0.412398
two     -0.580768
three    0.412746
dtype: float64

In [87]:
df.apply(np.mean, axis=1)

a    0.764035
b   -0.375111
c   -0.789625
d   -0.177945
dtype: float64

In [88]:
df.apply(lambda x: x.max() - x.min())

one      2.977163
two      1.452350
three    0.662410
dtype: float64

In [89]:
df.apply(np.cumsum)

Unnamed: 0,one,two,three
a,1.423483,0.104587,
b,-0.130198,0.128874,0.404061
c,-1.237195,-1.218889,0.489945
d,,-2.323072,1.238239


In [90]:
df.apply(np.exp)

Unnamed: 0,one,two,three
a,4.151554,1.110252,
b,0.211468,1.024584,1.497895
c,0.33055,0.259821,1.08968
d,,0.331481,2.113392


In [91]:
def own_function(x):
  return x*x

df.apply(own_function)

Unnamed: 0,one,two,three
a,2.026303,0.010938,
b,2.413924,0.00059,0.163265
c,1.225441,1.816464,0.007376
d,,1.219221,0.559944


In [93]:
def subtract_and_divide(x, sub, divide=1):
  return (x - sub) / divide

df.apply(subtract_and_divide, args=(5,3))

Unnamed: 0,one,two,three
a,-1.192172,-1.631804,
b,-2.18456,-1.658571,-1.53198
c,-2.035666,-2.115921,-1.638039
d,,-2.034728,-1.417235


`args` has to be iterable. Even if only 1 argument is passed, it must be passed as a tuple, eg. `(5,)`.

In [95]:
def subtract(x, sub):
  return (x - sub)

df.apply(subtract, args=(5,))

Unnamed: 0,one,two,three
a,-3.576517,-4.895413,
b,-6.553681,-4.975714,-4.595939
c,-6.106997,-6.347762,-4.914116
d,,-6.104183,-4.251706
