In [42]:
import numpy as np
import pandas as pd

## Boolean comparisons

Series and DataFrame have the binary comparison methods eq, ne, lt, gt, le, ge whose behavior is vectorized.

Note: np.nan == np.nan returns False

In [21]:
# Three columns with different label sets: the DataFrame constructor aligns
# them on the union of the labels and leaves NaN where a column has no entry.
one = pd.Series(np.random.randn(3), index=list('abc'))
two = pd.Series(np.random.randn(4), index=list('abcd'))
three = pd.Series(np.random.randn(3), index=list('bcd'))
df = pd.DataFrame({'one': one, 'two': two, 'three': three})

# Separate copy of the frame, displayed as the cell result.
df2 = df.copy()
df2


Unnamed: 0,one,two,three
a,1.277004,0.73209,
b,0.858675,0.794279,0.263021
c,0.001258,0.80394,0.71417
d,,0.0601,-0.91006


In [8]:
# Element-wise equality via the eq method. Cells holding NaN compare False
# even though df2 is a copy of df (see 'd'/'one' and 'a'/'three' in the output).
df.eq(df2)

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [3]:
# Element-wise "not equal": the mirror image of eq, so only the NaN cells are True.
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


Reductions: empty, any(), all(), bool()

In [22]:
# Per column: is every entry strictly positive? (NaN > 0 is False.)
df.gt(0).all()

one      False
two       True
three    False
dtype: bool

In [24]:
# Per column: is at least one entry strictly positive?
df.gt(0).any()

one      True
two      True
three    True
dtype: bool

In [25]:
# Reduce twice (columns first, then the resulting Series) to get one boolean
# for the whole frame.
df.gt(0).any().any()

True

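To evaluate a single-element pandas object as a boolean, use bool(). Note: Series.bool() and DataFrame.bool() are deprecated since pandas 2.1; extracting the single element with .item() is the recommended replacement.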

In [26]:
# Series.bool() is deprecated since pandas 2.1, so extract the single element
# with .item() instead. It returns a plain Python scalar and raises ValueError
# unless the Series holds exactly one element. Unlike .bool(), it does not
# check that the element is boolean.
pd.Series([True]).item()

True

In [28]:
# DataFrame.bool() is deprecated since pandas 2.1. Going through the NumPy
# array and calling .item() returns the lone element as a plain Python scalar
# and raises ValueError unless the frame holds exactly one element. Unlike
# .bool(), it does not check that the element is boolean.
pd.DataFrame([[True]]).to_numpy().item()

True

## Object comparison

In [29]:
# Comparing against a scalar checks every element against that one value.
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [31]:
# Comparing against an equal-length array-like (here an Index) is done
# element by element.
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

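Be careful with NaN: NaN never compares equal to anything, itself included, so an element-wise == reports False wherever a value is missing.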

In [32]:
# Both sides are computed from the same frame, yet the cells holding NaN
# still compare False.
(df + df == df * 2)

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [33]:
# NaN does not compare equal to itself.
np.nan == np.nan

False

In [34]:
# equals() compares the two objects as a whole and counts NaNs in matching
# positions as equal, hence True here.
(df + df).equals(df * 2)

True

## Descriptive Statistics

In [35]:
# One mean per column (axis=0 reduces down the rows).
df.mean(axis=0)

one      0.712312
two      0.597602
three    0.022377
dtype: float64

In [36]:
# One mean per row (axis=1 reduces across the columns).
df.mean(axis=1)

a    1.004547
b    0.638658
c    0.506456
d   -0.424980
dtype: float64

Standardization => rescaling the data so that each column has zero mean and a standard deviation of 1

In [38]:
# Standardize each column: subtract the column mean, then divide by the
# column standard deviation.
ts_stand = df.sub(df.mean()).div(df.std())
ts_stand

Unnamed: 0,one,two,three
a,0.868296,0.373841,
b,0.225054,0.546709,0.287017
c,-1.09335,0.573565,0.825107
d,,-1.494114,-1.112124


In [39]:
# Sanity check: after standardization every column's standard deviation is 1.
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

## Describe

In [40]:
# 1000 standard-normal draws, with every other position (0, 2, 4, ...) blanked out.
series = pd.Series(np.random.randn(1000))
series.iloc[::2] = np.nan

# describe() leaves the NaN entries out of its summary (count is 500).
series.describe()

count    500.000000
mean       0.013481
std        1.000741
min       -3.095179
25%       -0.639082
50%        0.058618
75%        0.669581
max        2.736552
dtype: float64

In [46]:
# 1000 x 5 frame of uniform [0, 1) draws with every other row set to NaN.
frame = pd.DataFrame(np.random.rand(1000, 5), columns=list('abcde'))
frame.iloc[::2] = np.nan

# Column-wise summary; the NaN rows are left out (count is 500 per column).
frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,0.489071,0.519899,0.498278,0.492544,0.510133
std,0.287876,0.280541,0.284346,0.290087,0.293504
min,0.004715,0.006252,0.000764,0.00257,0.002479
25%,0.257738,0.274585,0.263253,0.236047,0.253183
50%,0.464603,0.510073,0.50098,0.483856,0.529868
75%,0.732618,0.775266,0.730575,0.739839,0.760048
max,0.999419,0.998628,0.997379,0.999482,0.995506


For a non-numerical Series object, describe() gives a simple summary: the number of non-missing values, the number of unique values, and the most frequent value with its frequency.

In [47]:
# Ten entries: six letters, one missing value, then three more letters.
s = pd.Series(list('aabbaa') + [np.nan] + list('cda'))

# For non-numeric data the summary reports count, unique, top and freq.
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

## Index of min/max values

In [48]:
# Five random draws on the default integer index 0..4.
s1 = pd.Series(np.random.randn(5))
s1

0    0.331730
1    0.769967
2   -0.243468
3    0.888852
4    0.344207
dtype: float64

In [50]:
# Index labels of the smallest and the largest value, shown as a tuple.
s1.idxmin(), s1.idxmax()

(2, 3)

In [53]:
# 5 x 3 frame of random draws with columns A, B, C and the default row index.
df1 = pd.DataFrame(np.random.randn(5, 3), columns=list('ABC'))
df1

Unnamed: 0,A,B,C
0,-1.132923,0.475811,0.008192
1,0.873008,1.77852,-1.139381
2,0.303268,2.644056,0.559425
3,-0.301247,0.148839,1.799947
4,-0.790245,-0.921302,0.15019


In [54]:
# axis=0: for each column, the row label where its minimum sits.
df1.idxmin(axis=0)

A    0
B    4
C    1
dtype: int64

In [55]:
# axis=1: for each row, the column label where its maximum sits.
df1.idxmax(axis=1)

0    B
1    B
2    B
3    C
4    C
dtype: object

## Iterations

In [57]:
df = pd.DataFrame(
    {'col1': np.random.randn(3), 'col2': np.random.randn(3)},
    index=list('abc'),
)

# Iterating over a DataFrame directly yields its column labels.
for col in df:
    print(col)

col1
col2


items(): to iterate over the (key, value) pairs.

iterrows(): Iterate over the rows of a DataFrame as (index, Series) pairs. This converts the rows to Series objects, which can change the dtypes and has some performance implications.

itertuples(): Iterate over the rows of a DataFrame as namedtuples of the values. This is a lot faster than iterrows() and is in most cases preferable to use to iterate over the values of a DataFrame.

### items

In [61]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})

# items() yields one (column label, column Series) pair per column.
for label, ser in df.items():
    print(label, ser, sep='\n')

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


### iterrows

In [62]:
# iterrows() yields (index label, row) pairs. Each row arrives as a Series,
# so the mixed int/str values share one object dtype (see the output).
for row_index, row in df.iterrows():
    print(row_index, row, sep='\n')

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


### itertuples

In [63]:
# itertuples() yields each row as a namedtuple; its first field, Index,
# holds the row label, followed by one field per column.
for row in df.itertuples():
    print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')
