# Pandas: Reading and Manipulating CSV Files

In [41]:
import numpy as np
import pandas as pd

In [42]:
df = pd.DataFrame({
  'name' : ["a", "b", "c", "d", "e","f", "g"],
  'age' : [20,27, 35, 55, 18, 21, 35],
  'designation': ["VP", "CEO", "CFO", "VP", "VP", "CEO", "MD"]
})

df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [43]:
df.to_csv('./_data/example.csv')

In [44]:
df_read = pd.read_csv('./_data/example.csv', index_col=0, header=0)
df_read

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [45]:
df_read.set_index('name')

Unnamed: 0_level_0,age,designation
name,Unnamed: 1_level_1,Unnamed: 2_level_1
a,20,VP
b,27,CEO
c,35,CFO
d,55,VP
e,18,VP
f,21,CEO
g,35,MD


# Pandas Basics 2

## Boolean Comparisons

In [46]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

df2 = df.copy()

In [47]:
df

Unnamed: 0,one,two,three
a,-0.80024,0.604595,
b,1.481403,-1.742646,-0.386179
c,-0.257375,2.386778,0.196866
d,,0.231095,1.504217


In [48]:
df.gt(df2) # greater than

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


Note that `np.nan == np.nan` returns `False`.

In [49]:
df2.ne(df) # not equal

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [50]:
df2.eq(df) # equal

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [51]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [52]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [53]:
(df > 0).any().all()

True

In [54]:
(df > 0).any().any()

True

## Objects Comparisons

In [55]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [56]:
# element-wise comparison between arrays
# arrays must be of equal length to compare, otherwise it will result in an error
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [57]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [58]:
(df + df == df * 2).all().all()

False

In [59]:
# to evaluate nans as equals, use the equals method
(df + df).equals(df * 2)

True

## Descriptive Statistics

In [62]:
# aggregate each column
df.mean(0)

one      0.141263
two      0.369955
three    0.438301
dtype: float64

In [63]:
# aggregate each index
df.mean(1)

a   -0.097823
b   -0.215808
c    0.775423
d    0.867656
dtype: float64

By applying vectorized operations, various statistical procedures can be described.

In [65]:
ts_stand = (df - df.mean()) / df.std()
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

For convenience, the `describe()` method can be called to create a table of the most used statistical descriptors.

In [66]:
df.describe()

Unnamed: 0,one,two,three
count,3.0,4.0,3.0
mean,0.141263,0.369955,0.438301
std,1.191913,1.693615,0.968048
min,-0.80024,-1.742646,-0.386179
25%,-0.528807,-0.26234,-0.094656
50%,-0.257375,0.417845,0.196866
75%,0.612014,1.05014,0.850542
max,1.481403,2.386778,1.504217


`describe()` can also be used to summarize non-numerical data.

In [68]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

## Indices of Minimum and Maximum Values

In [69]:
s1 = pd.Series(np.random.randn(5))
s1

0   -3.069321
1    0.940068
2   -0.722736
3    2.366161
4    0.285430
dtype: float64

In [71]:
s1.idxmin(), s1.idxmax()

(0, 3)

In [72]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df1

Unnamed: 0,A,B,C
0,0.742538,0.847782,0.709761
1,-1.616103,-0.545731,0.846971
2,0.155982,-0.630523,0.274939
3,1.18856,-1.72507,0.059133
4,0.790463,0.713559,-0.467424


In [73]:
df1.idxmin(axis=0)

A    1
B    3
C    4
dtype: int64

In [74]:
df1.idxmax(axis=1)

0    B
1    C
2    C
3    A
4    A
dtype: object

## Iterations
The behaviour of basic iterations over `pandas` objects depends on the type. WHen iterating over a `Series`, it is regarded as array-like and basic iterations produces the values. `DataFrames` follow the dict-like convention of iterating over the `keys` of the objects.

In short:
- `Series` produces values
- `DataFrame` produces column labels
To iterate over the rows of a `DataFrame`:
- `items()` to iterate over the (`key`, `value`) pairs
- `iterrows()` to iterate over the rows of a `DataFrame` as (`Index`, `Series`) pairs. This converts the rows to `Series` objects, which can change the `dtypes` and has some performance implications
- `itertuples()` to iterate over the rows of a `DataFrame` as `namedtuples` of the values. This is faster than `iterrows()` and is preferable to use to iterate over the values of a `DataFrame`

Iterating through `Pandas` objects is generally slow. In many cases, iterating manually over the rows is not needed and can be avoided.

### `items()`
Consistent with the dict-like interface, `items()` iterates through `key`/`value` pairs.
- `Series`produces (`index`, `scalar`-value) pairs
- `DataFrame` produces (`column`, `Series`) pairs

In [75]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


In [77]:
for label, ser in df.items():
  print(label)
  print(ser)

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


### `iterrows()`
`iterrows()` iterates through rows of a `DataFrame` as `Series` objects. It returns an iterator yielding each `index` value along with a `Series` containing the data in each row.

In [78]:
for row_index, row in df.iterrows():
  print(row_index, row, sep='\n')

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


### `itertuple()`
`itertuples()` will return an iterator yielding a `namedtuple` for each row in the `DataFrame`. The first element of the `tuple` will be the row's corresponding `index` value, while the remaining values are the row values.

In [83]:
for row in df.itertuples():
  print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')
