# Pandas: Reading and Manipulating CSV Files

In [98]:
import numpy as np
import pandas as pd

In [99]:
df = pd.DataFrame({
  'name' : ["a", "b", "c", "d", "e","f", "g"],
  'age' : [20,27, 35, 55, 18, 21, 35],
  'designation': ["VP", "CEO", "CFO", "VP", "VP", "CEO", "MD"]
})

df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [100]:
df.to_csv('./_data/example.csv')

In [101]:
df_read = pd.read_csv('./_data/example.csv', index_col=0, header=0)
df_read

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [102]:
df_read.set_index('name')

Unnamed: 0_level_0,age,designation
name,Unnamed: 1_level_1,Unnamed: 2_level_1
a,20,VP
b,27,CEO
c,35,CFO
d,55,VP
e,18,VP
f,21,CEO
g,35,MD


# Pandas Basics 2

## Boolean Comparisons

In [103]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

df2 = df.copy()

In [104]:
df

Unnamed: 0,one,two,three
a,0.522104,0.121961,
b,-1.445086,-1.217037,0.607303
c,-0.395248,1.198273,-1.052488
d,,-0.841562,1.224351


In [105]:
df.gt(df2) # greater than

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


Note that `np.nan == np.nan` returns `False`.

In [106]:
df2.ne(df) # not equal

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [107]:
df2.eq(df) # equal

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [108]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [109]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [110]:
(df > 0).any().all()

True

In [111]:
(df > 0).any().any()

True

## Objects Comparisons

In [112]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [113]:
# element-wise comparison between arrays
# arrays must be of equal length to compare, otherwise it will result in an error
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [114]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [115]:
(df + df == df * 2).all().all()

False

In [116]:
# to evaluate nans as equals, use the equals method
(df + df).equals(df * 2)

True

## Descriptive Statistics

In [117]:
# aggregate each column
df.mean(0)

one     -0.439410
two     -0.184591
three    0.259722
dtype: float64

In [118]:
# aggregate each index
df.mean(1)

a    0.322033
b   -0.684940
c   -0.083154
d    0.191395
dtype: float64

By applying vectorized operations, various statistical procedures can be described.

In [119]:
ts_stand = (df - df.mean()) / df.std()
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

For convenience, the `describe()` method can be called to create a table of the most used statistical descriptors.

In [120]:
df.describe()

Unnamed: 0,one,two,three
count,3.0,4.0,3.0
mean,-0.43941,-0.184591,0.259722
std,0.984338,1.080716,1.177544
min,-1.445086,-1.217037,-1.052488
25%,-0.920167,-0.935431,-0.222592
50%,-0.395248,-0.3598,0.607303
75%,0.063428,0.391039,0.915827
max,0.522104,1.198273,1.224351


`describe()` can also be used to summarize non-numerical data.

In [121]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

## Indices of Minimum and Maximum Values

In [122]:
s1 = pd.Series(np.random.randn(5))
s1

0   -0.388874
1   -1.396360
2    0.795000
3    1.806742
4    2.297276
dtype: float64

In [123]:
s1.idxmin(), s1.idxmax()

(1, 4)

In [124]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df1

Unnamed: 0,A,B,C
0,-1.998147,-0.101697,0.703179
1,-0.70497,0.70356,0.582303
2,-1.49924,-1.120836,0.642003
3,1.183271,0.075758,-0.655817
4,-0.462932,0.771405,0.046178


In [125]:
df1.idxmin(axis=0)

A    0
B    2
C    3
dtype: int64

In [126]:
df1.idxmax(axis=1)

0    C
1    B
2    C
3    A
4    B
dtype: object

## Iterations
The behaviour of basic iterations over `pandas` objects depends on the type. WHen iterating over a `Series`, it is regarded as array-like and basic iterations produces the values. `DataFrames` follow the dict-like convention of iterating over the `keys` of the objects.

In short:
- `Series` produces values
- `DataFrame` produces column labels
To iterate over the rows of a `DataFrame`:
- `items()` to iterate over the (`key`, `value`) pairs
- `iterrows()` to iterate over the rows of a `DataFrame` as (`Index`, `Series`) pairs. This converts the rows to `Series` objects, which can change the `dtypes` and has some performance implications
- `itertuples()` to iterate over the rows of a `DataFrame` as `namedtuples` of the values. This is faster than `iterrows()` and is preferable to use to iterate over the values of a `DataFrame`

Iterating through `Pandas` objects is generally slow. In many cases, iterating manually over the rows is not needed and can be avoided.

### `items()`
Consistent with the dict-like interface, `items()` iterates through `key`/`value` pairs.
- `Series`produces (`index`, `scalar`-value) pairs
- `DataFrame` produces (`column`, `Series`) pairs

In [127]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


In [128]:
for label, ser in df.items():
  print(label)
  print(ser)

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


### `iterrows()`
`iterrows()` iterates through rows of a `DataFrame` as `Series` objects. It returns an iterator yielding each `index` value along with a `Series` containing the data in each row.

In [129]:
for row_index, row in df.iterrows():
  print(row_index, row, sep='\n')

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


### `itertuple()`
`itertuples()` will return an iterator yielding a `namedtuple` for each row in the `DataFrame`. The first element of the `tuple` will be the row's corresponding `index` value, while the remaining values are the row values.

In [130]:
for row in df.itertuples():
  print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')


# Pandas Viewing

In [131]:
import numpy as np
import pandas as pd

## Object Creation

In [132]:
s = pd.Series([1, 3, 5, np.nan, 6 ,8 ])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [133]:
dates = pd.date_range('20130101', periods=6)
dates


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [134]:
df2 = pd.DataFrame({
  'A': 1.0,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'})

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


## Viewing Data

In [135]:
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [136]:
df2.tail(3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [137]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [138]:
df.columns

Index(['a', 'b'], dtype='object')

In [139]:
df.to_numpy()

array([[1, 'a'],
       [2, 'b'],
       [3, 'c']], dtype=object)

`NumPy` arrays have one `dtype` for the entire array, while `DataFrames` allow one `dtype` per column. When `to_numpy()` is called on a `DataFrame`, `Pandas` will find the `NumPy` `dtype` that can hold all of the `dtypes` in the `DataFrame`. Given a dataset of different `dtypes`, this conversion results in `Numpy` interpreting the entire set as an `object`, which can be expensive (time- and memory-consuming) to navigate.

In [140]:
df.describe()

Unnamed: 0,a
count,3.0
mean,2.0
std,1.0
min,1.0
25%,1.5
50%,2.0
75%,2.5
max,3.0


In [141]:
df.T

Unnamed: 0,0,1,2
a,1,2,3
b,a,b,c


In [142]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,b,a
0,a,1
1,b,2
2,c,3


In [143]:
df.sort_values(by='b')

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


# Pandas Accessing

In [144]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df2 = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'})

## Getting

In [145]:
df['A']

2013-01-01    0.661819
2013-01-02   -0.708180
2013-01-03   -0.888874
2013-01-04   -0.725399
2013-01-05    1.061774
2013-01-06   -0.460668
Freq: D, Name: A, dtype: float64

In [146]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.661819,0.554899,-1.475403,1.535867
2013-01-02,-0.70818,-1.196534,0.083984,-0.689577
2013-01-03,-0.888874,1.51449,0.43987,-1.377084


In [147]:
df['20130102': '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.70818,-1.196534,0.083984,-0.689577
2013-01-03,-0.888874,1.51449,0.43987,-1.377084
2013-01-04,-0.725399,-0.469438,1.160348,-0.10571


In [148]:
df.loc['2013-01-01']

A    0.661819
B    0.554899
C   -1.475403
D    1.535867
Name: 2013-01-01 00:00:00, dtype: float64

In [149]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.661819,0.554899
2013-01-02,-0.70818,-1.196534
2013-01-03,-0.888874,1.51449
2013-01-04,-0.725399,-0.469438
2013-01-05,1.061774,0.098819
2013-01-06,-0.460668,0.156003


In [150]:
df.loc['20130102': '20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.70818,-1.196534
2013-01-03,-0.888874,1.51449
2013-01-04,-0.725399,-0.469438


The data type of the returned object is automatically changed based on the dimension of the object.

In [153]:
df.loc['20130102', ['A', 'B']] # returns a series

A   -0.708180
B   -1.196534
Name: 2013-01-02 00:00:00, dtype: float64

In [154]:
df.loc[dates[0], 'A'] # returns a scalar value

0.6618188564403709

## Selection by `dtype`

In [158]:
df = pd.DataFrame({
  'string': list('abc'),
  'int64': list(range(1, 4)),
  'uint8': np.arange(3, 6).astype('u1'),
  'float64': np.arange(4.0, 7.0),
  'bool1': [True, False, True],
  'bool2': [False, True, False],
  'dates': pd.date_range('now', periods=3),
  'category': pd.Series(list("ABC")).astype('category')})
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2023-09-03 17:03:31.058337,A
1,b,2,4,5.0,False,True,2023-09-04 17:03:31.058337,B
2,c,3,5,6.0,True,False,2023-09-05 17:03:31.058337,C


In [159]:
df.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


## Boolean indexing

In [160]:
df2[df2['A'] >= 0]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [162]:
df2 = df.copy()
df2['E'] = ['one', 'two', 'three']

In [163]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
1,b,2,4,5.0,False,True,2023-09-04 17:03:31.058337,B,two


In [164]:
# setting values by position
df.iat[0, 1] = -1
df.iloc[0, 1] = 2

# setting values by label
df.at['2013-01-01', 'A'] = -10
df.loc['2013-01-02', 'B'] = -20

# setting by assigning with a `NumPy` array
df.loc[:, 'C'] = np.array([50] * len(df))

In [165]:
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,A,B,C
0,a,2.0,3.0,4.0,True,False,2023-09-03 17:03:31.058337,A,,,50
1,b,2.0,4.0,5.0,False,True,2023-09-04 17:03:31.058337,B,,,50
2,c,3.0,5.0,6.0,True,False,2023-09-05 17:03:31.058337,C,,,50
2013-01-01,,,,,,,NaT,,-10.0,,50
2013-01-02,,,,,,,NaT,,,-20.0,50
