# Pandas: Reading and Manipulating CSV Files

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.DataFrame({
  'name' : ["a", "b", "c", "d", "e","f", "g"],
  'age' : [20,27, 35, 55, 18, 21, 35],
  'designation': ["VP", "CEO", "CFO", "VP", "VP", "CEO", "MD"]
})

df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [4]:
df.to_csv('./_data/example.csv')

In [5]:
df_read = pd.read_csv('./_data/example.csv', index_col=0, header=0)
df_read

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [6]:
df_read.set_index('name')

Unnamed: 0_level_0,age,designation
name,Unnamed: 1_level_1,Unnamed: 2_level_1
a,20,VP
b,27,CEO
c,35,CFO
d,55,VP
e,18,VP
f,21,CEO
g,35,MD


# Pandas Basics 2

## Boolean Comparisons

In [7]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

df2 = df.copy()

In [8]:
df

Unnamed: 0,one,two,three
a,0.165992,0.313001,
b,0.854618,0.321083,-0.933479
c,0.709733,-0.066497,0.479029
d,,1.007828,0.261696


In [9]:
df.gt(df2) # greater than

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


Note that `np.nan == np.nan` returns `False`.

In [10]:
df2.ne(df) # not equal

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


In [11]:
df2.eq(df) # equal

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [12]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [13]:
(df > 0).any()

one      True
two      True
three    True
dtype: bool

In [14]:
(df > 0).any().all()

True

In [15]:
(df > 0).any().any()

True

## Objects Comparisons

In [16]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'

0     True
1    False
2    False
dtype: bool

In [17]:
# element-wise comparison between arrays
# arrays must be of equal length to compare, otherwise it will result in an error
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

In [18]:
(df + df == df * 2).all()

one      False
two       True
three    False
dtype: bool

In [19]:
(df + df == df * 2).all().all()

False

In [20]:
# to evaluate nans as equals, use the equals method
(df + df).equals(df * 2)

True

## Descriptive Statistics

In [21]:
# aggregate each column
df.mean(0)

one      0.576781
two      0.393854
three   -0.064251
dtype: float64

In [22]:
# aggregate each index
df.mean(1)

a    0.239497
b    0.080741
c    0.374088
d    0.634762
dtype: float64

By applying vectorized operations, various statistical procedures can be described.

In [23]:
ts_stand = (df - df.mean()) / df.std()
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

For convenience, the `describe()` method can be called to create a table of the most used statistical descriptors.

In [24]:
df.describe()

Unnamed: 0,one,two,three
count,3.0,4.0,3.0
mean,0.576781,0.393854,-0.064251
std,0.363055,0.447482,0.760576
min,0.165992,-0.066497,-0.933479
25%,0.437863,0.218127,-0.335892
50%,0.709733,0.317042,0.261696
75%,0.782176,0.492769,0.370362
max,0.854618,1.007828,0.479029


`describe()` can also be used to summarize non-numerical data.

In [25]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

## Indices of Minimum and Maximum Values

In [26]:
s1 = pd.Series(np.random.randn(5))
s1

0    0.327925
1   -0.106121
2    1.083878
3   -0.764457
4   -0.016792
dtype: float64

In [27]:
s1.idxmin(), s1.idxmax()

(3, 2)

In [28]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df1

Unnamed: 0,A,B,C
0,-0.407979,0.868933,-1.018246
1,-1.827086,0.133828,-0.07114
2,0.190857,-0.314726,1.504471
3,-0.126578,-0.607828,-0.501057
4,-0.697708,-0.062827,1.728351


In [29]:
df1.idxmin(axis=0)

A    1
B    3
C    0
dtype: int64

In [30]:
df1.idxmax(axis=1)

0    B
1    B
2    C
3    A
4    C
dtype: object

## Iterations
The behaviour of basic iterations over `pandas` objects depends on the type. WHen iterating over a `Series`, it is regarded as array-like and basic iterations produces the values. `DataFrames` follow the dict-like convention of iterating over the `keys` of the objects.

In short:
- `Series` produces values
- `DataFrame` produces column labels
To iterate over the rows of a `DataFrame`:
- `items()` to iterate over the (`key`, `value`) pairs
- `iterrows()` to iterate over the rows of a `DataFrame` as (`Index`, `Series`) pairs. This converts the rows to `Series` objects, which can change the `dtypes` and has some performance implications
- `itertuples()` to iterate over the rows of a `DataFrame` as `namedtuples` of the values. This is faster than `iterrows()` and is preferable to use to iterate over the values of a `DataFrame`

Iterating through `Pandas` objects is generally slow. In many cases, iterating manually over the rows is not needed and can be avoided.

### `items()`
Consistent with the dict-like interface, `items()` iterates through `key`/`value` pairs.
- `Series`produces (`index`, `scalar`-value) pairs
- `DataFrame` produces (`column`, `Series`) pairs

In [31]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
df

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


In [32]:
for label, ser in df.items():
  print(label)
  print(ser)

a
0    1
1    2
2    3
Name: a, dtype: int64
b
0    a
1    b
2    c
Name: b, dtype: object


### `iterrows()`
`iterrows()` iterates through rows of a `DataFrame` as `Series` objects. It returns an iterator yielding each `index` value along with a `Series` containing the data in each row.

In [33]:
for row_index, row in df.iterrows():
  print(row_index, row, sep='\n')

0
a    1
b    a
Name: 0, dtype: object
1
a    2
b    b
Name: 1, dtype: object
2
a    3
b    c
Name: 2, dtype: object


### `itertuple()`
`itertuples()` will return an iterator yielding a `namedtuple` for each row in the `DataFrame`. The first element of the `tuple` will be the row's corresponding `index` value, while the remaining values are the row values.

In [34]:
for row in df.itertuples():
  print(row)

Pandas(Index=0, a=1, b='a')
Pandas(Index=1, a=2, b='b')
Pandas(Index=2, a=3, b='c')


# Pandas Viewing

In [35]:
import numpy as np
import pandas as pd

## Object Creation

In [36]:
s = pd.Series([1, 3, 5, np.nan, 6 ,8 ])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [37]:
dates = pd.date_range('20130101', periods=6)
dates


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [38]:
df2 = pd.DataFrame({
  'A': 1.0,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'})

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


## Viewing Data

In [39]:
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [40]:
df2.tail(3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [41]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [42]:
df.columns

Index(['a', 'b'], dtype='object')

In [43]:
df.to_numpy()

array([[1, 'a'],
       [2, 'b'],
       [3, 'c']], dtype=object)

`NumPy` arrays have one `dtype` for the entire array, while `DataFrames` allow one `dtype` per column. When `to_numpy()` is called on a `DataFrame`, `Pandas` will find the `NumPy` `dtype` that can hold all of the `dtypes` in the `DataFrame`. Given a dataset of different `dtypes`, this conversion results in `Numpy` interpreting the entire set as an `object`, which can be expensive (time- and memory-consuming) to navigate.

In [44]:
df.describe()

Unnamed: 0,a
count,3.0
mean,2.0
std,1.0
min,1.0
25%,1.5
50%,2.0
75%,2.5
max,3.0


In [45]:
df.T

Unnamed: 0,0,1,2
a,1,2,3
b,a,b,c


In [46]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,b,a
0,a,1
1,b,2
2,c,3


In [47]:
df.sort_values(by='b')

Unnamed: 0,a,b
0,1,a
1,2,b
2,3,c


# Pandas Accessing

In [48]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df2 = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'})

## Getting

In [49]:
df['A']

2013-01-01   -0.383893
2013-01-02   -0.525907
2013-01-03    0.146090
2013-01-04   -1.920851
2013-01-05    0.340894
2013-01-06   -0.340289
Freq: D, Name: A, dtype: float64

In [50]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.383893,-0.857643,-1.766606,0.299045
2013-01-02,-0.525907,0.905896,-0.522837,-0.454788
2013-01-03,0.14609,-0.31514,0.2286,0.611255


In [51]:
df['20130102': '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.525907,0.905896,-0.522837,-0.454788
2013-01-03,0.14609,-0.31514,0.2286,0.611255
2013-01-04,-1.920851,-0.248796,-0.59642,-0.20047


In [52]:
df.loc['2013-01-01']

A   -0.383893
B   -0.857643
C   -1.766606
D    0.299045
Name: 2013-01-01 00:00:00, dtype: float64

In [53]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.383893,-0.857643
2013-01-02,-0.525907,0.905896
2013-01-03,0.14609,-0.31514
2013-01-04,-1.920851,-0.248796
2013-01-05,0.340894,1.680748
2013-01-06,-0.340289,-1.034107


In [54]:
df.loc['20130102': '20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.525907,0.905896
2013-01-03,0.14609,-0.31514
2013-01-04,-1.920851,-0.248796


The data type of the returned object is automatically changed based on the dimension of the object.

In [55]:
df.loc['20130102', ['A', 'B']] # returns a series

A   -0.525907
B    0.905896
Name: 2013-01-02 00:00:00, dtype: float64

In [56]:
df.loc[dates[0], 'A'] # returns a scalar value

-0.38389263847300303

## Selection by `dtype`

In [57]:
df = pd.DataFrame({
  'string': list('abc'),
  'int64': list(range(1, 4)),
  'uint8': np.arange(3, 6).astype('u1'),
  'float64': np.arange(4.0, 7.0),
  'bool1': [True, False, True],
  'bool2': [False, True, False],
  'dates': pd.date_range('now', periods=3),
  'category': pd.Series(list("ABC")).astype('category')})
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2023-09-03 21:59:00.828473,A
1,b,2,4,5.0,False,True,2023-09-04 21:59:00.828473,B
2,c,3,5,6.0,True,False,2023-09-05 21:59:00.828473,C


In [58]:
df.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


## Boolean indexing

In [59]:
df2[df2['A'] >= 0]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [60]:
df2 = df.copy()
df2['E'] = ['one', 'two', 'three']

In [61]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
1,b,2,4,5.0,False,True,2023-09-04 21:59:00.828473,B,two


In [62]:
# setting values by position
df.iat[0, 1] = -1
df.iloc[0, 1] = 2

# setting values by label
df.at['2013-01-01', 'A'] = -10
df.loc['2013-01-02', 'B'] = -20

# setting by assigning with a `NumPy` array
df.loc[:, 'C'] = np.array([50] * len(df))

In [63]:
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,A,B,C
0,a,2.0,3.0,4.0,True,False,2023-09-03 21:59:00.828473,A,,,50
1,b,2.0,4.0,5.0,False,True,2023-09-04 21:59:00.828473,B,,,50
2,c,3.0,5.0,6.0,True,False,2023-09-05 21:59:00.828473,C,,,50
2013-01-01,,,,,,,NaT,,-10.0,,50
2013-01-02,,,,,,,NaT,,,-20.0,50


# Pandas `merge` and `groupby`

## `merge`

`pandas` provides various facilities for easily combining `Series` and `DataFrame` objects with various kinds of set logic for the indexes and relational algebra functionality in the case of `join`/`merge`-type operations.

In [67]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.865549,-0.167666,-0.227229,0.944009
1,-0.770243,0.614083,-0.565157,0.632182
2,-1.596902,-0.219942,-0.741959,2.195849
3,-1.09819,-0.15905,-0.507532,0.402843
4,0.001309,-2.446923,-0.119008,-1.82379
5,-0.117638,1.322125,2.22632,0.521135
6,1.116767,-0.273959,-0.475554,0.09395
7,-0.062335,-0.411201,-0.596139,-0.150893
8,0.199126,-0.786165,-1.024108,0.910637
9,0.281302,-1.222595,-1.677405,-0.223177


In [68]:
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.865549,-0.167666,-0.227229,0.944009
1,-0.770243,0.614083,-0.565157,0.632182
2,-1.596902,-0.219942,-0.741959,2.195849
3,-1.09819,-0.15905,-0.507532,0.402843
4,0.001309,-2.446923,-0.119008,-1.82379
5,-0.117638,1.322125,2.22632,0.521135
6,1.116767,-0.273959,-0.475554,0.09395
7,-0.062335,-0.411201,-0.596139,-0.150893
8,0.199126,-0.786165,-1.024108,0.910637
9,0.281302,-1.222595,-1.677405,-0.223177


In [71]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

# make an inner join between tables created above on column 'key'
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [72]:
# make an outer join between tables created above on column 'key'
pd.merge(left, right, on='key', how='outer')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## `groupby`
This process involves the following steps:
- splitting the data into groups based on some criteria
- applying a function to each group independently
- combining the results into a data structure

In [73]:
df = pd.DataFrame({
  'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
  'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
  'C': np.random.randn(8),
  'D': np.random.randn(8)
})
df

Unnamed: 0,A,B,C,D
0,foo,one,0.691079,0.036075
1,bar,one,-0.372913,1.088279
2,foo,two,1.785401,0.765463
3,bar,three,0.242244,1.452041
4,foo,two,0.863446,1.393309
5,bar,two,-1.420759,-0.509144
6,foo,one,-0.599213,1.474043
7,foo,three,1.372685,-0.419184


In [74]:
df.groupby('A').sum()

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,onethreetwo,-1.551428,2.031176
foo,onetwotwoonethree,4.113398,3.249706


In [75]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.372913,1.088279
bar,three,0.242244,1.452041
bar,two,-1.420759,-0.509144
foo,one,0.091865,1.510117
foo,three,1.372685,-0.419184
foo,two,2.648847,2.158772


In [76]:
df.groupby('A').agg({'C': np.sum, 'D': np.max})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.551428,1.452041
foo,4.113398,1.474043
