# Introduction to pandas
- A Python library where data is presented in a tabular format
- Provides data structures and data manipulation tools
- Efficient at handling large datasets. (How large is "large"?)
- Efficient tool for data cleaning and data analysis

In [1]:
import pandas as pd
import numpy as np

## Series (not a dataframe) in pandas
- This is not a 2D data structure. In fact, a Series is a 1D object which is very similar to a numpy array.
- What is the difference between an array and a Series?
  - An array's elements are addressed only by integer position, whereas a Series's elements also carry an index label
- A Series is similar to a single column of a table
- Data is stored in a Series as key-value pairs (index label → value).

In [2]:
# Create an empty Series. The dtype is given explicitly because calling
# pd.Series() with no data and no dtype makes pandas warn about the
# default dtype of an empty Series on the versions that produced this output.
data = pd.Series(dtype=object)
type(data)

  data = pd.Series()


pandas.core.series.Series

In [3]:
# Build a Series from a plain list; the printed result shows the default
# integer index 0..3 that pandas assigns.
lakes = [
    'Moraine Lake',
    'Lake Louise',
    'Annette Lake',
    'Maligne Lake',
]
lakesSeries = pd.Series(data=lakes)
print(lakesSeries)

0    Moraine Lake
1     Lake Louise
2    Annette Lake
3    Maligne Lake
dtype: object


In [4]:
# Confirm that the object built from the list is a pandas Series.
type(lakesSeries)

pandas.core.series.Series

In [5]:
# Same construction, but with explicit string labels ('M', 'L', 'A')
# in place of the default integer index.
lakes = [
    'Moraine Lake',
    'Lake Louise',
    'Annette Lake',
]
lakesSeries = pd.Series(data=lakes, index=list('MLA'))
print(lakesSeries)

M    Moraine Lake
L     Lake Louise
A    Annette Lake
dtype: object


In [9]:
# Build the Series from a dict: the keys become the index labels and the
# dict values become the Series values.
lakes = dict(
    M='Moraine Lake',
    L='Lake Louise',
    A='Annette Lake',
    P='Peyto',
)
lakesSeries = pd.Series(data=lakes)
print(lakesSeries)

M    Moraine Lake
L     Lake Louise
A    Annette Lake
P           Peyto
dtype: object


In [10]:
# Access a Series element by its index label (the key it was stored under).
lakesSeries['M']

'Moraine Lake'

In [11]:
# Access a Series element by integer position. The index here holds string
# labels, so plain `lakesSeries[0]` would rely on pandas' deprecated fallback
# of treating an integer key as a position; .iloc states the intent explicitly.
lakesSeries.iloc[0]

'Moraine Lake'

In [12]:
# Slicing by position: elements at positions 0, 1 and 2
# (the stop position 3 is excluded).
lakesSeries.iloc[0:3]

M    Moraine Lake
L     Lake Louise
A    Annette Lake
dtype: object

In [14]:
# Negative positions count from the end: -2:-1 selects only the
# second-to-last element, because the stop position is excluded.
lakesSeries.iloc[-2:-1]

A    Annette Lake
dtype: object

In [15]:
# Positional slice with a step: every second element, starting at position 0
# and stopping before position 5.
lakesSeries.iloc[0:5:2]

M    Moraine Lake
A    Annette Lake
dtype: object

In [18]:
# A list of labels selects several elements at once and returns a Series.
lakesSeries[['M', 'A']]

M    Moraine Lake
A    Annette Lake
dtype: object

In [21]:
# Slicing by label: unlike a positional slice, the end label ('A') is
# included in the result.
lakesSeries['M':'A']

M    Moraine Lake
L     Lake Louise
A    Annette Lake
dtype: object

#### We can apply broadcasting and arithmetic operations to a Series, just as with a NumPy array

In [22]:
# Five evenly spaced integers (10, 20, ..., 50) wrapped in a Series
# with the default integer index.
ser = pd.Series(
    data=np.arange(10, 60, 10),
)
ser

0    10
1    20
2    30
3    40
4    50
dtype: int32

In [23]:
# Broadcasting: the scalar 2 is applied to every element of the Series.
# NOTE: this cell reassigns `ser`, so running it more than once doubles the
# values again; re-run the cell that creates `ser` before repeating it.
ser = ser * 2
ser

0     20
1     40
2     60
3     80
4    100
dtype: int32

In [24]:
# Largest value in the Series.
ser.max()

100

In [25]:
# Arithmetic mean of the values; the result is a float.
ser.mean()

60.0

In [26]:
# Integer position (not the value) of the largest element.
ser.argmax()

4

## DataFrame in pandas
- 2D data structure that contains data in a tabular format
- A DataFrame is a collection of columns
- A DataFrame is a collection of Series that share one index: each column is a Series whose name is the column header and whose values are that column's data values

In [29]:
# Each dict key becomes a column header and each list supplies that
# column's values; the rows get the default integer index.
data = dict(
    Name=['Connor', 'Leon', 'Nikita', 'David'],
    Points=[99, 82, 34, 12],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Points
0,Connor,99
1,Leon,82
2,Nikita,34
3,David,12


In [30]:
# Confirm that pd.DataFrame(...) produced a DataFrame object.
type(df)

pandas.core.frame.DataFrame

In [35]:
# Seed NumPy's global generator so the random values below are reproducible,
# then build a 5x4 frame with labelled rows (A-E) and named columns.
np.random.seed(100)
df = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=list('ABCDE'),
    columns=['One', 'Two', 'Three', 'Four'],
)
df

Unnamed: 0,One,Two,Three,Four
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


In [36]:
# Transposing swaps rows and columns. `.T` returns the transposed frame;
# `df` itself is left unchanged, as the next display of `df` shows.
df.T

Unnamed: 0,A,B,C,D,E
One,-1.749765,0.981321,-0.189496,-0.583595,-0.53128
Two,0.34268,0.514219,0.255001,0.816847,1.029733
Three,1.153036,0.22118,-0.458027,0.672721,-0.438136
Four,-0.252436,-1.070043,0.435163,-0.104411,-1.118318


In [37]:
df

Unnamed: 0,One,Two,Three,Four
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


#### Accessing columns

In [38]:
# Select a single column by its header; the result keeps the row labels
# as its index.
df['One']

A   -1.749765
B    0.981321
C   -0.189496
D   -0.583595
E   -0.531280
Name: One, dtype: float64

In [39]:
# A single column selected with df[...] is a Series.
type(df['One'])

pandas.core.series.Series

In [41]:
# The frame as a whole, in contrast to a single column, is a DataFrame.
type(df)

pandas.core.frame.DataFrame

In [43]:
# A list of headers selects several columns and returns a DataFrame.
df[['One', 'Two']]

Unnamed: 0,One,Two
A,-1.749765,0.34268
B,0.981321,0.514219
C,-0.189496,0.255001
D,-0.583595,0.816847
E,-0.53128,1.029733


#### Accessing rows

In [44]:
df

Unnamed: 0,One,Two,Three,Four
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


In [46]:
# .iloc selects by integer position: position 1 is the second row
# (labelled 'B'), returned as a Series indexed by the column headers.
df.iloc[1]

One      0.981321
Two      0.514219
Three    0.221180
Four    -1.070043
Name: B, dtype: float64

In [47]:
# .loc selects by index label: the row labelled 'B', returned as a Series.
df.loc['B']

One      0.981321
Two      0.514219
Three    0.221180
Four    -1.070043
Name: B, dtype: float64

In [48]:
# Positional slice: rows at positions 1, 2 and 3; the stop position 4
# is excluded.
df.iloc[1:4]

Unnamed: 0,One,Two,Three,Four
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411


In [49]:
# Label slice: rows 'B' through 'D'; with .loc the end label is included.
df.loc['B':'D']

Unnamed: 0,One,Two,Three,Four
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411


In [50]:
df

Unnamed: 0,One,Two,Three,Four
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


In [57]:
# Scalar at row position 0, column position 0, fetched in a single .iloc call.
# The chained form df.iloc[0][0] first builds a row Series indexed by the
# column headers, and its trailing [0] then relies on the deprecated
# fallback that treats an integer key as a position.
df.iloc[0, 0]

-1.7497654730546974

In [58]:
# Row by label, then the first entry of that row by position. The row is a
# Series indexed by the column headers, so .iloc is used for the positional
# step; a bare [0] there relies on a deprecated integer-key fallback.
df.loc['A'].iloc[0]

-1.7497654730546974

In [61]:
# Column by header, then the first entry of that column by position. The
# column is a Series indexed by the row labels 'A'..'E', so .iloc is used;
# a bare [0] there relies on a deprecated integer-key fallback.
df['One'].iloc[0]

-1.7497654730546974

In [62]:
# Scalar by row label and column header in a single .loc[row, column] lookup,
# instead of chaining two separate indexing operations.
df.loc['A', 'One']

-1.7497654730546974

In [63]:
# Chained selection: .loc[...] first picks rows 'A' and 'D', then [...]
# picks columns 'One' and 'Three' from that intermediate result.
df.loc[['A','D']][['One','Three']]

Unnamed: 0,One,Three
A,-1.749765,1.153036
D,-0.583595,0.672721


In [64]:
df

Unnamed: 0,One,Two,Three,Four
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


In [68]:
# Chained selection again: rows by label with .loc, then columns by header
# with [].
df.loc[['A', 'D']][['One', 'Three']]

Unnamed: 0,One,Three
A,-1.749765,1.153036
D,-0.583595,0.672721


In [70]:
# Same selection with the rows chosen by position: positions 0 and 3 are
# the rows labelled 'A' and 'D'.
df.iloc[[0,3]][['One', 'Three']]

Unnamed: 0,One,Three
A,-1.749765,1.153036
D,-0.583595,0.672721


In [71]:
# Rows and columns in one call: .loc[row_labels, column_labels] gives the
# same result as the chained forms without an intermediate selection.
df.loc[['A', 'D'],['One','Three']]

Unnamed: 0,One,Three
A,-1.749765,1.153036
D,-0.583595,0.672721


In [72]:
df

Unnamed: 0,One,Two,Three,Four
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


In [76]:
# Passing the row label inside a list (['C']) keeps the result a one-row
# DataFrame; the second [] then picks three of its columns.
df.loc[['C']][['Two','Three','Four']]

Unnamed: 0,Two,Three,Four
C,0.255001,-0.458027,0.435163


In [77]:
# The same one-row DataFrame from a single .loc[row_labels, column_labels] call.
df.loc[['C'] , ['Two', 'Three', 'Four']]

Unnamed: 0,Two,Three,Four
C,0.255001,-0.458027,0.435163


In [78]:
# .iloc[2] returns the row at position 2 ('C') as a Series, so the list of
# headers that follows selects entries of that Series (result: a Series).
df.iloc[2][['Two','Three','Four']]

Two      0.255001
Three   -0.458027
Four     0.435163
Name: C, dtype: float64

In [81]:
# Row position 2 and column positions 1-3 in a single .iloc call; a scalar
# row position yields a Series.
df.iloc[2,1:4]

Two      0.255001
Three   -0.458027
Four     0.435163
Name: C, dtype: float64

In [83]:
# The positional slice 1:4 covers column positions 1, 2 and 3
# ('Two', 'Three', 'Four'); position 4 is excluded.
df.iloc[2,1:4]

Two      0.255001
Three   -0.458027
Four     0.435163
Name: C, dtype: float64

In [84]:
# Conditional selection: show the full frame first as a reference for the
# comparisons that follow.
df

Unnamed: 0,One,Two,Three,Four
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


In [87]:
# Element-wise comparison: returns a DataFrame of booleans with the same
# row labels and column headers as df.
df > 0

Unnamed: 0,One,Two,Three,Four
A,False,True,True,False
B,True,True,True,False
C,False,True,False,True
D,False,True,True,False
E,False,True,False,False


In [88]:
# Boolean mask: values where the condition is True are kept; every other
# position becomes NaN (missing), which appears as an empty cell in the output.
df[df > 0]

Unnamed: 0,One,Two,Three,Four
A,,0.34268,1.153036,
B,0.981321,0.514219,0.22118,
C,,0.255001,,0.435163
D,,0.816847,0.672721,
E,,1.029733,,
