In [1]:
import numpy as np
import pandas as pd

# Data I/O
* csv
* json
* hdf
* html
* many more

Now, let's assume we have already gotten our dataset loaded into pandas.
Next job is to do meaningful analysis on the data.

# inspecting data

In [71]:
# Load the bird-recording table.
#   * parse_dates={'datetime': [1, 2]} asks pandas to combine CSV columns 1 and 2
#     into a single column named 'datetime' and to try parsing it as dates.
#   * error_bad_lines=False skips malformed rows instead of raising.
# NOTE(review): the absolute path only resolves on the original author's machine,
# and `error_bad_lines` was deprecated in pandas 1.3 in favour of `on_bad_lines`
# — confirm which pandas version this notebook is meant to run on.
df = pd.read_csv('/home/dmanik/amazonianBirds_climate.csv', parse_dates={'datetime':[1,2]}, error_bad_lines=False)
df.head()

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
0,2011-02-24 05:55,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
1,2011-02-24 06:05,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
2,2011-09-03 18:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
3,2011-09-04 06:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
4,2011-09-04 06:05,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical


In [72]:
df.shape

(8588, 7)

In [73]:
df.columns

Index(['datetime', 'recordist', 'location', 'longitude', 'latitude',
       'elevation', 'climate'],
      dtype='object')

In [74]:
df.sample(n = 4)

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
7531,2011-10-11 07:30,GABRIEL LEITE,"Porto Velho, Rondônia",-64.4897,-9.2485,100,tropical
8033,2010-09-04 08:00,Dennis Murphy,"Vale das Taquaras, RJ",-42.4653,-22.4062,?,tropical
3261,2006-10-03 11:15,Leonardo PImentel,"Reserva Ecológica de Guapiaçu, RJ",-42.7167,-22.5001,?,tropical
4500,2004-10-11 9:00,Mauricio Cabral Periquito,"Engenho Cachoeira Linda, Barreiros, Pernambuco",-35.4773,-8.8173,80,tropical


In [75]:
df.mean()

longitude   -48.545462
latitude    -17.311786
dtype: float64

# Selection

In [222]:
# Reload the table, this time declaring '?' as a missing-value marker
# (na_values=['?']) so that such cells are read as NaN rather than as the
# literal string '?'.
# As before, CSV columns 1 and 2 are combined into 'datetime' and malformed rows
# are skipped (error_bad_lines=False).
# NOTE(review): hard-coded absolute path; `error_bad_lines` was deprecated in
# pandas 1.3 in favour of `on_bad_lines` — confirm the target pandas version.
df = pd.read_csv('/home/dmanik/amazonianBirds_climate.csv', parse_dates={'datetime':[1,2]}, na_values = ['?'], error_bad_lines=False)
df.head()

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
0,2011-02-24 05:55,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
1,2011-02-24 06:05,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
2,2011-09-03 18:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
3,2011-09-04 06:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
4,2011-09-04 06:05,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical


## Getting rows by row number

In [223]:
df.iloc[0:4]

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
0,2011-02-24 05:55,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
1,2011-02-24 06:05,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
2,2011-09-03 18:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
3,2011-09-04 06:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical


**Note**: This syntax also works, but see the caveats below.
```python
df[0:4]
```

## Sorting data by date

In [224]:
df = df.sort_values('datetime')
df.head()

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
2603,0000-00-00 06:28,Nargila Moura,"Paragominas, PA, Brazil, Bacia 274",-47.3714,-3.2348,,tropical
865,0000-00-00 07:00,Marcos Melo,"APA Capivari Monos, São Paulo, SP",-46.6503,-23.9556,760.0,subtropical
2281,0000-00-00 09:30,Vitor Herdy,"Patrocínio, Minas Gerais State",-46.9925,-18.9439,950.0,tropical
713,0000-00-00 09:30,Marcos Melo,"Parque Municipal Nove de Julho, São Paulo, SP",-46.717,-23.7192,760.0,subtropical
1291,0000-00-00 09:30,Vitor Herdy,"Patrocínio, Minas Gerais State",-46.9925,-18.9439,950.0,tropical


## Filtering out bad data (Boolean indexing)
*we saw it already in the morning*

In [225]:
df = df[df['datetime']>='1970-01-01']

In [226]:
df.head()

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
623,1990-07-01 11:00,Antonio Silveira,"Bonito,Mato Grosso do Sul State",-56.563,-21.05,500,tropical
1329,1990-12-00 8am,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical
2033,1991-01-25 8:30am,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical
239,1991-03-00 7pm,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical
2924,1991-06-00 11am,Antonio Silveira,"Estação Ecológica Juréia-Itatins, Peruíbe,São ...",-47.225,-24.554,10,subtropical


In [216]:
%%latex
\[
\texttt{df = df[}\underbrace{\texttt{df['datetime']>='1970-01-01'}}_{\texttt{Boolean array}}]
\]

<IPython.core.display.Latex object>

### Quite complex filtering is also possible

In [233]:
df[(df['datetime']>'2012-07-01') & (df['datetime']<'2012-08-01')].head()

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
2028,2012-07-01 16:30,Noé Eiterer,"Viçosa, Minas Gerais",-42.8882,-20.79,800,tropical
5459,2012-07-02 06:30,pedroteia,Serra Grande-Pão de Açúcar-alagoas-Brasil,-37.412,-9.661,400,tropical
7304,2012-07-04 07:06,Joao Menezes,"Eldorado, Mato Grosso do Sul state",-54.2537,-23.8501,340,subtropical
7315,2012-07-04 08:00,Alexandre Bianco,Rio Grande do Sul,-51.7304,-28.8638,480,subtropical
7783,2012-07-06 08:00,Noé Eiterer,"Viçosa, Minas Gerais",-42.8882,-20.79,800,tropical


### Transforming data

In [236]:
df['elevation'].astype(float)

ValueError: could not convert string to float: 'about 30'

In [238]:
def coerce_float(x):
    """Convert ``x`` to ``float``, returning ``None`` when that is impossible.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a number or a numeric string such as ``'115'``.

    Returns
    -------
    float or None
        ``float(x)`` on success; ``None`` when ``x`` is unparseable text (for
        example ``'about 30'``) or an object ``float`` cannot accept at all.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # float() raises ValueError for malformed strings but TypeError for
        # unsupported objects such as None or a list; both mean "no number",
        # and neither should abort a column-wide .apply().
        return None

In [241]:
df['elevation'] = df['elevation'].apply(coerce_float)

## Reindex by datetime

In [130]:
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

In [133]:
df = df[pd.notnull(df.datetime)]
df.head()

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
623,1990-07-01 11:00:00,Antonio Silveira,"Bonito,Mato Grosso do Sul State",-56.563,-21.05,500,tropical
2033,1991-01-25 08:30:00,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical
1107,1991-10-01 10:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.8834,-23.3334,5,tropical
4996,1992-01-12 11:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.834,-23.357,10,tropical
8152,1992-12-01 11:00:00,Antonio Silveira,"Highlands of Itatiaia National Park,RJ,Brazil",-44.742,-22.365,2000,tropical


In [134]:
df = df.set_index('datetime')

In [135]:
df.head()

Unnamed: 0_level_0,recordist,location,longitude,latitude,elevation,climate
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990-07-01 11:00:00,Antonio Silveira,"Bonito,Mato Grosso do Sul State",-56.563,-21.05,500,tropical
1991-01-25 08:30:00,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical
1991-10-01 10:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.8834,-23.3334,5,tropical
1992-01-12 11:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.834,-23.357,10,tropical
1992-12-01 11:00:00,Antonio Silveira,"Highlands of Itatiaia National Park,RJ,Brazil",-44.742,-22.365,2000,tropical


## Selecting by **index label**

In [160]:
df.loc['1990-07-01':'1992-12-01']

Unnamed: 0_level_0,recordist,location,longitude,latitude,elevation,climate
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990-07-01 11:00:00,Antonio Silveira,"Bonito,Mato Grosso do Sul State",-56.563,-21.05,500,tropical
1991-01-25 08:30:00,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical
1991-10-01 10:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.8834,-23.3334,5,tropical
1992-01-12 11:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.834,-23.357,10,tropical
1992-12-01 11:00:00,Antonio Silveira,"Highlands of Itatiaia National Park,RJ,Brazil",-44.742,-22.365,2000,tropical


This is equivalent to

In [158]:
df['1990-07-01':'1992-12-01']

Unnamed: 0_level_0,recordist,location,longitude,latitude,elevation,climate
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990-07-01 11:00:00,Antonio Silveira,"Bonito,Mato Grosso do Sul State",-56.563,-21.05,500,tropical
1991-01-25 08:30:00,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical
1991-10-01 10:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.8834,-23.3334,5,tropical
1992-01-12 11:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.834,-23.357,10,tropical
1992-12-01 11:00:00,Antonio Silveira,"Highlands of Itatiaia National Park,RJ,Brazil",-44.742,-22.365,2000,tropical


**WARNING:** The `.loc` syntax is infinitely preferable for selecting by label, as then you avoid pitfalls like:

In [154]:
# Toy frame of random digits whose integer index labels (3..10) deliberately
# differ from the row positions (0..7), so that label-based and position-based
# slicing give visibly different results.
td = pd.DataFrame(np.random.randint(10, size = (8,4)), index = range(3,11), columns=['A', 'B', 'C', 'D'])

In [155]:
td

Unnamed: 0,A,B,C,D
3,4,2,7,0
4,6,5,4,7
5,1,7,7,0
6,2,4,4,2
7,5,3,2,0
8,8,4,6,1
9,6,0,5,3
10,0,0,1,8


In [156]:
td[0:4] # == td.iloc[0:4]

Unnamed: 0,A,B,C,D
3,4,2,7,0
4,6,5,4,7
5,1,7,7,0
6,2,4,4,2


This does not return the rows whose index labels lie between 0 and 4; `.loc`, however, does:

In [157]:
td.loc[0:4]

Unnamed: 0,A,B,C,D
3,4,2,7,0
4,6,5,4,7


Why: the slicing operator `[:]` tries `iloc` first, then falls back to `loc`

### Slicing with increments

In [178]:
td

Unnamed: 0,A,B,C,D
3,4,2,7,0
4,6,5,4,7
5,1,7,7,0
6,2,4,4,2
7,5,3,2,0
8,8,4,6,1
9,6,0,5,3
10,0,0,1,8


In [179]:
td.iloc[:5:2, :]

Unnamed: 0,A,B,C,D
3,4,2,7,0
5,1,7,7,0
7,5,3,2,0


In [194]:
td.iloc[::-1, :]

Unnamed: 0,A,B,C,D
10,0,0,1,8
9,6,0,5,3
8,8,4,6,1
7,5,3,2,0
6,2,4,4,2
5,1,7,7,0
4,6,5,4,7
3,4,2,7,0


Weird behaviour:

In [196]:
td.iloc[:1:-1, :]

Unnamed: 0,A,B,C,D
10,0,0,1,8
9,6,0,5,3
8,8,4,6,1
7,5,3,2,0
6,2,4,4,2
5,1,7,7,0


In [197]:
td.loc[3:7:2, :]

Unnamed: 0,A,B,C,D
3,4,2,7,0
5,1,7,7,0
7,5,3,2,0


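**WARNING:** Unlike Python's list slicing, label-based slices with `df.loc` *include both endpoints* (e.g. `td.loc[3:7:2, :]` above returns the row labelled 7). Position-based slices with `df.iloc` follow the usual Python convention and *exclude* the stop position (e.g. `df.iloc[0:4]` returns 4 rows, and `td.iloc[:5:2, :]` stops before position 5).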

## Benefits of indexing

In [334]:
df = pd.read_csv('/home/dmanik/amazonianBirds_climate.csv', parse_dates={'datetime':[1,2]}, error_bad_lines=False)
df.head()

Unnamed: 0,datetime,recordist,location,longitude,latitude,elevation,climate
0,2011-02-24 05:55,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
1,2011-02-24 06:05,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
2,2011-09-03 18:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
3,2011-09-04 06:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
4,2011-09-04 06:05,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical


In [353]:
%time df[(df['datetime'] > '2011-02-24') & (df['datetime'] < '2013-04-28') ].count()

CPU times: user 7.03 ms, sys: 1.5 ms, total: 8.53 ms
Wall time: 6.74 ms


datetime     1935
recordist    1935
location     1935
longitude    1935
latitude     1935
elevation    1935
climate      1935
dtype: int64

In [338]:
# Build a datetime-indexed copy of `df` so rows can be selected by date label.
idf = df.copy()
# Timestamps that cannot be parsed become NaT instead of raising.
idf['datetime'] = pd.to_datetime(idf['datetime'], errors='coerce')
# Drop the rows whose timestamp failed to parse. The mask has to come from the
# parsed column in `idf`: `df.datetime` still holds the unparsed values, so its
# null mask does not reflect the NaT entries created on the previous line.
idf = idf[pd.notnull(idf.datetime)]
idf = idf.set_index('datetime')
idf.head()

Unnamed: 0_level_0,recordist,location,longitude,latitude,elevation,climate
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-02-24 05:55:00,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
2011-02-24 06:05:00,Daniel Lane,"10 km S Pocone on Transpantaneira, Mato Grosso",-56.648,-16.362,115,tropical
2011-09-03 18:00:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
2011-09-04 06:00:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical
2011-09-04 06:05:00,Eric DeFonso,"Pantanal Wildlife Center, MT",-56.8764,-16.7581,110,tropical


In [354]:
%time idf['2011-02-24':'2013-04-28']['latitude'].count()

CPU times: user 4.08 ms, sys: 1.16 ms, total: 5.24 ms
Wall time: 3.9 ms


1872

## Selecting by columns

In [199]:
df.loc[:, ['recordist', 'latitude', 'longitude']].head()

Unnamed: 0_level_0,recordist,latitude,longitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1990-07-01 11:00:00,Antonio Silveira,-21.05,-56.563
1991-01-25 08:30:00,Antonio Silveira,-23.786,-46.871
1991-10-01 10:00:00,Antonio Silveira,-23.3334,-44.8834
1992-01-12 11:00:00,Antonio Silveira,-23.357,-44.834
1992-12-01 11:00:00,Antonio Silveira,-22.365,-44.742


# Transforming data
## Adding a column

In [204]:
df.loc[:, 'year'] = df.index.year
df.head()

Unnamed: 0_level_0,recordist,location,longitude,latitude,elevation,climate,year
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-07-01 11:00:00,Antonio Silveira,"Bonito,Mato Grosso do Sul State",-56.563,-21.05,500,tropical,1990
1991-01-25 08:30:00,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical,1991
1991-10-01 10:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.8834,-23.3334,5,tropical,1991
1992-01-12 11:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.834,-23.357,10,tropical,1992
1992-12-01 11:00:00,Antonio Silveira,"Highlands of Itatiaia National Park,RJ,Brazil",-44.742,-22.365,2000,tropical,1992


## Applying transformations on a column, revisited

In [206]:
df.loc[:, 'year'] = df.loc[:, 'year'] - df.loc[:, 'year'].min()
df.head()

Unnamed: 0_level_0,recordist,location,longitude,latitude,elevation,climate,year
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-07-01 11:00:00,Antonio Silveira,"Bonito,Mato Grosso do Sul State",-56.563,-21.05,500,tropical,0
1991-01-25 08:30:00,Antonio Silveira,"Itapecerica da Serra,SP, Brazil",-46.871,-23.786,900,subtropical,1
1991-10-01 10:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.8834,-23.3334,5,tropical,1
1992-01-12 11:00:00,Antonio Silveira,Serra do Mar State Park. Picinguaba,-44.834,-23.357,10,tropical,2
1992-12-01 11:00:00,Antonio Silveira,"Highlands of Itatiaia National Park,RJ,Brazil",-44.742,-22.365,2000,tropical,2


## Append

In [245]:
 df1 = pd.DataFrame(np.random.randint(5, size=(4,6)), columns=list('ABCDEF'))
 df2 = pd.DataFrame(np.random.randint(5, size=(4,6)), columns=list('ABCDEF'))

In [248]:
df1

Unnamed: 0,A,B,C,D,E,F
0,0,4,3,4,2,4
1,1,4,0,0,2,0
2,0,4,4,4,4,4
3,2,0,2,2,0,0


In [249]:
df2

Unnamed: 0,A,B,C,D,E,F
0,4,4,2,4,2,1
1,3,3,0,3,0,1
2,4,1,3,4,3,2
3,1,1,2,4,1,1


In [252]:
df3 = df1.append(df2)
df3

Unnamed: 0,A,B,C,D,E,F
0,0,4,3,4,2,4
1,1,4,0,0,2,0
2,0,4,4,4,4,4
3,2,0,2,2,0,0
0,4,4,2,4,2,1
1,3,3,0,3,0,1
2,4,1,3,4,3,2
3,1,1,2,4,1,1


Now `df3` has a nonsensical index (each of the labels 0–3 appears twice), so label-based slicing fails:

In [255]:
df3.loc[2:3]

KeyError: 'Cannot get left slice bound for non-unique label: 2'

In [259]:
df3

Unnamed: 0,A,B,C,D,E,F
0,0,4,3,4,2,4
1,1,4,0,0,2,0
2,0,4,4,4,4,4
3,2,0,2,2,0,0
0,4,4,2,4,2,1
1,3,3,0,3,0,1
2,4,1,3,4,3,2
3,1,1,2,4,1,1


In [260]:
# Overwrite the duplicated labels with unique consecutive integers
# 0 .. len(df3) - 1.
df3.index = range(len(df3))

## Concatenation

In [261]:
df1 = pd.DataFrame(np.random.randint(5, size=(4,6)), columns=list('ABCDEF'))
df2 = pd.DataFrame(np.random.randint(5, size=(4,6)), columns=list('ABCDEF'))
df3 = pd.DataFrame(np.random.randint(5, size=(4,6)), columns=list('ABCDEF'))
pd.concat([df1, df2, df3])

Unnamed: 0,A,B,C,D,E,F
0,0,4,1,1,3,2
1,1,3,2,2,2,3
2,3,1,4,2,0,3
3,1,1,2,3,3,4
0,4,1,0,3,0,0
1,4,1,2,2,0,3
2,3,3,3,4,1,3
3,2,1,3,0,3,4
0,2,2,1,1,0,3
1,2,1,2,4,1,2


## Join
### Combining two tables with some common column(s)

In [274]:
from urllib import request
from bs4 import BeautifulSoup
import io

In [279]:
# Download the worldatlas article on fossil-fuel dependency and load its table.
fossil_fuel_url = 'http://www.worldatlas.com/articles/countries-the-most-dependent-on-fossil-fuels.html'
# The context manager closes the HTTP connection as soon as the body is read,
# instead of leaving the socket open until garbage collection.
with request.urlopen(fossil_fuel_url) as response:
    data = response.read()
ff_st = data.decode('utf-8')
soup = BeautifulSoup(ff_st, "lxml")
# Take the first <table data-role="table"> on the page and serialise it back
# to HTML so that pandas parses only that table.
fossil = str(soup.find_all('table', attrs={'data-role':"table"})[0])
fdf = pd.read_html(io.StringIO(fossil))[0]
fdf.head()

Unnamed: 0,Rank,Country Name,Percentage of Energy Use Dependent on Fossil Fuels
0,1,Oman,100.00 %
1,2,Qatar,100.00 %
2,3,Kuwait,100.00 %
3,4,Saudi Arabia,100.00 %
4,5,Brunei Darussalam,100.00 %


In [284]:
# Download the Wikipedia list of renewable electricity production by country.
renewable_url = 'https://en.wikipedia.org/wiki/List_of_countries_by_electricity_production_from_renewable_sources'
# The context manager closes the HTTP connection as soon as the body is read,
# instead of leaving the socket open until garbage collection.
with request.urlopen(renewable_url) as response:
    data = response.read()
ff_st = data.decode('utf-8')
soup = BeautifulSoup(ff_st, "lxml")
# Keep the second table with class 'wikitable sortable' (index 1) and let
# pandas use its first row as the column header.
renew = str(soup.find_all('table', class_='wikitable sortable')[1])
rdf = pd.read_html(io.StringIO(renew), header=0)[0]
rdf.head()

Unnamed: 0,Country,Year,Total renewable (GWh),Hydropower (GWh),Wind power (GWh),Biomass (GWh),Solar power (GWh),Geothermal (GWh),% of total generation,Ref and notes
0,Afghanistan,2012,710,710,-,-,-,-,80.32%,
1,Albania,2012,4245,4245,-,-,-,-,99.98%,
2,Algeria,2012,616,616,-,-,-,-,1.14%,
3,Angola,2012,3940,3940,-,-,-,-,71.96%,
4,Argentina,2015,42072,41464,608,-,608,-,31.1%,608 GWh mentioned is the total amount from win...


In [315]:
df1 = fdf.copy()
df2 = rdf.copy()

In [316]:
df3 = pd.merge(df1, df2, left_on = 'Country Name', right_on='Country')
df3.head()

Unnamed: 0,Rank,Country Name,Percentage of Energy Use Dependent on Fossil Fuels,Country,Year,Total renewable (GWh),Hydropower (GWh),Wind power (GWh),Biomass (GWh),Solar power (GWh),Geothermal (GWh),% of total generation,Ref and notes
0,6,Trinidad and Tobago,99.93 %,Trinidad and Tobago,2012,20,-,-,20,-,-,0.23%,
1,9,Algeria,99.86 %,Algeria,2012,616,616,-,-,-,-,1.14%,
2,12,Kazakhstan,98.89 %,Kazakhstan,2012,7564,7561,3,-,-,-,8.78%,
3,13,Malta,98.56 %,Malta,2012,16,-,-,3,13,-,0.75%,
4,15,Azerbaijan,98.48 %,Azerbaijan,2012,1803,1803,-,-,-,-,8.31%,


#### By default, `merge` uses the intersection of the join keys (an inner join)

In [317]:
len(set(df1['Country Name'])), len(set(df2['Country'])), len(set(df1['Country Name'])&set(df2['Country']))

(50, 154, 32)

In [318]:
df3.shape

(32, 13)

#### To have the union of the keys, use the `how='outer'` option:

In [319]:
df4 = pd.merge(df1, df2, left_on = 'Country Name', right_on='Country', how = 'outer')
df4.head()

Unnamed: 0,Rank,Country Name,Percentage of Energy Use Dependent on Fossil Fuels,Country,Year,Total renewable (GWh),Hydropower (GWh),Wind power (GWh),Biomass (GWh),Solar power (GWh),Geothermal (GWh),% of total generation,Ref and notes
0,1.0,Oman,100.00 %,,,,,,,,,,
1,2.0,Qatar,100.00 %,,,,,,,,,,
2,3.0,Kuwait,100.00 %,,,,,,,,,,
3,4.0,Saudi Arabia,100.00 %,,,,,,,,,,
4,5.0,Brunei Darussalam,100.00 %,,,,,,,,,,


All countries in `df1` and `df2` are present; where a country appears in only one of the tables, the columns coming from the other table are filled with `NaN` values.

In [320]:
len(set(df1['Country Name'])), len(set(df2['Country'])), len(set(df1['Country Name'])|set(df2['Country']))

(50, 154, 172)

In [321]:
df4.shape

(172, 13)

#### It is also possible to have *only* the keys in the left table (or right)

In [322]:
df5 = pd.merge(df1, df2, left_on = 'Country Name', right_on='Country', how = 'left')
df5.head()

Unnamed: 0,Rank,Country Name,Percentage of Energy Use Dependent on Fossil Fuels,Country,Year,Total renewable (GWh),Hydropower (GWh),Wind power (GWh),Biomass (GWh),Solar power (GWh),Geothermal (GWh),% of total generation,Ref and notes
0,1,Oman,100.00 %,,,,,,,,,,
1,2,Qatar,100.00 %,,,,,,,,,,
2,3,Kuwait,100.00 %,,,,,,,,,,
3,4,Saudi Arabia,100.00 %,,,,,,,,,,
4,5,Brunei Darussalam,100.00 %,,,,,,,,,,


All countries in `df1` are present, even if they are not in `df2`; but not the other way round.

### Join (AKA, merging on index)

In [323]:
idf1 = df1.set_index('Country Name')

In [324]:
idf2 = df2.set_index('Country')

In [325]:
idf1.join(idf2)

Unnamed: 0_level_0,Rank,Percentage of Energy Use Dependent on Fossil Fuels,Year,Total renewable (GWh),Hydropower (GWh),Wind power (GWh),Biomass (GWh),Solar power (GWh),Geothermal (GWh),% of total generation,Ref and notes
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Oman,1,100.00 %,,,,,,,,,
Qatar,2,100.00 %,,,,,,,,,
Kuwait,3,100.00 %,,,,,,,,,
Saudi Arabia,4,100.00 %,,,,,,,,,
Brunei Darussalam,5,100.00 %,,,,,,,,,
Trinidad and Tobago,6,99.93 %,2012.0,20.0,-,-,20,-,-,0.23%,
Bahrain,7,99.92 %,,,,,,,,,
United Arab Emirates,8,99.91 %,,,,,,,,,
Algeria,9,99.86 %,2012.0,616.0,616,-,-,-,-,1.14%,
"Iran, Islamic Rep.",10,99.33 %,,,,,,,,,


**Warning:** One major difference between `merge` and `join`: by default (i.e. without `how=<kind>` specified), `merge` does an **inner join**, but `join` does a **left outer join**.

**Note:** Indexing is better for performing searches.

In [330]:
%timeit -n 1 idf1.join(idf2)

1 loop, best of 3: 1.21 ms per loop


In [331]:
%timeit -n 1 pd.merge(df1, df2, left_on = 'Country Name', right_on='Country', how = 'left')

1 loop, best of 3: 2.27 ms per loop


This is, of course, also true outside of merging.

In [333]:
df2

(154, 10)

# Aggregation

In [383]:
# Download the Wikipedia list of cities proper by population.
population_url = 'https://en.wikipedia.org/wiki/List_of_cities_proper_by_population'
# The context manager closes the HTTP connection as soon as the body is read,
# instead of leaving the socket open until garbage collection.
with request.urlopen(population_url) as response:
    data = response.read()
pop_st = data.decode('utf-8')
soup = BeautifulSoup(pop_st, "lxml")
# Keep the first table with class 'sortable wikitable' and let pandas use its
# first row as the column header.
pops = str(soup.find_all('table', class_='sortable wikitable')[0])
popdf= pd.read_html(io.StringIO(pops), header=0)[0]
popdf.head()

Unnamed: 0,Rank,City,Image,Population,Definition,Total area (km²),Population density (/km²),Country
0,1,Shanghai,,"7007242568000000000♠24,256,800[6]",Municipality,"7003634050000000000♠6,340.5[7]",3826,China
1,2,Karachi,,"7007235000000000000♠23,500,000[8]",Metropolitan Corporation[9],"7003352700000000000♠3,527",6663,Pakistan
2,3,Beijing,,"7007215160000000000♠21,516,000[10]",Municipality,"7004164105400000000♠16,410.54[11]",1311,China
3,4,Delhi,,"7007167879410000000♠16,787,941[12]",Union territory,"7003148400000000000♠1,484",11313,India
4,5,Lagos,,"7007160603030000000♠16,060,303[a]",Metropolitan City,"7003117128000000000♠1,171.28[14]",13712,Nigeria


In [384]:
popdf = popdf.loc[:, ['City', 'Population density (/km²)', 'Country']]

In [387]:
# Give the density column a short, code-friendly name. Rebinding the result of
# `rename` (rather than using `inplace=True`) avoids mutating a frame that was
# itself produced by a `.loc` selection and keeps the step chainable.
popdf = popdf.rename(columns={'Population density (/km²)': 'population density'})

## How to obtain the number of cities per country with a population density higher than 3000

In [426]:
x = popdf[popdf['population density']>3000]

In [427]:
gr = x.groupby('Country')

In [448]:
gr.aggregate('count')

Unnamed: 0_level_0,City,population density
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,1,1
Argentina,1,1
Bangladesh,1,1
Brazil,2,2
Chile,1,1
China,12,12
Colombia,1,1
Democratic Republic of the Congo,1,1
Egypt,2,2
Ethiopia,1,1


### Using different aggregators

In [449]:
gr.aggregate({'City':'count', 'population density':'mean'})

Unnamed: 0_level_0,City,population density
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,1,12415.0
Argentina,1,15046.0
Bangladesh,1,56567.0
Brazil,2,6664.0
Chile,1,4595.0
China,12,5010.166667
Colombia,1,9052.0
Democratic Republic of the Congo,1,8710.0
Egypt,2,8837.5
Ethiopia,1,5889.0


`groupby.describe` gives a very succinct statistical summary of your dataset

In [450]:
gr.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,population density
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,count,1.000000
Afghanistan,mean,12415.000000
Afghanistan,std,
Afghanistan,min,12415.000000
Afghanistan,25%,12415.000000
Afghanistan,50%,12415.000000
Afghanistan,75%,12415.000000
Afghanistan,max,12415.000000
Argentina,count,1.000000
Argentina,mean,15046.000000


## The result of `groupby(...).describe()` is a DataFrame with a `MultiIndex`

In [440]:
stats = gr.describe()

In [441]:
stats.index

MultiIndex(levels=[['Afghanistan', 'Argentina', 'Bangladesh', 'Brazil', 'Chile', 'China', 'Colombia', 'Democratic Republic of the Congo', 'Egypt', 'Ethiopia', 'Germany', 'India', 'Indonesia', 'Iran', 'Japan', 'Kenya', 'Mexico', 'Morocco', 'Myanmar', 'Nigeria', 'Pakistan', 'Peru', 'Russia', 'Saudi Arabia', 'Singapore', 'South Korea', 'Spain', 'Thailand', 'United Kingdom', 'United States', 'Vietnam'], ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18,

### This is a very powerful tool to store hierarchical data

In [451]:
stats

Unnamed: 0_level_0,Unnamed: 1_level_0,population density
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,count,1.000000
Afghanistan,mean,12415.000000
Afghanistan,std,
Afghanistan,min,12415.000000
Afghanistan,25%,12415.000000
Afghanistan,50%,12415.000000
Afghanistan,75%,12415.000000
Afghanistan,max,12415.000000
Argentina,count,1.000000
Argentina,mean,15046.000000


## Selecting in a MultiIndexed DataFrame

In [453]:
stats.loc['Vietnam']

Unnamed: 0,population density
count,1.0
mean,3925.0
std,
min,3925.0
25%,3925.0
50%,3925.0
75%,3925.0
max,3925.0


In [454]:
stats.loc['Vietnam', 'mean']

population density    3925.0
Name: (Vietnam, mean), dtype: float64

### Slicing works, too

In [458]:
stats.loc['Afghanistan':'Brazil']

Unnamed: 0_level_0,Unnamed: 1_level_0,population density
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,count,1.0
Afghanistan,mean,12415.0
Afghanistan,std,
Afghanistan,min,12415.0
Afghanistan,25%,12415.0
Afghanistan,50%,12415.0
Afghanistan,75%,12415.0
Afghanistan,max,12415.0
Argentina,count,1.0
Argentina,mean,15046.0


## Getting a "flat" DataFrame back: `reset_index()`
### Obtaining the top 10 countries by average population density in urban areas

In [477]:
x = stats.sort_values('population density', ascending=False)
x

Unnamed: 0_level_0,Unnamed: 1_level_0,population density
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Bangladesh,max,56567.000000
Bangladesh,75%,56567.000000
Bangladesh,50%,56567.000000
Bangladesh,25%,56567.000000
Bangladesh,min,56567.000000
Bangladesh,mean,56567.000000
Pakistan,max,30083.000000
Pakistan,75%,29339.750000
Pakistan,50%,26623.000000
India,max,24306.000000


In [478]:
y = x.reset_index()
y.head()

Unnamed: 0,Country,level_1,population density
0,Bangladesh,max,56567.0
1,Bangladesh,75%,56567.0
2,Bangladesh,50%,56567.0
3,Bangladesh,25%,56567.0
4,Bangladesh,min,56567.0


In [480]:
# Keep only the per-country 'mean' rows (already sorted by density, descending)
# and show the first ten of them.
y.loc[y['level_1'] == 'mean'].head(10)

Unnamed: 0,Country,level_1,population density
5,Bangladesh,mean,56567.0
10,Pakistan,mean,20928.666667
16,Iran,mean,18083.0
23,Indonesia,mean,15171.0
27,Argentina,mean,15046.0
33,Nigeria,mean,13712.0
39,India,mean,13342.3
43,Afghanistan,mean,12415.0
52,South Korea,mean,10910.0
59,Colombia,mean,9052.0
