# Intermediate Python: Dictionaries and Pandas

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Motivation For Dictionaries

In [2]:
# Two parallel lists: capitals[i] is the capital of countries[i].
countries = [
    'spain',
    'france',
    'germany',
    'norway',
]
capitals = [
    'madrid',
    'paris',
    'berlin',
    'oslo',
]

# Position of 'germany' in countries; the same position in capitals
# holds its capital.
ind_ger = countries.index('germany')

# Look the capital up by that position and show it.
capital_of_germany = capitals[ind_ger]
print(capital_of_germany)

berlin


## Create Dictionary

In [3]:
# Country names and their capitals as two parallel lists.
countries = ['spain', 'france', 'germany', 'norway']
capitals = ['madrid', 'paris', 'berlin', 'oslo']

# The same pairs written out as a dictionary literal: country -> capital.
europe = {
    'spain': 'madrid',
    'france': 'paris',
    'germany': 'berlin',
    'norway': 'oslo',
}

# Show the resulting dictionary.
print(europe)

{'spain': 'madrid', 'france': 'paris', 'germany': 'berlin', 'norway': 'oslo'}


## Access Dictionaries

In [4]:
# Country -> capital lookup table.
europe = {
    'spain': 'madrid',
    'france': 'paris',
    'germany': 'berlin',
    'norway': 'oslo',
}

# keys() gives a view over every country stored in the dictionary.
country_names = europe.keys()
print(country_names)

# Square brackets with a key return the value stored under that key.
print(europe['norway'])

dict_keys(['spain', 'france', 'germany', 'norway'])
oslo


## Dictionary Manipulation (1)

In [5]:
# Country -> capital lookup table to extend.
europe = {
    'spain': 'madrid',
    'france': 'paris',
    'germany': 'berlin',
    'norway': 'oslo',
}

# Assigning to a key that is not present yet inserts a new pair.
europe['italy'] = 'rome'

# A membership test on a dictionary checks its keys.
has_italy = 'italy' in europe
print(has_italy)

# Insert a second new pair the same way.
europe['poland'] = 'warsaw'

# Show the dictionary with both additions.
print(europe)

True
{'spain': 'madrid', 'france': 'paris', 'germany': 'berlin', 'norway': 'oslo', 'italy': 'rome', 'poland': 'warsaw'}


## Dictionary Manipulation (2)

In [6]:
# Starting dictionary. Two entries are changed below: the value stored under
# 'germany' is replaced, and the 'australia' entry is removed.
europe = {'spain':'madrid', 'france':'paris', 'germany':'bonn',
          'norway':'oslo', 'italy':'rome', 'poland':'warsaw',
          'australia':'vienna' }

# Assigning to an existing key overwrites its value in place.
europe['germany'] = 'berlin'

# Remove the 'australia' entry. del is a statement, not a function, so it is
# written without call-style parentheses. Raises KeyError if the key is absent.
del europe['australia']

# Print europe
print(europe)

{'spain': 'madrid', 'france': 'paris', 'germany': 'berlin', 'norway': 'oslo', 'italy': 'rome', 'poland': 'warsaw'}


## Dictionariception

In [7]:
# Nested dictionary: each country maps to its own dictionary of facts.
europe = {
    'spain': {'capital': 'madrid', 'population': 46.77},
    'france': {'capital': 'paris', 'population': 66.03},
    'germany': {'capital': 'berlin', 'population': 80.62},
    'norway': {'capital': 'oslo', 'population': 5.084},
}

# Chain two lookups: first the country, then the field inside it.
france_info = europe['france']
print(france_info['capital'])

# Facts for a country that is not in europe yet.
data = dict(capital='rome', population=59.83)

# Store that sub-dictionary (the same object, not a copy) under 'italy'.
europe['italy'] = data

# Show the full nested structure.
print(europe)

paris
{'spain': {'capital': 'madrid', 'population': 46.77}, 'france': {'capital': 'paris', 'population': 66.03}, 'germany': {'capital': 'berlin', 'population': 80.62}, 'norway': {'capital': 'oslo', 'population': 5.084}, 'italy': {'capital': 'rome', 'population': 59.83}}


## Dictionary to DataFrame (1)

Pandas is an open source library, providing high-performance, easy-to-use data structures and data analysis tools for Python. Sounds promising!

The DataFrame is one of Pandas' most important data structures. It's basically a way to store tabular data where you can label the rows and the columns. One way to build a DataFrame is from a dictionary.

In the exercises that follow you will be working with vehicle data from different countries. Each observation corresponds to a country and the columns give information about the number of vehicles per capita, whether people drive left or right, and so on.

Three lists are defined in the script:

- `names`, containing the country names for which data is available.
- `dr`, a list of booleans that tells whether people drive on the right (`True`) or on the left (`False`) in the corresponding country.
- `cpc`, the number of motor vehicles per 1000 people in the corresponding country.

Each dictionary key is a column label and each value is a list which contains the column elements.

In [8]:
# Column data: position i in each list describes the same country.
names = [
    'United States', 'Australia', 'Japan', 'India',
    'Russia', 'Morocco', 'Egypt',
]
dr = [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]

# pandas provides the DataFrame type used below.
import pandas as pd

# Map each column label to the list holding that column's values.
my_dict = dict(country=names, drives_right=dr, cars_per_cap=cpc)

# Each key becomes a column; rows get the default integer labels 0..6.
cars = pd.DataFrame(my_dict)

# Show the table as plain text.
print(cars)

         country  drives_right  cars_per_cap
0  United States          True           809
1      Australia         False           731
2          Japan         False           588
3          India         False            18
4         Russia          True           200
5        Morocco          True            70
6          Egypt          True            45


## Dictionary to DataFrame (2)

The Python code that solves the previous exercise is included in the cell below. Have you noticed that the row labels (i.e. the labels for the different observations) were automatically set to integers from 0 up to 6?

To solve this, a list `row_labels` has been created. You can use it to specify the row labels of the `cars` DataFrame. You do this by setting the index attribute of `cars`, which you can access as `cars.index`.

In [9]:
import pandas as pd

# Column data for the cars table; position i in each list is one country.
names = [
    'United States', 'Australia', 'Japan', 'India',
    'Russia', 'Morocco', 'Egypt',
]
dr = [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]

# Column label -> column values.
cars_dict = {
    'country': names,
    'drives_right': dr,
    'cars_per_cap': cpc,
}

# Build the table; at this point the rows carry default labels 0..6.
cars = pd.DataFrame(cars_dict)
print(cars)

# One short label per row, in the same order as the rows.
row_labels = ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG']

# Assigning to .index replaces the row labels on the existing DataFrame.
cars.index = row_labels

# Same data as the first printout, now with the new row labels.
print(cars)

         country  drives_right  cars_per_cap
0  United States          True           809
1      Australia         False           731
2          Japan         False           588
3          India         False            18
4         Russia          True           200
5        Morocco          True            70
6          Egypt          True            45
           country  drives_right  cars_per_cap
US   United States          True           809
AUS      Australia         False           731
JPN          Japan         False           588
IN           India         False            18
RU          Russia          True           200
MOR        Morocco          True            70
EG           Egypt          True            45


## CSV to DataFrame (1)

In [0]:
# NOTE(review): every line in this cell is commented out, so running it does
# nothing. It reads 'cars.csv' from the working directory - presumably that
# file is not shipped with this notebook; confirm before re-enabling.
# # Import pandas as pd
# import pandas as pd

# # Import the cars.csv data: cars
# cars = pd.read_csv('cars.csv')

# # Print out cars
# print(cars)

## CSV to DataFrame (2)

In [0]:
# NOTE(review): every line in this cell is commented out, so running it does
# nothing. It reads 'cars.csv' from the working directory - presumably that
# file is not shipped with this notebook; confirm before re-enabling.
# # Import pandas as pd
# import pandas as pd

# # Fix import by including index_col
# # (index_col=0 uses the first CSV column as the row labels)
# cars = pd.read_csv('cars.csv', index_col=0)

# # Print out cars
# print(cars)

## Square Brackets (1)

In [0]:
# NOTE(review): every line in this cell is commented out, so running it does
# nothing. It reads 'cars.csv' from the working directory - presumably that
# file is not shipped with this notebook; confirm before re-enabling.
# # Import cars data
# import pandas as pd
# cars = pd.read_csv('cars.csv', index_col = 0)

# # Print out country column as Pandas Series
# # (single brackets with one column label)
# print(cars['country'])

# # Print out country column as Pandas DataFrame
# # (double brackets: a list containing one column label)
# print(cars[['country']])

# # Print out DataFrame with country and drives_right columns
# print(cars[['country', 'drives_right']])


## Square Brackets (2)

In [0]:
# NOTE(review): every line in this cell is commented out, so running it does
# nothing. It reads 'cars.csv' from the working directory - presumably that
# file is not shipped with this notebook; confirm before re-enabling.
# # Import cars data
# import pandas as pd
# cars = pd.read_csv('cars.csv', index_col = 0)

# # Print out first 3 observations
# # (a slice inside square brackets selects rows by position)
# print(cars[:3])

# # Print out fourth, fifth and sixth observation
# # (positions 3, 4 and 5; the slice end is exclusive)
# print(cars[3:6])

## loc and iloc (1)

With loc and iloc you can do practically any data selection operation on DataFrames you can think of. loc is label-based, which means that you have to specify rows and columns based on their row and column labels. iloc is integer index based, so you have to specify rows and columns by their integer index like you did in the previous exercise.

Try out the following commands in the IPython Shell to experiment with loc and iloc to select observations. Each pair of commands here gives the same result.

```python
cars.loc['RU']
cars.iloc[4]

cars.loc[['RU']]
cars.iloc[[4]]

cars.loc[['RU', 'AUS']]
cars.iloc[[4, 1]]
```

As before, code is included that imports the cars data as a Pandas DataFrame.

In [0]:
# NOTE(review): every line in this cell is commented out, so running it does
# nothing. It reads 'cars.csv' from the working directory - presumably that
# file is not shipped with this notebook; confirm before re-enabling.
# # Import cars data
# import pandas as pd
# cars = pd.read_csv('cars.csv', index_col = 0)

# # Print out observation for Japan
# # (a single row label with loc)
# print(cars.loc['JPN'])

# # Print out observations for Australia and Egypt
# # (a list of row labels with loc)
# print(cars.loc[['AUS', 'EG']])

## loc and iloc (2)

In [0]:
# NOTE(review): every line in this cell is commented out, so running it does
# nothing. It reads 'cars.csv' from the working directory - presumably that
# file is not shipped with this notebook; confirm before re-enabling.
# # Import cars data
# import pandas as pd
# cars = pd.read_csv('cars.csv', index_col = 0)

# # Print out drives_right value of Morocco
# # (loc takes the row label first, then the column label)
# print(cars.loc['MOR', 'drives_right'])

# # Print sub-DataFrame
# # (list of row labels, then list of column labels)
# print(cars.loc[['RU', 'MOR'], ['country', 'drives_right']])

## loc and iloc (3)

In [0]:
# NOTE(review): every line in this cell is commented out, so running it does
# nothing. It reads 'cars.csv' from the working directory - presumably that
# file is not shipped with this notebook; confirm before re-enabling.
# # Import cars data
# import pandas as pd
# cars = pd.read_csv('cars.csv', index_col = 0)

# # Print out drives_right column as Series
# # (':' in the row position selects all rows)
# print(cars.loc[:, 'drives_right'])

# # Print out drives_right column as DataFrame
# # (column label wrapped in a list)
# print(cars.loc[:, ['drives_right']])

# # Print out cars_per_cap and drives_right as DataFrame
# print(cars.loc[:, ['cars_per_cap', 'drives_right']])