# Pandas Tutorial

Pandas Series


In [2]:
### G7 Nations. Canada, France, Germany, Italy, Japan, UK and US.
import pandas as pd
import numpy as np

# In millions
g7_pop = pd. Series([35.467, 63.951, 80.940, 60.665, 127.06, 64.511, 318.523])


In [10]:
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.060
5     64.511
6    318.523
dtype: float64

In [4]:
# This access the index of the series.
g7_pop[1]
g7_pop[7]

318.523

In [11]:
g7_pop.index

RangeIndex(start=0, stop=7, step=1)

But, in contrast to lists, we can change the index

In [12]:
g7_pop.index = [
    "Canada",
    "France",
    "Germany",
    "Italy",
    "Japan",
    "United Kingdom",
    "United States",
]


In [13]:
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.060
United Kingdom     64.511
United States     318.523
dtype: float64

## Indexing

In [14]:
g7_pop["Canada"]

35.467

In [15]:
g7_pop['Japan']

127.06

Numeric position can be used with iloc attribute

In [16]:
### Find the population of France with iloc
g7_pop.iloc[1]

63.951

In [17]:
### Find the population of US with iloc
g7_pop.iloc[-1]

318.523

Selecting multiple index at once

In [19]:
g7_pop[['United Kingdom','United States']]

United Kingdom     64.511
United States     318.523
dtype: float64

Slicing also works, but important in Pandas the upper limit is also included

In [20]:
g7_pop['Canada':'Italy']

Canada     35.467
France     63.951
Germany    80.940
Italy      60.665
dtype: float64

Conditional Selection (Boolean Arrays)

In [21]:
g7_pop > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
dtype: bool

In [22]:
g7_pop[g7_pop > 70]

Germany           80.940
Japan            127.060
United States    318.523
dtype: float64

In [23]:
g7_pop[g7_pop > g7_pop.mean()]

Japan            127.060
United States    318.523
dtype: float64

In [24]:
g7_pop.mean()

107.30242857142856

Operations on Series

In [25]:
g7_pop * 2

Canada             70.934
France            127.902
Germany           161.880
Italy             121.330
Japan             254.120
United Kingdom    129.022
United States     637.046
dtype: float64

Logic OR & AND

In [26]:
g7_pop[(g7_pop > 80) & (g7_pop < 200)]

Germany     80.94
Japan      127.06
dtype: float64

Modifying Series

In [27]:
g7_pop['Canada'] = 40.5

In [28]:
g7_pop.iloc[-1] = 500

In [29]:
g7_pop

Canada             40.500
France             63.951
Germany            80.940
Italy              60.665
Japan             127.060
United Kingdom     64.511
United States     500.000
dtype: float64

In [30]:
g7_pop[g7_pop < 70]

Canada            40.500
France            63.951
Italy             60.665
United Kingdom    64.511
dtype: float64

In [31]:
g7_pop[g7_pop < 70] = 99.99

In [32]:
g7_pop

Canada             99.99
France             99.99
Germany            80.94
Italy              99.99
Japan             127.06
United Kingdom     99.99
United States     500.00
dtype: float64

Pandas DataFrame

In [4]:
df = pd.DataFrame({
    'Population': [35.467, 63.951,80.94,60.655,127.0671,64.511,318.523],
    'GDP': [1785387, 2833687,3874437,2167744,4602367,2950039,17348075],
    'Surface Area': [9984670, 640679,357114,301336,377930,242495,9525067],
    'HDI': [0.913,0.888,0.916,0.873,0.891,0.907,0.915],
    'Continent': ['America','Europe','Europe','Europe','Asia','Europe','America']
}, columns=['Population', 'GDP', 'Surface Area', 'HDI','Continent'])

In [5]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.655,2167744,301336,0.873,Europe
4,127.0671,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [6]:
### Providing custom indexes with Dataframe
df.index = [
    "Canada",
    "France",
    "Germany",
    "Italy",
    "Japan",
    "United Kingdom",
    "United States",
]

In [7]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.655,2167744,301336,0.873,Europe
Japan,127.0671,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [8]:
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [9]:
df.shape

(7, 5)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [11]:
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [12]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302014,5080248.0,3061327.0,0.900429
std,97.250976,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.303,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.00355,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [13]:
df.size

35

In [15]:
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [16]:
df.dtypes.value_counts()

float64    2
int64      2
object     1
Name: count, dtype: int64

Indexing, Selection and Slicing an DataFrame

In [19]:
### Require All Data associated with Canada
df.loc['Canada']

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [20]:
### Require Surface area of canada
df.loc['Canada','Surface Area']

9984670

In [21]:
df.iloc[-1]

Population       318.523
GDP             17348075
Surface Area     9525067
HDI                0.915
Continent        America
Name: United States, dtype: object

In [23]:
df['Population']

Canada             35.4670
France             63.9510
Germany            80.9400
Italy              60.6550
Japan             127.0671
United Kingdom     64.5110
United States     318.5230
Name: Population, dtype: float64

In [24]:
df['Population'].to_frame

<bound method Series.to_frame of Canada             35.4670
France             63.9510
Germany            80.9400
Italy              60.6550
Japan             127.0671
United Kingdom     64.5110
United States     318.5230
Name: Population, dtype: float64>

In [25]:
### Multiple column selection
df[['Population','HDI']]

Unnamed: 0,Population,HDI
Canada,35.467,0.913
France,63.951,0.888
Germany,80.94,0.916
Italy,60.655,0.873
Japan,127.0671,0.891
United Kingdom,64.511,0.907
United States,318.523,0.915


In [28]:
# iloc-- To access df via numeric fashion
## Using iloc - the slicing will be similar to pandas meaning 3 will not be considered
df.iloc[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [29]:
df.iloc[1:3, [0,3]]

Unnamed: 0,Population,HDI
France,63.951,0.888
Germany,80.94,0.916


In [30]:
df.iloc[1:3,1:3]

Unnamed: 0,GDP,Surface Area
France,2833687,640679
Germany,3874437,357114


Conditional Selection (Boolean Arrays)

In [31]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.655,2167744,301336,0.873,Europe
Japan,127.0671,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [32]:
df['Population'] > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [33]:
df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.0671,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [34]:
df.loc[df['Population']>70, ['Population','GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.0671,4602367
United States,318.523,17348075


Dropping Stuff
# Similar to selection, we can do droppin

In [35]:
df.drop('Canada')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.655,2167744,301336,0.873,Europe
Japan,127.0671,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [36]:
df.drop(['Canada','Japan'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.655,2167744,301336,0.873,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


Operations


In [37]:
df[['Population','GDP']] / 100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60655,21677.44
Japan,1.270671,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [38]:
crisis = pd.Series([-1_000_000,-0.3], index=['GDP','HDI'])
crisis

GDP   -1000000.0
HDI         -0.3
dtype: float64

In [39]:
df[['GDP','HDI']]

Unnamed: 0,GDP,HDI
Canada,1785387,0.913
France,2833687,0.888
Germany,3874437,0.916
Italy,2167744,0.873
Japan,4602367,0.891
United Kingdom,2950039,0.907
United States,17348075,0.915


In [40]:
df[['GDP','HDI']] + crisis

Unnamed: 0,GDP,HDI
Canada,785387.0,0.613
France,1833687.0,0.588
Germany,2874437.0,0.616
Italy,1167744.0,0.573
Japan,3602367.0,0.591
United Kingdom,1950039.0,0.607
United States,16348075.0,0.615


In [42]:
### In Pandas all operations will be ends up in new Dataframe
### If you want to track them assign them to another dataframe

In [43]:
langs = pd.Series(
    ['French','German','Italian'],
    index = ['France','Germany','Italy'],
    name = 'Language'
)

In [44]:
langs

France      French
Germany     German
Italy      Italian
Name: Language, dtype: object

In [45]:
df['Language'] = langs

In [46]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.655,2167744,301336,0.873,Europe,Italian
Japan,127.0671,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


Replacing values per Column

In [47]:
df['Language']= 'English'

In [48]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.655,2167744,301336,0.873,Europe,English
Japan,127.0671,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


Renaming Columns

In [49]:
df.rename(
    columns={
        'HDI':'Human Development Index',
        'Anual Popcorn Consumption':'APC'
    }, index = {
        'United States':'USA',
        'United Kingdom': 'UK',
        'Argentina':'AR'
    }
)

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.655,2167744,301336,0.873,Europe,English
Japan,127.0671,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


In [50]:
df.rename(index = str.upper)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
CANADA,35.467,1785387,9984670,0.913,America,English
FRANCE,63.951,2833687,640679,0.888,Europe,English
GERMANY,80.94,3874437,357114,0.916,Europe,English
ITALY,60.655,2167744,301336,0.873,Europe,English
JAPAN,127.0671,4602367,377930,0.891,Asia,English
UNITED KINGDOM,64.511,2950039,242495,0.907,Europe,English
UNITED STATES,318.523,17348075,9525067,0.915,America,English


Creating Columns from combination of columns

In [51]:
### Create GDP Per capita
df['GDP_PER_CAPITA'] = df['GDP']/df['Population']

In [52]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP_PER_CAPITA
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.655,2167744,301336,0.873,Europe,English,35738.916825
Japan,127.0671,4602367,377930,0.891,Asia,English,36219.973542
United Kingdom,64.511,2950039,242495,0.907,Europe,English,45729.239975
United States,318.523,17348075,9525067,0.915,America,English,54464.12033


Statistical info

In [53]:
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP_PER_CAPITA
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.655,2167744,301336,0.873,Europe,English,35738.916825
Japan,127.0671,4602367,377930,0.891,Asia,English,36219.973542


In [56]:
population = df['Population']
population.mean()

107.3020142857143

In [57]:
population.std()

97.25097581259239

In [58]:
population.median()

64.511

In [59]:
population.quantile(.25)

62.303