# Importing pandas

In [11]:
import pandas as pd
import numpy as np

# Using Series

In [15]:
# Populations of the G7 countries as a Series with the default integer index.
g7_pop = pd.Series([
    35.467,
    63.951,
    80.940,
    60.665,
    127.061,
    64.511,
    318.523,
])

In [19]:
# Label each value with its country, then give the Series a descriptive name.
g7_pop.index = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'UK', 'USA']
g7_pop.name = 'G7 Population in millions'

In [18]:
# Display the Series with its country labels and name.
g7_pop

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
USA        318.523
Name: G7 Population in millions, dtype: float64

In [21]:
# Look up a single value by its index label.
g7_pop['Canada']

35.467

In [22]:
# Positional access: -1 selects the last element.
g7_pop.iloc[-1]

318.523

# Using DF

In [25]:


# Build the G7 DataFrame column by column (one list entry per country);
# `columns` states the column order explicitly.
df = pd.DataFrame(
    {
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [27]:
# Replace the default integer index with country names, one per row in row order.
df.index = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom', 'United States']


In [28]:
# Display the full DataFrame with its country index.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [29]:
# Column labels of the DataFrame.
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [30]:
# Row labels (index) of the DataFrame.
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [40]:
# Print a summary: index range, columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [32]:
# Total number of elements (rows * columns).
df.size

35

In [33]:
# Dimensions of the DataFrame.
df.shape # (rows, columns)

(7, 5)

In [39]:
# Per-column statistics; the saved output covers the numeric columns only.
df.describe() # Summary of statistics

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [37]:
# Data type of each column.
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [41]:
# Count how many columns share each dtype.
df.dtypes.value_counts()

float64    2
int64      2
object     1
Name: count, dtype: int64

# DF operations

In [42]:
# Select a row by index label; the result is a Series named after that label.
df.loc['Canada']

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [43]:
# Select a row by integer position (0 is the first row, 'Canada').
df.iloc[0]

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [53]:
# Select several columns by passing a list of names; the result is a DataFrame.
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [58]:
# Label-based row slice for one column; the end label 'Italy' is included.
# to_frame() turns the resulting Series into a one-column DataFrame.
df.loc['France' : 'Italy', 'Population'].to_frame()

Unnamed: 0,Population
France,63.951
Germany,80.94
Italy,60.665


In [60]:
# Boolean mask: keep only the rows whose Population exceeds 70.
df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [68]:
# Same row filter, restricted to the Population column; passing the column
# as a list keeps the result a DataFrame.
df.loc[df['Population'] > 70, ['Population']]

Unnamed: 0,Population
Germany,80.94
Japan,127.061
United States,318.523


In [72]:
# Column arithmetic is vectorised: 10 is added to every row's Population.
# The result is shown as a new frame instead of being written back with `+=`:
# an in-place update is applied again every time the cell is re-run, and the
# outputs saved further down still show the original, unshifted values.
df.assign(Population=df['Population'] + 10)

In [71]:
# Display the DataFrame.
# NOTE(review): this cell's execution count (71) precedes that of the cell
# above it (72), so the saved output was produced before that cell last ran.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,25.467,1785387,9984670,0.913,America
France,53.951,2833687,640679,0.888,Europe
Germany,70.94,3874437,357114,0.916,Europe
Italy,50.665,2167744,301336,0.873,Europe
Japan,117.061,4602367,377930,0.891,Asia
United Kingdom,54.511,2950039,242495,0.907,Europe
United States,308.523,17348075,9525067,0.915,America


In [85]:
# Translate the column names to Spanish and shorten some of the country labels.
# 'Argentina' is not a label in the index, so that mapping entry has no effect.
df = df.rename(
    index={
        'Canada': 'CA',
        'France': 'FR',
        'United Kingdom': 'UK',
        'United States': 'USA',
        'Argentina': 'AR',
    },
    columns={
        'Population': 'Poblacion',
        'GDP': 'PIB',
        'Surface Area': 'Area',
        'HDI': 'IDH',
        'Continent': 'Continente',
    },
)

In [86]:
# Rename index labels with a function instead of a mapping.
# The result is only displayed; df is not reassigned, so its labels stay as they were.
df.rename(index=str.upper)

Unnamed: 0,Poblacion,PIB,Area,IDH,Continente
CA,35.467,1785387,9984670,0.913,America
FR,63.951,2833687,640679,0.888,Europe
GERMANY,80.94,3874437,357114,0.916,Europe
ITALY,60.665,2167744,301336,0.873,Europe
JAPAN,127.061,4602367,377930,0.891,Asia
UK,64.511,2950039,242495,0.907,Europe
USA,318.523,17348075,9525067,0.915,America


In [97]:
# Assigning a scalar to a new column name creates the column with that value in every row.
df['Language'] = 'English' # Add Column
df

Unnamed: 0,Poblacion,PIB,Area,IDH,Continente,Language
CA,35.467,1785387,9984670,0.913,America,English
FR,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


In [98]:
# Remove the column again; inplace=True mutates df instead of returning a new frame.
df.drop(columns='Language', inplace=True) # Delete Column
df

In [102]:
# Add a row by assigning to an index label that does not exist yet.
# Poblacion is expressed in millions (Canada is 35.467), so China's
# roughly 1.4 billion inhabitants are entered as 1_400.0, not 1_400_000_000.
# Columns missing from the Series are left as NaN for this row.
df.loc['China'] = pd.Series({'Poblacion': 1_400.0, 'Continente': 'Asia'}) # Add row

In [104]:
# Remove the 'China' row again, mutating df in place.
df.drop('China', inplace=True) # Delete row

In [106]:
# Display the DataFrame.
# NOTE(review): the saved output includes an 'IDH Per Capita' column that is
# only assigned in a later cell (In [137]); the cells were run out of order.
df

Unnamed: 0,Poblacion,PIB,Area,IDH,Continente,IDH Per Capita
CA,35.467,1785387.0,9984670.0,0.913,America,0.025742
FR,63.951,2833687.0,640679.0,0.888,Europe,0.013886
Germany,80.94,3874437.0,357114.0,0.916,Europe,0.011317
Italy,60.665,2167744.0,301336.0,0.873,Europe,0.014391
Japan,127.061,4602367.0,377930.0,0.891,Asia,0.007012
UK,64.511,2950039.0,242495.0,0.907,Europe,0.01406
USA,318.523,17348075.0,9525067.0,0.915,America,0.002873


In [107]:
# Languages for three of the countries only; the keys become the Series index.
langs = pd.Series(
    {'FR': 'French', 'Germany': 'German', 'Italy': 'Italian'},
    name='Language',
)

# Assignment aligns on the index: rows with no entry in `langs` get a missing value.
df['Language'] = langs


In [108]:
# Display df with the Language column; countries absent from `langs` show no value.
df

Unnamed: 0,Poblacion,PIB,Area,IDH,Continente,IDH Per Capita,Language
CA,35.467,1785387.0,9984670.0,0.913,America,0.025742,
FR,63.951,2833687.0,640679.0,0.888,Europe,0.013886,French
Germany,80.94,3874437.0,357114.0,0.916,Europe,0.011317,German
Italy,60.665,2167744.0,301336.0,0.873,Europe,0.014391,Italian
Japan,127.061,4602367.0,377930.0,0.891,Asia,0.007012,
UK,64.511,2950039.0,242495.0,0.907,Europe,0.01406,
USA,318.523,17348075.0,9525067.0,0.915,America,0.002873,


In [110]:
# Drop the Language column, mutating df in place.
df.drop(columns='Language', inplace=True)

In [111]:
# Display the DataFrame after the column was dropped.
df

Unnamed: 0,Poblacion,PIB,Area,IDH,Continente,IDH Per Capita
CA,35.467,1785387.0,9984670.0,0.913,America,0.025742
FR,63.951,2833687.0,640679.0,0.888,Europe,0.013886
Germany,80.94,3874437.0,357114.0,0.916,Europe,0.011317
Italy,60.665,2167744.0,301336.0,0.873,Europe,0.014391
Japan,127.061,4602367.0,377930.0,0.891,Asia,0.007012
UK,64.511,2950039.0,242495.0,0.907,Europe,0.01406
USA,318.523,17348075.0,9525067.0,0.915,America,0.002873


In [137]:
# Derived column: IDH divided by Poblacion, rounded to 6 decimal places.
df['IDH Per Capita'] = (df['IDH'] / df['Poblacion']).round(6)
df

Unnamed: 0,Poblacion,PIB,Area,IDH,Continente,IDH Per Capita
CA,35.467,1785387.0,9984670.0,0.913,America,0.025742
FR,63.951,2833687.0,640679.0,0.888,Europe,0.013886
Germany,80.94,3874437.0,357114.0,0.916,Europe,0.011317
Italy,60.665,2167744.0,301336.0,0.873,Europe,0.014391
Japan,127.061,4602367.0,377930.0,0.891,Asia,0.007012
UK,64.511,2950039.0,242495.0,0.907,Europe,0.01406
USA,318.523,17348075.0,9525067.0,0.915,America,0.002873


In [113]:
# Keep the population column as a Series for the statistics below.
population = df['Poblacion']

In [116]:
# Smallest and largest values, shown together as a tuple.
population.min(), population.max()


(35.467, 318.523)

In [118]:
# Total of all values.
population.sum()

751.118

In [119]:
# Mean computed by hand: the sum divided by the number of elements.
population.sum() / len(population)

107.30257142857144

In [120]:
# Built-in mean; gives the same value as sum() / len().
population.mean()

107.30257142857144

In [121]:
# Standard deviation of the values.
population.std()

97.24996987121581

In [122]:
# Median (middle value) of the values.
population.median()

64.511

In [124]:
# Variance of the values.
population.var()

9457.556639952383

In [126]:
# Summary statistics for the Series, converted to a one-column DataFrame for display.
population.describe().to_frame()

Unnamed: 0,Poblacion
count,7.0
mean,107.302571
std,97.24997
min,35.467
25%,62.308
50%,64.511
75%,104.0005
max,318.523


In [127]:
# First quartile (25th percentile).
population.quantile(.25)

62.308

In [128]:
# Several quantiles at once: a list of fractions returns a Series indexed by them.
population.quantile([.2, .4, .6, .8, 1])



0.2     61.3222
0.4     64.1750
0.6     74.3684
0.8    117.8368
1.0    318.5230
Name: Poblacion, dtype: float64

# Data cleaning

## nan and inf

In [138]:
# A tuple of values that Python treats as false in a boolean context.
falsy_values = (0, False, None, '', [], {})


For Python, all the values above are considered "falsy":


In [139]:
# any() is False because none of the elements is truthy.
any(falsy_values)

False

### How they work

NumPy has a special "nullable" value for numbers, np.nan. NaN stands for "Not a Number".



In [140]:
# NumPy's "not a number" marker.
np.nan

nan

The np.nan value is kind of a virus. Everything that it touches becomes np.nan:

In [142]:
# Arithmetic involving nan yields nan.
3 + np.nan

nan

In [143]:
# Array containing two nan entries among ordinary numbers.
a = np.array([1, 2, 3, np.nan, np.nan, 4])

In [144]:
# The nan entries propagate: the sum is nan.
a.sum()

nan

In [145]:
# The mean is nan as well.
a.mean()

nan

This is better than regular None values, which in the previous examples would have raised an exception:

In [146]:

# Unlike np.nan, None cannot take part in arithmetic: the addition raises TypeError.
# The exception is caught and printed so the demonstration does not stop a
# "Restart & Run All" of the notebook.
try:
    3 + None
except TypeError as err:
    none_sum_error = f"{type(err).__name__}: {err}"
    print(none_sum_error)

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

For a numeric array, the None value is replaced by np.nan:


In [147]:

# With a float dtype, None is stored as nan, exactly like np.nan itself.
a = np.array(
    [1, 2, 3, np.nan, None, 4],
    dtype=float,
)
a

array([ 1.,  2.,  3., nan, nan,  4.])

NumPy also supports an "infinite" value, np.inf, which also behaves like a virus:


In [151]:
# NumPy's positive infinity.
np.inf

inf

In [152]:
# Adding a finite number to inf gives inf.
3 + np.inf

inf

In [153]:
# Dividing inf by a finite number gives inf.
np.inf / 3

inf

In [154]:
# inf divided by inf is undefined, so the result is nan.
np.inf / np.inf

nan

In [156]:
# Array mixing inf and nan with ordinary numbers; its sum comes out as nan.
b = np.array([1.0, 2.0, 3.0, np.inf, np.nan, 4.0])

b.sum()

nan

### Checking for them

There are two functions: np.isnan and np.isinf that will perform the desired checks:

In [None]:
# Test whether a value is nan.
np.isnan(np.nan)

In [None]:
# Test whether a value is infinite.
np.isinf(np.inf)

Whenever you're trying to perform an operation with a Numpy array and you know there might be missing values, you'll need to filter them out before proceeding, to avoid nan propagation. We'll use a combination of the previous np.isnan + boolean arrays for this purpose:

In [None]:
# Boolean-mask indexing: flag the nan entries, then keep everything else.
a = np.array([1, 2, 3, np.nan, np.nan, 4])
is_missing = np.isnan(a)

a[~is_missing]

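Since this array contains no infinite values, this is equivalent to using np.isfinite (which filters out both nan and inf):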

In [None]:
# Keep only the finite entries of the array.
a[np.isfinite(a)]


And with that result, all the operations can now be performed:

In [None]:
# Sum over the finite entries only.
a[np.isfinite(a)].sum()

In [None]:
# Mean over the finite entries only.
a[np.isfinite(a)].mean()

## Process of cleaning