In [45]:
# INTRODUCING DATAFRAMES

import pandas as pd

names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr =  [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]
my_dict = {'country': names, 'drives_right': dr, 'cars_per_cap': cpc}

cars = pd.DataFrame(my_dict)

In [46]:
cars.shape    # this is an attribute 
cars.info()
cars.describe()    # quick statistics for the df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   country       7 non-null      object
 1   drives_right  7 non-null      bool  
 2   cars_per_cap  7 non-null      int64 
dtypes: bool(1), int64(1), object(1)
memory usage: 247.0+ bytes


Unnamed: 0,cars_per_cap
count,7.0
mean,351.571429
std,345.595552
min,18.0
25%,57.5
50%,200.0
75%,659.5
max,809.0


In [47]:
# Parts of a DataFrame
cars.index
cars.columns
cars.values

array([['United States', True, 809],
       ['Australia', False, 731],
       ['Japan', False, 588],
       ['India', False, 18],
       ['Russia', True, 200],
       ['Morocco', True, 70],
       ['Egypt', True, 45]], dtype=object)

In [48]:
# Sorting rows
cars.sort_values("cars_per_cap", ascending = True)
cars.sort_values(["country", "drives_right"], ascending = [True, False])

Unnamed: 0,country,drives_right,cars_per_cap
1,Australia,False,731
6,Egypt,True,45
3,India,False,18
2,Japan,False,588
5,Morocco,True,70
4,Russia,True,200
0,United States,True,809


In [49]:
# Subsetting colums 
cars[["country","drives_right"]]    # select two colums
cars["cars_per_cap"] > 500    # display booleans
cars[cars["cars_per_cap"] > 500]    # filtering


Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588


In [50]:
# Subsetting based on multiple conditions
drives_right = cars.loc[(cars["drives_right"] == 'False')]
cars_per_cap_500 = cars.loc[(cars["cars_per_cap"] > 500)]

cars[drives_right & cars_per_cap_500]

Unnamed: 0,country,drives_right,cars_per_cap
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,


In [51]:
# Subsetting using .isin()
japan_and_us = cars["country"].isin(["Japan", "Australia"])
cars[japan_and_us]

Unnamed: 0,country,drives_right,cars_per_cap
1,Australia,False,731
2,Japan,False,588


In [52]:
# Adding new columns
cars["COUNTRY"] = cars["country"].str.upper()
cars


Unnamed: 0,country,drives_right,cars_per_cap,COUNTRY
0,United States,True,809,UNITED STATES
1,Australia,False,731,AUSTRALIA
2,Japan,False,588,JAPAN
3,India,False,18,INDIA
4,Russia,True,200,RUSSIA
5,Morocco,True,70,MOROCCO
6,Egypt,True,45,EGYPT


In [53]:
# SUMMARRY STATISTICS 

cars["cars_per_cap"].mean()
cars["cars_per_cap"].median()
cars["cars_per_cap"].max()
cars["cars_per_cap"].min()
cars["cars_per_cap"].mode()
cars["cars_per_cap"].var()
cars["cars_per_cap"].std()

345.59555222005633

In [54]:
# The .agg() method
# allows you to apply your own custom functions to a DataFrame

def pct30(column):
    return column.quantile(0.3)
def pct50(column):
    return column.quantile(0.5)

cars["cars_per_cap"].agg([pct30, pct50])    # can imply to multiples columns

pct30     65.0
pct50    200.0
Name: cars_per_cap, dtype: float64

In [55]:
# Cumulative sum
cars["cars_per_cap"].cumsum()

0     809
1    1540
2    2128
3    2146
4    2346
5    2416
6    2461
Name: cars_per_cap, dtype: int64

In [56]:
# Counting

# Drop duplicate names
cars.drop_duplicates(subset="drives_right")    # subset = determine the criteria, can be multiple

Unnamed: 0,country,drives_right,cars_per_cap,COUNTRY
0,United States,True,809,UNITED STATES
1,Australia,False,731,AUSTRALIA


In [57]:
cars["country"].value_counts()   
cars["drives_right"].value_counts(sort=True) 

True     4
False    3
Name: drives_right, dtype: int64

In [58]:
# Proportions
cars["drives_right"].value_counts(normalize=True) 

True     0.571429
False    0.428571
Name: drives_right, dtype: float64

In [59]:
# Grouped summary statistics 
cars[cars["drives_right"] == True]["cars_per_cap"].mean()
# and more elegant...
cars.groupby("drives_right")["cars_per_cap"].mean()

drives_right
False    445.666667
True     281.000000
Name: cars_per_cap, dtype: float64

In [60]:
# Can be combined with the agg method
cars.groupby("drives_right")["cars_per_cap"].agg([min, max, sum])

Unnamed: 0_level_0,min,max,sum
drives_right,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,18,731,1337
True,45,809,1124


In [61]:
cars.groupby(["drives_right","country"])["cars_per_cap"].mean()

drives_right  country      
False         Australia        731.0
              India             18.0
              Japan            588.0
True          Egypt             45.0
              Morocco           70.0
              Russia           200.0
              United States    809.0
Name: cars_per_cap, dtype: float64

In [62]:
# Pivot Tables 

cars.pivot_table(values="cars_per_cap", index="drives_right")    # values=summarized data, index=column you want to group
# NB: mean is the default displayed value

Unnamed: 0_level_0,cars_per_cap
drives_right,Unnamed: 1_level_1
False,445.666667
True,281.0


In [63]:
import numpy as np
cars.pivot_table(values="cars_per_cap", index="drives_right", aggfunc=[np.mean, np.median])    # aggfunc to change the nature of the statistics

Unnamed: 0_level_0,mean,median
Unnamed: 0_level_1,cars_per_cap,cars_per_cap
drives_right,Unnamed: 1_level_2,Unnamed: 2_level_2
False,445.666667,588
True,281.0,135


In [64]:
# Pivot on two variables
cars.pivot_table(values="cars_per_cap", index="drives_right", columns="country", fill_value=0, margins=True)
# fill value to replace the NaN
# margin = mean value of all row/colum elements

country,Australia,Egypt,India,Japan,Morocco,Russia,United States,All
drives_right,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,731,0,18,588,0,0,0,445.666667
True,0,45,0,0,70,200,809,281.0
All,731,45,18,588,70,200,809,351.571429


In [65]:
# EXPLICIT INDEXES

cars_ind = cars.set_index("country")   # setting a column as the index
cars_ind


Unnamed: 0_level_0,drives_right,cars_per_cap,COUNTRY
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
United States,True,809,UNITED STATES
Australia,False,731,AUSTRALIA
Japan,False,588,JAPAN
India,False,18,INDIA
Russia,True,200,RUSSIA
Morocco,True,70,MOROCCO
Egypt,True,45,EGYPT


In [66]:
cars_ind.reset_index()   # reset the index 
# drop = True as a parameter removes definitely the index set

Unnamed: 0,country,drives_right,cars_per_cap,COUNTRY
0,United States,True,809,UNITED STATES
1,Australia,False,731,AUSTRALIA
2,Japan,False,588,JAPAN
3,India,False,18,INDIA
4,Russia,True,200,RUSSIA
5,Morocco,True,70,MOROCCO
6,Egypt,True,45,EGYPT


In [67]:
# Indexes make subsetting simpler
cars_ind.loc[["Australia","Japan"]]   # example with loc

Unnamed: 0_level_0,drives_right,cars_per_cap,COUNTRY
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,False,731,AUSTRALIA
Japan,False,588,JAPAN


In [71]:
# Multi-level indexes
cars_ind2 = cars.set_index(["country","drives_right"])   # note the []
cars_ind2

Unnamed: 0_level_0,Unnamed: 1_level_0,cars_per_cap,COUNTRY
country,drives_right,Unnamed: 2_level_1,Unnamed: 3_level_1
United States,True,809,UNITED STATES
Australia,False,731,AUSTRALIA
Japan,False,588,JAPAN
India,False,18,INDIA
Russia,True,200,RUSSIA
Morocco,True,70,MOROCCO
Egypt,True,45,EGYPT


In [76]:
# Subset inner levels with a list of tuples
cars_ind2.loc[[("Australia", False), ("Japan", False)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,cars_per_cap,COUNTRY
country,drives_right,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,False,731,AUSTRALIA
Japan,False,588,JAPAN


In [79]:
# Sorting by index values
cars_ind2.sort_index(level=["country", "drives_right"], ascending=[False, False])


Unnamed: 0_level_0,Unnamed: 1_level_0,cars_per_cap,COUNTRY
country,drives_right,Unnamed: 2_level_1,Unnamed: 3_level_1
United States,True,809,UNITED STATES
Russia,True,200,RUSSIA
Morocco,True,70,MOROCCO
Japan,False,588,JAPAN
India,False,18,INDIA
Egypt,True,45,EGYPT
Australia,False,731,AUSTRALIA
