In [1]:
# INTRODUCING DATAFRAMES

import pandas as pd

names = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
dr =  [True, False, False, False, True, True, True]
cpc = [809, 731, 588, 18, 200, 70, 45]
my_dict = {'country': names, 'drives_right': dr, 'cars_per_cap': cpc}

cars = pd.DataFrame(my_dict)

In [2]:
cars.shape    # this is an attribute 
cars.info()
cars.describe()    # quick statistics for the df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   country       7 non-null      object
 1   drives_right  7 non-null      bool  
 2   cars_per_cap  7 non-null      int64 
dtypes: bool(1), int64(1), object(1)
memory usage: 247.0+ bytes


Unnamed: 0,cars_per_cap
count,7.0
mean,351.571429
std,345.595552
min,18.0
25%,57.5
50%,200.0
75%,659.5
max,809.0


In [3]:
# Parts of a DataFrame
cars.index
cars.columns
cars.values

array([['United States', True, 809],
       ['Australia', False, 731],
       ['Japan', False, 588],
       ['India', False, 18],
       ['Russia', True, 200],
       ['Morocco', True, 70],
       ['Egypt', True, 45]], dtype=object)

In [4]:
# Sorting rows
cars.sort_values("cars_per_cap", ascending = True)
cars.sort_values(["country", "drives_right"], ascending = [True, False])

Unnamed: 0,country,drives_right,cars_per_cap
1,Australia,False,731
6,Egypt,True,45
3,India,False,18
2,Japan,False,588
5,Morocco,True,70
4,Russia,True,200
0,United States,True,809


In [5]:
# Subsetting colums 
cars[["country","drives_right"]]    # select two colums
cars["cars_per_cap"] > 500    # display booleans
cars[cars["cars_per_cap"] > 500]    # filtering


Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588


In [13]:
# Subsetting based on multiple conditions
drives_right = cars.loc[(cars["drives_right"] == 'False')]
cars_per_cap_500 = cars.loc[(cars["cars_per_cap"] > 500)]

cars[drives_right & cars_per_cap_500]

Unnamed: 0,country,drives_right,cars_per_cap
0,,,
1,,,
2,,,
3,,,
4,,,
5,,,
6,,,


In [7]:
# Subsetting using .isin()
japan_and_us = cars["country"].isin(["Japan", "Australia"])
cars[japan_and_us]

Unnamed: 0,country,drives_right,cars_per_cap
1,Australia,False,731
2,Japan,False,588


In [22]:
# Adding new columns
cars["COUNTRY"] = cars["country"].str.upper()
cars


Unnamed: 0,country,drives_right,cars_per_cap,COUNTRY
0,United States,True,809,UNITED STATES
1,Australia,False,731,AUSTRALIA
2,Japan,False,588,JAPAN
3,India,False,18,INDIA
4,Russia,True,200,RUSSIA
5,Morocco,True,70,MOROCCO
6,Egypt,True,45,EGYPT


In [25]:
# SUMMARRY STATISTICS 

cars["cars_per_cap"].mean()
cars["cars_per_cap"].median()
cars["cars_per_cap"].max()
cars["cars_per_cap"].min()
cars["cars_per_cap"].mode()
cars["cars_per_cap"].var()
cars["cars_per_cap"].std()

345.59555222005633

In [32]:
# The .agg() method
def pct30(column):
    return column.quantile(0.3)
def pct50(column):
    return column.quantile(0.5)

cars["cars_per_cap"].agg([pct30, pct50])

pct30     65.0
pct50    200.0
Name: cars_per_cap, dtype: float64

In [33]:
cars["cars_per_cap"].cumsum()

0     809
1    1540
2    2128
3    2146
4    2346
5    2416
6    2461
Name: cars_per_cap, dtype: int64