# Pandas Basics


In [1]:
import io
import pathlib
import requests
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
ROOT_DIR = pathlib.Path("..")

EXAMPLES_URL = "https://github.com/KAUST-Academy/python-for-data-analysis/raw/moi-bridging-program-cohort-2/examples.zip"

# Download the examples archive. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP error
# (e.g. a 404 page) into a clear exception instead of a confusing BadZipFile
# when the response body is not actually a zip archive.
response = requests.get(EXAMPLES_URL, timeout=60)
response.raise_for_status()

# Unpack the in-memory archive into ROOT_DIR; the context manager closes the
# archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall(ROOT_DIR)

In [3]:
# Folder under ROOT_DIR that is expected to hold the extracted example files.
EXAMPLES_DIR = ROOT_DIR / "examples"

In [4]:
!ls $EXAMPLES_DIR 

array_ex.txt	ex4.csv			    ipython_bug.py  test_file.csv
csv_mindex.csv	ex5.csv			    macrodata.csv   tips.csv
ex1.csv		ex6.csv			    segismundo.txt  tseries.csv
ex1.xlsx	ex7.csv			    spx.csv	    volume.csv
ex2.csv		example.json		    stinkbug.png    yahoo_price.pkl
ex3.txt		fdic_failed_bank_list.html  stock_px.csv    yahoo_volume.pkl


## Pandas Data Structures

In [5]:
# Without an explicit index, the Series is labelled 0..n-1.
s0 = pd.Series(data=[4, 7, -5, 3])
s0

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
s0.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Pair each value with an explicit string label instead of the default 0..n-1.
s1 = pd.Series(data=[4, 5, -6, 3], index=list("dcab"))

In [8]:
s1

d    4
c    5
a   -6
b    3
dtype: int64

In [9]:
s1['a']

-6

In [10]:
s1 > 0

d     True
c     True
a    False
b     True
dtype: bool

In [11]:
s1[s1 > 0]

d    4
c    5
b    3
dtype: int64

In [12]:
np.exp(s1)

d     54.598150
c    148.413159
a      0.002479
b     20.085537
dtype: float64

In [15]:
# Example figures keyed by province name; the dict keys become the index.
# NOTE(review): "Tabuk" is the only float literal here, and that single float
# makes the resulting Series float64 instead of int64 — confirm it is intended.
data = {
    "Makkah": 35_000,
    "Riyadh": 50_000,
    "Eastern": 34_000,
    "Tabuk": 61_234.0,
    "Jazan": 12_312,
    "Al Jouf": 13_455,
    "Asir": 636_723,
    "Medina": 657_845,
}

s2 = pd.Series(data=data)

In [16]:
s2

Makkah      35000.0
Riyadh      50000.0
Eastern     34000.0
Tabuk       61234.0
Jazan       12312.0
Al Jouf     13455.0
Asir       636723.0
Medina     657845.0
dtype: float64

In [17]:
# Build the Series from `data` using an explicit list of labels. "Qassim" is
# not a key in `data`, so it shows up with a missing value (NaN) in the result.
s3 = pd.Series(
    data,
    index=[
        "Makkah",
        "Riyadh",
        "Eastern",
        "Tabuk",
        "Jazan",
        "Al Jouf",
        "Asir",
        "Medina",
        "Qassim",
    ]
)

In [18]:
s3

Makkah      35000.0
Riyadh      50000.0
Eastern     34000.0
Tabuk       61234.0
Jazan       12312.0
Al Jouf     13455.0
Asir       636723.0
Medina     657845.0
Qassim          NaN
dtype: float64

In [19]:
s3.to_dict()

{'Makkah': 35000.0,
 'Riyadh': 50000.0,
 'Eastern': 34000.0,
 'Tabuk': 61234.0,
 'Jazan': 12312.0,
 'Al Jouf': 13455.0,
 'Asir': 636723.0,
 'Medina': 657845.0,
 'Qassim': nan}

In [20]:
s3.isna()

Makkah     False
Riyadh     False
Eastern    False
Tabuk      False
Jazan      False
Al Jouf    False
Asir       False
Medina     False
Qassim      True
dtype: bool

In [21]:
# Name the values and the index; both names appear when the Series is displayed.
s3.name = "Population"
s3.index.name = "Province"

In [22]:
s3

Province
Makkah      35000.0
Riyadh      50000.0
Eastern     34000.0
Tabuk       61234.0
Jazan       12312.0
Al Jouf     13455.0
Asir       636723.0
Medina     657845.0
Qassim          NaN
Name: Population, dtype: float64

### Exercise

Create a Series whose index contains all the provinces of Saudi Arabia and whose values are the actual populations of those provinces.

In [23]:
# Population per region; the dict keys become the index of the Series.
# NOTE(review): "Medina Region" and "Al-Qassim Region" carry exactly the same
# figure (1_423_935), which looks like a copy-paste slip — verify both values
# against the census source before relying on the aggregates computed from s4.
data = {
    "Mecca Region": 8_557_766,
    "Riyadh Region": 8_216_284,
    "Eastern Region": 4_900_325,
    "'Asir Region": 2_211_875,
    "Jazan Region": 1_567_547,
    "Medina Region": 1_423_935,
    "Al-Qassim Region": 1_423_935,
    "Tabuk Region": 910_030,
    "Ha'il Region": 699_774,
    "Najran Region": 582_243,
    "Al-Jawf Region": 508_475,
    "Al-Bahah Region": 476_172,
    "Northern Borders": 365_231,
}

# Build the Series first, then attach its name.
s4 = pd.Series(data).rename("Population")

In [24]:
s4

Mecca Region        8557766
Riyadh Region       8216284
Eastern Region      4900325
'Asir Region        2211875
Jazan Region        1567547
Medina Region       1423935
Al-Qassim Region    1423935
Tabuk Region         910030
Ha'il Region         699774
Najran Region        582243
Al-Jawf Region       508475
Al-Bahah Region      476172
Northern Borders     365231
Name: Population, dtype: int64

In [25]:
s4.sum()

31843592

In [26]:
s4.mean()

2449507.076923077

In [27]:
s4.max()

8557766

In [28]:
s4.idxmax()

'Mecca Region'

In [29]:
# Column-oriented input: each key becomes a column and each list holds that
# column's values, one entry per row (three Makkah rows, then two Riyadh rows).
data = {
    "province": ["Makkah"] * 3 + ["Riyadh"] * 2,
    "population": [8_557_766] * 3 + [8_216_284] * 2,
    "area (km^2)": [153_128] * 3 + [404_240] * 2,
}

df0 = pd.DataFrame(data=data)

In [30]:
df0

Unnamed: 0,province,population,area (km^2)
0,Makkah,8557766,153128
1,Makkah,8557766,153128
2,Makkah,8557766,153128
3,Riyadh,8216284,404240
4,Riyadh,8216284,404240


In [32]:
df0.head(n=3)

Unnamed: 0,province,population,area (km^2)
0,Makkah,8557766,153128
1,Makkah,8557766,153128
2,Makkah,8557766,153128


In [33]:
df0.tail(n=2)

Unnamed: 0,province,population,area (km^2)
3,Riyadh,8216284,404240
4,Riyadh,8216284,404240


In [34]:
df0.columns

Index(['province', 'population', 'area (km^2)'], dtype='object')

In [35]:
df0.index

RangeIndex(start=0, stop=5, step=1)

In [38]:
# "gdp" is not a key in `data`, so that column is created with missing values.
df1 = pd.DataFrame(data, columns=["province", "population", "area (km^2)", "gdp"]) 

In [39]:
df1

Unnamed: 0,province,population,area (km^2),gdp
0,Makkah,8557766,153128,
1,Makkah,8557766,153128,
2,Makkah,8557766,153128,
3,Riyadh,8216284,404240,
4,Riyadh,8216284,404240,


In [41]:
df1["population"]

0    8557766
1    8557766
2    8557766
3    8216284
4    8216284
Name: population, dtype: int64

In [42]:
df1.loc[2, "province"]

'Makkah'

In [43]:
df1.loc[1:3, :]

Unnamed: 0,province,population,area (km^2),gdp
1,Makkah,8557766,153128,
2,Makkah,8557766,153128,
3,Riyadh,8216284,404240,


In [44]:
# Assigning a scalar fills every row of the column with that value.
df1["gdp"] = 0.0

In [45]:
df1

Unnamed: 0,province,population,area (km^2),gdp
0,Makkah,8557766,153128,0.0
1,Makkah,8557766,153128,0.0
2,Makkah,8557766,153128,0.0
3,Riyadh,8216284,404240,0.0
4,Riyadh,8216284,404240,0.0


In [46]:
# Remove the column in place. This cell is not re-runnable: once "gdp" is
# gone, running it a second time raises KeyError.
del df1["gdp"]

In [47]:
df1

Unnamed: 0,province,population,area (km^2)
0,Makkah,8557766,153128
1,Makkah,8557766,153128
2,Makkah,8557766,153128
3,Riyadh,8216284,404240
4,Riyadh,8216284,404240


In [None]:
###

### Exercise

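Make the table of Saudi Arabia's regions (historic region, region, capital, governorates, marakiz, 2017 census population, and approximate area) into a pandas DataFrame.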

In [52]:
columns = [
    "Historic Region", "Regions", "Capital", "Governorates", "Marakiz",
    "Population (2017 Census)", "Approximate area (km2)",
]

# One row per region, with values in the same order as `columns`.
# NOTE(review): the "Medina Region" and "Al-Qassim Region" rows carry exactly
# the same population (1_423_935), which looks like a copy-paste slip — verify
# both against the census source.
region_rows = [
    ["Hejaz", "Mecca Region", "Makkah", 16, 111, 8_557_766, 153_128],
    ["Najd", "Riyadh Region", "Riyadh", 21, 453, 8_216_284, 404_240],
    ["Eastern Arabia", "Eastern Region", "Dammam", 11, 107, 4_900_325, 672_522],
    ["Southern Arabia", "'Asir Region", "Abha", 16, 101, 2_211_875, 76_693],
    ["Southern Arabia", "Jazan Region", "Jazan", 16, 31, 1_567_547, 11_671],
    ["Hejaz", "Medina Region", "Madinah", 8, 90, 1_423_935, 151_990],
    ["Najd", "Al-Qassim Region", "Buraidah", 12, 153, 1_423_935, 58_046],
    ["Hejaz", "Tabuk Region", "Tabuk", 6, 73, 910_030, 146_072],
    ["Najd", "Ha'il Region", "Ha'il", 8, 84, 699_774, 103_887],
    ["Southern Arabia", "Najran Region", "Najran", 7, 59, 582_243, 149_511],
    ["Badiah", "Al-Jawf Region", "Sakaka", 3, 33, 508_475, 100_212],
    ["Hejaz", "Al-Bahah Region", "Al-Baha", 9, 35, 476_172, 9_921],
    ["Badiah", "Northern Borders Region", "Arar", 3, 17, 365_231, 111_797],
]

# Key each row by its position so that orient="index" uses 0..12 as row labels.
data = dict(enumerate(region_rows))

df2 = pd.DataFrame.from_dict(data, orient="index", columns=columns)

In [53]:
df2

Unnamed: 0,Historic Region,Regions,Capital,Governorates,Marakiz,Population (2017 Census),Approximate area (km2)
0,Hejaz,Mecca Region,Makkah,16,111,8557766,153128
1,Najd,Riyadh Region,Riyadh,21,453,8216284,404240
2,Eastern Arabia,Eastern Region,Dammam,11,107,4900325,672522
3,Southern Arabia,'Asir Region,Abha,16,101,2211875,76693
4,Southern Arabia,Jazan Region,Jazan,16,31,1567547,11671
5,Hejaz,Medina Region,Madinah,8,90,1423935,151990
6,Najd,Al-Qassim Region,Buraidah,12,153,1423935,58046
7,Hejaz,Tabuk Region,Tabuk,6,73,910030,146072
8,Najd,Ha'il Region,Ha'il,8,84,699774,103887
9,Southern Arabia,Najran Region,Najran,7,59,582243,149511
