In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

Pandas has two main data structures: the Series and the DataFrame.

### Pandas Series

In [2]:
g7_pop = pd.Series([36.8, 50.8, 65.3, 69.7, 80.5, 88.65, 90.0])

In [3]:
g7_pop

0    36.80
1    50.80
2    65.30
3    69.70
4    80.50
5    88.65
6    90.00
dtype: float64

In [4]:
### Series can have a name
g7_pop.name = "G7 Population in Millions"

In [5]:
g7_pop

0    36.80
1    50.80
2    65.30
3    69.70
4    80.50
5    88.65
6    90.00
Name: G7 Population in Millions, dtype: float64

In [6]:
# check the data type (dtype) of the values

g7_pop.dtype

dtype('float64')

In [7]:
# .values returns the underlying data as a NumPy array (without the index labels)

g7_pop.values

array([36.8 , 50.8 , 65.3 , 69.7 , 80.5 , 88.65, 90.  ])

In [8]:
type(g7_pop.values)

numpy.ndarray

In [9]:
# with the default integer index, elements can be looked up by number, much like a Python list

g7_pop[0]

36.8

In [10]:
g7_pop[1]

50.8

In [11]:
# With the default integer index, g7_pop[...] looks the key up as a *label*,
# so -1 is not found and a KeyError is raised (it does not count from the end).
# Use .iloc[-1] or .tail(1) for position-based access from the end.
# The error is caught and shown so the notebook still runs top to bottom.
try:
    g7_pop[-1]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: -1

In [12]:
# to get elements from the end of a pandas Series, use the 'tail' method
g7_pop.tail(1)

# tail(1) returns the last element

6    90.0
Name: G7 Population in Millions, dtype: float64

In [13]:
g7_pop.tail(2)

# tail(2) returns the last two elements

5    88.65
6    90.00
Name: G7 Population in Millions, dtype: float64

In [14]:
g7_pop.index

RangeIndex(start=0, stop=7, step=1)

In [15]:
g7_pop

0    36.80
1    50.80
2    65.30
3    69.70
4    80.50
5    88.65
6    90.00
Name: G7 Population in Millions, dtype: float64

In [16]:
g7_pop[6]

90.0

In [17]:
# You can change or set the index of your pandas series


In [18]:
g7_pop.index = ['Canada', 'Brazil', 'Togo', 
                'Nigeria', 'Thailand', 'Japan', 'China']

In [19]:
g7_pop

Canada      36.80
Brazil      50.80
Togo        65.30
Nigeria     69.70
Thailand    80.50
Japan       88.65
China       90.00
Name: G7 Population in Millions, dtype: float64

In [20]:
# you can also set the index and the name when creating the Series

In [21]:
flo = pd.Series([25, 28, 35, 20],
               index=['Bola', 'Tunde', 'Femi', 'Bukky'],
               name = 'family')

In [22]:
flo

Bola     25
Tunde    28
Femi     35
Bukky    20
Name: family, dtype: int64

In [23]:
# you can also create a new series from an existing series by specifying the index

new = pd.Series(flo, index=['Tunde', 'Bukky'])

In [24]:
new

Tunde    28
Bukky    20
Name: family, dtype: int64

In [25]:
new['Tunde']

28

In [26]:
# in case you still want to look up an element by its integer position
# rather than by its index label, use the 'iloc' indexer.
# with 'iloc', negative indexing works

In [27]:
flo.iloc[0]

25

In [28]:
flo.iloc[-1]

20

In [29]:
flo

Bola     25
Tunde    28
Femi     35
Bukky    20
Name: family, dtype: int64

In [30]:
# you can also select multiple index labels at once
# by passing a list of labels, hence the double square brackets
flo[['Bola', 'Tunde']]

Bola     25
Tunde    28
Name: family, dtype: int64

In [31]:
## slicing also works on a pandas Series, with one difference from a Python list:
# a list slice excludes the upper limit, but a label-based slice on a
# pandas Series includes it

flo['Bola':'Femi']

Bola     25
Tunde    28
Femi     35
Name: family, dtype: int64

### Operations and Methods

In [32]:
# you can do arithmetic operations on a pandas Series

g7_pop

Canada      36.80
Brazil      50.80
Togo        65.30
Nigeria     69.70
Thailand    80.50
Japan       88.65
China       90.00
Name: G7 Population in Millions, dtype: float64

In [33]:
g7_pop * 1_000_000

Canada      36800000.0
Brazil      50800000.0
Togo        65300000.0
Nigeria     69700000.0
Thailand    80500000.0
Japan       88650000.0
China       90000000.0
Name: G7 Population in Millions, dtype: float64

In Python, there is no difference between the numbers 1000000 and 1_000_000. Both representations are just different ways of writing the same integer value.

The underscore character (_) in Python is used as a visual separator for large numbers to enhance readability. When you write a number with underscores, Python ignores the underscores and 
treats the value the same as if the underscores were not present.

So, 1000000 and 1_000_000 represent the exact same integer value of one million. The use of underscores is purely for making the number more readable and easier to comprehend for humans, especially when dealing with large numbers.

In [34]:
g7_pop

Canada      36.80
Brazil      50.80
Togo        65.30
Nigeria     69.70
Thailand    80.50
Japan       88.65
China       90.00
Name: G7 Population in Millions, dtype: float64

In [35]:
g7_pop > 70

Canada      False
Brazil      False
Togo        False
Nigeria     False
Thailand     True
Japan        True
China        True
Name: G7 Population in Millions, dtype: bool

In [36]:
# pandas series is also mutable

g7_pop['Nigeria'] = 200

In [37]:
g7_pop

Canada       36.80
Brazil       50.80
Togo         65.30
Nigeria     200.00
Thailand     80.50
Japan        88.65
China        90.00
Name: G7 Population in Millions, dtype: float64

### Pandas DataFrame

In [38]:
# How to create a dataframe

In [39]:
# One dict key per column; each list supplies that column's seven row values.
# `columns=` pins the column order explicitly (here it matches the dict order).
df = pd.DataFrame(
    {
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [40]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [41]:
# Create another dataframe

class4 = pd.DataFrame({
    'Last Name' : ['Owolabi', 'Akanle', 'Arikewuyo', 'Akinola'],
    'First Name' : ['Opeyemi', 'Eniola', 'Olashile', 'Temitope']
})

In [42]:
class4

Unnamed: 0,Last Name,First Name
0,Owolabi,Opeyemi
1,Akanle,Eniola
2,Arikewuyo,Olashile
3,Akinola,Temitope


In [43]:
class4.index = ['Supo', 'Supervisor', 'Analyst', 'Analyst']

In [44]:
class4

Unnamed: 0,Last Name,First Name
Supo,Owolabi,Opeyemi
Supervisor,Akanle,Eniola
Analyst,Arikewuyo,Olashile
Analyst,Akinola,Temitope


In [45]:
# df[...] selects *columns* by label, so class4[1] looks for a column named 1,
# finds none, and raises KeyError. Rows are selected with .loc[label] or
# .iloc[position] instead. The error is caught and shown so the notebook still
# runs top to bottom.
try:
    class4[1]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 1

In [46]:
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [47]:
class4.columns

Index(['Last Name', 'First Name'], dtype='object')

In [48]:
class4.index

Index(['Supo', 'Supervisor', 'Analyst', 'Analyst'], dtype='object')

In [49]:
class4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Supo to Analyst
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Last Name   4 non-null      object
 1   First Name  4 non-null      object
dtypes: object(2)
memory usage: 96.0+ bytes


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 408.0+ bytes


In [51]:
df.size

35

In [52]:
df.shape

(7, 5)

In [53]:
class4.shape

(4, 2)

In [54]:
df.describe() 
#describe() method will perform statistical calculation

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [55]:
class4.describe()

Unnamed: 0,Last Name,First Name
count,4,4
unique,4,4
top,Owolabi,Opeyemi
freq,1,1


In [56]:
class4.loc['Supo']

Last Name     Owolabi
First Name    Opeyemi
Name: Supo, dtype: object

In [57]:
df.loc[1]

Population       63.951
GDP             2833687
Surface Area     640679
HDI               0.888
Continent        Europe
Name: 1, dtype: object

In [58]:
df.iloc[-1]

Population       318.523
GDP             17348075
Surface Area     9525067
HDI                0.915
Continent        America
Name: 6, dtype: object

In [59]:
# loc selects rows by their index label
# iloc selects rows by their integer position

In [60]:
class4['Last Name']

Supo            Owolabi
Supervisor       Akanle
Analyst       Arikewuyo
Analyst         Akinola
Name: Last Name, dtype: object

In [61]:
# you can also select a column of the DataFrame by its column name, like a dict key

In [62]:
certificates_earned = pd.DataFrame({
    'Certificates': [8, 2, 5, 6],
    'Time (in months)': [16, 5, 9, 12]
})

certificates_earned.index = ['Tom', 'Kris', 'Ahmad', 'Beau']


In [63]:
certificates_earned

Unnamed: 0,Certificates,Time (in months)
Tom,8,16
Kris,2,5
Ahmad,5,9
Beau,6,12


In [64]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [65]:
df['Population'] > 70

0    False
1    False
2     True
3    False
4     True
5    False
6     True
Name: Population, dtype: bool

In [66]:
df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
2,80.94,3874437,357114,0.916,Europe
4,127.061,4602367,377930,0.891,Asia
6,318.523,17348075,9525067,0.915,America


In [67]:
df.loc[df['Population'] > 70, 'HDI']

2    0.916
4    0.891
6    0.915
Name: HDI, dtype: float64

#### Dropping stuff

The 'drop()' method returns a new DataFrame with the specified rows or columns removed.
The original DataFrame is left unchanged.

In [68]:
df.drop(1)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [69]:
class4.drop('Supo')

Unnamed: 0,Last Name,First Name
Supervisor,Akanle,Eniola
Analyst,Arikewuyo,Olashile
Analyst,Akinola,Temitope


In [70]:
class4

Unnamed: 0,Last Name,First Name
Supo,Owolabi,Opeyemi
Supervisor,Akanle,Eniola
Analyst,Arikewuyo,Olashile
Analyst,Akinola,Temitope


In [71]:
# the DataFrame returned by drop() can be assigned to a new variable

class5 = df.drop(3)

In [72]:
class5

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [73]:
# you can drop more than one row or column at a time by passing a list
df.drop([2,5])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
6,318.523,17348075,9525067,0.915,America


In [74]:
# to drop columns, use the 'columns=' keyword argument
df.drop(columns=['GDP', 'Continent'])

Unnamed: 0,Population,Surface Area,HDI
0,35.467,9984670,0.913
1,63.951,640679,0.888
2,80.94,357114,0.916
3,60.665,301336,0.873
4,127.061,377930,0.891
5,64.511,242495,0.907
6,318.523,9525067,0.915


In [75]:
# you can combine a Series and a DataFrame if they share the same index values

In [76]:
df[['Population','GDP']]

Unnamed: 0,Population,GDP
0,35.467,1785387
1,63.951,2833687
2,80.94,3874437
3,60.665,2167744
4,127.061,4602367
5,64.511,2950039
6,318.523,17348075


In [77]:
# assigning index name to each row
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [78]:
df.index=['Canada', 'France', 'Germany', 'Italy', 'Japan', 
         'UK', 'USA']

In [79]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
UK,64.511,2950039,242495,0.907,Europe
USA,318.523,17348075,9525067,0.915,America


#### Adding a New Column

In [82]:
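# define your new column with a pandas Series; it is matched to df by index label,
# so rows with no matching label in the Series (Canada, USA) are left empty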

lang = pd.Series(['French', 'German', 'Italian', 'Japanese', 
                  'English'],
                 index=['France', 'Germany', 'Italy', 'Japan',
                  'UK']
                )

In [83]:
lang

France       French
Germany      German
Italy       Italian
Japan      Japanese
UK          English
dtype: object

In [84]:
df['Language'] = lang

In [85]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,Japanese
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,


In [86]:
# assigning a single value sets every row of the column at once
df['Language'] = 'English'
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


In [87]:
## You can also rename a column or index using the 'rename' method

In [88]:
# rename() returns a renamed copy; df itself is not modified.
# Mapping keys that match no existing label ('Popo', 'China') are skipped
# without raising an error, so only 'GDP' and 'USA' are renamed here.
df.rename(
    columns={'GDP': 'Gross DP', 'Popo': 'Popolut'},
    index={'USA': 'United SA', 'China': 'Chinese'},
)

Unnamed: 0,Population,Gross DP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
United SA,318.523,17348075,9525067,0.915,America,English


In [89]:
# exercise to test knowledge

# Row labels shared by the frame and by the Series added below.
names = ['Tom', 'Kris', 'Ahmad', 'Beau']

# Start with two columns and four rows, labelled by `names`.
certificates_earned = pd.DataFrame(
    {
        'Certificates': [8, 2, 5, 6],
        'Time (in months)': [16, 5, 9, 12],
    },
    index=names,
)

# Add a third column from a Series that carries the same index labels.
longest_streak = pd.Series([13, 11, 9, 7], index=names)
certificates_earned['Longest streak'] = longest_streak

# The frame now has three columns and four rows.
print(certificates_earned)

       Certificates  Time (in months)  Longest streak
Tom               8                16              13
Kris              2                 5              11
Ahmad             5                 9               9
Beau              6                12               7


### Creating columns from other columns

We can create a new column from other existing columns.

In [91]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


In [96]:
df[['GDP','Population']]

Unnamed: 0,GDP,Population
Canada,1785387,35.467
France,2833687,63.951
Germany,3874437,80.94
Italy,2167744,60.665
Japan,4602367,127.061
UK,2950039,64.511
USA,17348075,318.523


In [97]:
df['GDP'] / df['Population']

Canada     50339.385908
France     44310.284437
Germany    47868.013343
Italy      35733.025633
Japan      36221.712406
UK         45729.239975
USA        54464.120330
dtype: float64

The result of the above can be assigned to a new column


In [98]:
df['GDP/Population'] = df['GDP'] / df['Population']

In [99]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP/Population
Canada,35.467,1785387,9984670,0.913,America,English,50339.385908
France,63.951,2833687,640679,0.888,Europe,English,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,English,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,English,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,English,36221.712406
UK,64.511,2950039,242495,0.907,Europe,English,45729.239975
USA,318.523,17348075,9525067,0.915,America,English,54464.12033


In [100]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI,GDP/Population
count,7.0,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429,44952.254576
std,97.24997,5494020.0,4576187.0,0.016592,6954.983875
min,35.467,1785387.0,242495.0,0.873,35733.025633
25%,62.308,2500716.0,329225.0,0.8895,40265.998421
50%,64.511,2950039.0,377930.0,0.907,45729.239975
75%,104.0005,4238402.0,5082873.0,0.914,49103.699626
max,318.523,17348080.0,9984670.0,0.916,54464.12033


In [101]:
# you can also perform statistical analysis on a DataFrame

In [102]:
df.max()

Population            318.523
GDP                  17348075
Surface Area          9984670
HDI                     0.916
Continent              Europe
Language              English
GDP/Population    54464.12033
dtype: object

In [104]:
df.min()

Population              35.467
GDP                    1785387
Surface Area            242495
HDI                      0.873
Continent              America
Language               English
GDP/Population    35733.025633
dtype: object

In [107]:
df['Population'].sum()

751.118

In [108]:
df['GDP'].min()

1785387

In [109]:
df['Population'].describe()

count      7.000000
mean     107.302571
std       97.249970
min       35.467000
25%       62.308000
50%       64.511000
75%      104.000500
max      318.523000
Name: Population, dtype: float64

### Using Matplotlib in jupyter notebook

You can read different file types (Excel, CSV, SQL) in Python using pandas.

In [4]:
pd.read_csv