# Intro to Pandas

Pandas is an open-source library. It provides data structures that offer a fast and efficient way of performing various data manipulation tasks.

Official website: <br>
https://pandas.pydata.org/



### Install and Import Pandas

In [None]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.5.2-cp39-cp39-win_amd64.whl (10.9 MB)
     ---------------------------------------- 10.9/10.9 MB 1.3 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2022.7-py2.py3-none-any.whl (499 kB)
     -------------------------------------- 499.4/499.4 kB 1.2 MB/s eta 0:00:00
Installing collected packages: pytz, pandas
Successfully installed pandas-1.5.2 pytz-2022.7


In [None]:
import pandas as pd

pd.__version__

'1.5.2'

### Series <br>
A Series is a one-dimensional labeled array in pandas.
A series can hold numbers, strings, lists, dictionaries, NumPy arrays, etc.

In [None]:
import numpy as np
import pandas as pd

#### Create a series

In [None]:
# build a series from a plain Python list; the values get the default integer index
series_test = pd.Series(data=[1, 2, 3, 4, 5])
series_test

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [None]:
another_series = pd.Series(np.arange(5)) # create series from numpy array
another_series

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [None]:
series_test.index # get index of the series

RangeIndex(start=0, stop=5, step=1)

In [None]:
series_test.values # get the values of series

array([1, 2, 3, 4, 5], dtype=int64)

In [None]:
series_dict = pd.Series({'a':1, 'b':2}) # create series from dictionary
series_dict

a    1
b    2
dtype: int64

In [None]:
print(series_dict.index) # keys of dictionary become index
print(series_dict.values) # values of dictionary are also the values of the series

Index(['a', 'b'], dtype='object')
[1 2]


In [None]:
# create a series whose index holds custom labels instead of the default 0..n-1
cars_prod = pd.Series(
    data=[7000, 3000, 4000, 5000, 4000],
    index=['Ford', 'Lexus', 'Volvo', 'Chervolet', 'Kia'],
)
cars_prod

Ford         7000
Lexus        3000
Volvo        4000
Chervolet    5000
Kia          4000
dtype: int64

In [None]:
cars_prod_dict = cars_prod.to_dict() # convert series to dictionary
cars_prod_dict

{'Ford': 7000, 'Lexus': 3000, 'Volvo': 4000, 'Chervolet': 5000, 'Kia': 4000}

In [None]:
cars_prod_series = pd.Series(cars_prod_dict) # change dictionary back to series
cars_prod_series

Ford         7000
Lexus        3000
Volvo        4000
Chervolet    5000
Kia          4000
dtype: int64

In [None]:
cars_prod.name = "Car Production" # assign a name to series
cars_prod

Ford         7000
Lexus        3000
Volvo        4000
Chervolet    5000
Kia          4000
Name: Car Production, dtype: int64

In [None]:
cars_prod.index.name = 'Cars' #assign a name to index
cars_prod

Cars
Ford         7000
Lexus        3000
Volvo        4000
Chervolet    5000
Kia          4000
Name: Car Production, dtype: int64

### DataFrames

DataFrames are another core object of Pandas

In [None]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [None]:
# get some test data from wikipedia to run our examples
# (this cell needs network access)

from pandas import read_html
website = 'https://en.wikipedia.org/wiki/List_of_highest_points_of_Asian_countries'
webpage_list = read_html(website) # public API; returns a list of dataframes
# .copy() makes highest_points an independent dataframe, so adding or changing
# columns later does not trigger SettingWithCopyWarning
highest_points = webpage_list[0].head(10).copy() # get the first 10 rows to work with
highest_points

Unnamed: 0,Rank,Country,Highest point,Elevation
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)"
1,,Armenia,Aragats,"4,090 m (13,419 ft)"
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)"
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)"
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)"
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)"
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)"
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)"
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)"


In [None]:
# you can also get the data by using the read_clipboard method (cannot be used in google colab)

# 1.open the website and copy the table
import webbrowser
website = 'https://en.wikipedia.org/wiki/List_of_highest_points_of_Asian_countries'
webbrowser.open(website)

# 2.read the copied data
highest_points = pd.read_clipboard()
highest_points

In [None]:
# We can grab the column names with .columns
highest_points.columns

Index(['Rank', 'Country', 'Highest point', 'Elevation'], dtype='object')

In [None]:
# Lets see some specific data columns
DataFrame(highest_points,columns=['Rank','Highest point'])

Unnamed: 0,Rank,Highest point
0,6.0,Noshaq
1,,Aragats
2,,Mount Bazardüzü
3,44.0,Mountain of Smoke
4,40.0,Saka Haphong
5,4.0,Gangkhar Puensum
6,37.0,Pagon Hill
7,38.0,Phnom Aural
8,1.0,Mount Everest[1]
9,,Mount Olympus


In [None]:
DataFrame(highest_points,columns=['Rank','Country', 'State']) # if a column is missing pandas returns null values for it

Unnamed: 0,Rank,Country,State
0,6.0,Afghanistan,
1,,Armenia,
2,,Azerbaijan,
3,44.0,Bahrain,
4,40.0,Bangladesh,
5,4.0,Bhutan,
6,37.0,Brunei,
7,38.0,Cambodia,
8,1.0,China,
9,,Cyprus,


In [None]:
highest_points.Elevation # return specific column (as series)

0    7,492 m (24,580 ft)
1    4,090 m (13,419 ft)
2    4,466 m (14,652 ft)
3         122 m (400 ft)
4     1,052 m (3,451 ft)
5    7,570 m (24,836 ft)
6     1,850 m (6,070 ft)
7     1,810 m (5,938 ft)
8    8,848 m (29,029 ft)
9     1,951 m (6,401 ft)
Name: Elevation, dtype: object

In [None]:
# Or try this method for multiple word columns
highest_points['Highest point']

0               Noshaq
1              Aragats
2      Mount Bazardüzü
3    Mountain of Smoke
4         Saka Haphong
5     Gangkhar Puensum
6           Pagon Hill
7          Phnom Aural
8     Mount Everest[1]
9        Mount Olympus
Name: Highest point, dtype: object

In [None]:
highest_points.head(5) # get first 5 rows

Unnamed: 0,Rank,Country,Highest point,Elevation
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)"
1,,Armenia,Aragats,"4,090 m (13,419 ft)"
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)"
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)"


In [None]:
highest_points.tail(5) # get last 5 rows

Unnamed: 0,Rank,Country,Highest point,Elevation
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)"
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)"
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)"
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)"
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)"


In [None]:
highest_points.sample(5) #get 5 random rows from data

Unnamed: 0,Rank,Country,Highest point,Elevation
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)"
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)"
1,,Armenia,Aragats,"4,090 m (13,419 ft)"
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)"
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)


In [None]:
highest_points.describe() # get some stats for the number columns of data

Unnamed: 0,Rank
count,7.0
mean,24.285714
std,19.465476
min,1.0
25%,5.0
50%,37.0
75%,39.0
max,44.0


In [None]:
highest_points.dtypes # get data type of each column

Rank             float64
Country           object
Highest point     object
Elevation         object
dtype: object

In [None]:
highest_points.shape # get the dataframe dimensions (rows,columns)

(10, 5)

#### Inplace parameter

The inplace parameter specifies whether to change the object directly (True) or return a new modified copy (False).<br>
Let's see an example:<br>
The drop() method deletes a row/column.

In [None]:
example_df = DataFrame(np.arange(8).reshape(4,2), columns=['c1','c2']) # create a dataframe
example_df

Unnamed: 0,c1,c2
0,0,1
1,2,3
2,4,5
3,6,7


In [None]:
example_df.drop(1) # drop the second row

Unnamed: 0,c1,c2
0,0,1
2,4,5
3,6,7


In [None]:
example_df

Unnamed: 0,c1,c2
0,0,1
1,2,3
2,4,5
3,6,7


Note that the original dataframe remained unchanged!<br>
If we want to modify the original dataframe, we need to use inplace=True

In [None]:
example_df.drop(1, inplace=True) # drop the second row

In [None]:
example_df

Unnamed: 0,c1,c2
0,0,1
2,4,5
3,6,7


#### Axis parameter

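The axis parameter specifies the axis along which an operation is performed. <br>
axis=0 -> the row axis (index): `drop` removes rows, and an aggregation such as `sum` runs down the rows, giving one result per column <br>
axis=1 -> the column axis: `drop` removes columns, and an aggregation such as `sum` runs across the columns, giving one result per row <br>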
Let's see some examples of using the axis parameter in the drop function and sum function:

In [None]:
example_df = DataFrame(np.arange(12).reshape(4,3), columns=['c1','c2','c3']) # create a dataframe
example_df

Unnamed: 0,c1,c2,c3
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [None]:
example_df.drop('c1', axis=1) # drop the c1 column (default- axis=0 -> drop rows)

Unnamed: 0,c2,c3
0,1,2
1,4,5
2,7,8
3,10,11


In [None]:
example_df

Unnamed: 0,c1,c2,c3
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [None]:
example_df.sum(axis=0) # sum row-wise (sum of each column)

c1    18
c2    22
c3    26
dtype: int64

In [None]:
example_df.sum(axis=1) # sum column-wise (sum of each row)

0     3
1    12
2    21
3    30
dtype: int64

#### Create new column

In [None]:
# create a column filled with a single value; a real boolean (not the string "True")
# keeps the column consistent with the boolean series assigned to it further down
highest_points['Top 5'] = True
highest_points

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highest_points['Top 5']="True" # create a column with value


Unnamed: 0,Rank,Country,Highest point,Elevation,Top 5
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)",True
1,,Armenia,Aragats,"4,090 m (13,419 ft)",True
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)",True
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft),True
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)",True
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)",True
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)",True
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)",True
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)",True
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)",True


In [None]:
highest_points["Top 5"] = np.arange(10) # changing the values of a column
highest_points

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highest_points["Top 5"] = np.arange(10) # changing the values of a column


Unnamed: 0,Rank,Country,Highest point,Elevation,Top 5
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)",0
1,,Armenia,Aragats,"4,090 m (13,419 ft)",1
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)",2
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft),3
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)",4
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)",5
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)",6
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)",7
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)",8
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)",9


In [None]:
# Adding a Series to a DataFrame
Top5 = Series([True]*5,index=[0,1,2,3,4])
highest_points['Top 5'] = Top5
highest_points

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highest_points['Top 5'] = Top5


Unnamed: 0,Rank,Country,Highest point,Elevation,Top 5
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)",True
1,,Armenia,Aragats,"4,090 m (13,419 ft)",True
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)",True
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft),True
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)",True
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)",
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)",
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)",
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)",
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)",


#### Delete column / row

In [None]:
del highest_points["Top 5"] # remove a column (changes the original df)
highest_points

Unnamed: 0,Rank,Country,Highest point,Elevation
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)"
1,,Armenia,Aragats,"4,090 m (13,419 ft)"
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)"
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)"
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)"
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)"
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)"
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)"
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)"


In [None]:
# we can also use the drop() method to remove a column (default- does not change the original df)
highest_points.drop("Country", axis=1)

Unnamed: 0,Rank,Highest point,Elevation
0,6.0,Noshaq,"7,492 m (24,580 ft)"
1,,Aragats,"4,090 m (13,419 ft)"
2,,Mount Bazardüzü,"4,466 m (14,652 ft)"
3,44.0,Mountain of Smoke,122 m (400 ft)
4,40.0,Saka Haphong,"1,052 m (3,451 ft)"
5,4.0,Gangkhar Puensum,"7,570 m (24,836 ft)"
6,37.0,Pagon Hill,"1,850 m (6,070 ft)"
7,38.0,Phnom Aural,"1,810 m (5,938 ft)"
8,1.0,Mount Everest[1],"8,848 m (29,029 ft)"
9,,Mount Olympus,"1,951 m (6,401 ft)"


In [None]:
highest_points

Unnamed: 0,Rank,Country,Highest point,Elevation
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)"
1,,Armenia,Aragats,"4,090 m (13,419 ft)"
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)"
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)"
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)"
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)"
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)"
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)"
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)"


In [None]:
highest_points.drop(0) # remove first row

Unnamed: 0,Rank,Country,Highest point,Elevation
1,,Armenia,Aragats,"4,090 m (13,419 ft)"
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)"
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)"
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)"
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)"
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)"
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)"
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)"


#### Rename columns and rows

In [None]:
# 3x4 dataframe holding the numbers 0..11, with labelled rows and columns
data = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['CA', 'UK', 'IT'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
CA,0,1,2,3
UK,4,5,6,7
IT,8,9,10,11


In [None]:
data.rename(index={'CA': 'ES'}, columns={'three': '3'}) # replace specific indexes and columns

Unnamed: 0,one,two,3,four
ES,0,1,2,3
UK,4,5,6,7
IT,8,9,10,11


In [None]:
data # we can see that the original dataframe is unchanged

Unnamed: 0,one,two,three,four
CA,0,1,2,3
UK,4,5,6,7
IT,8,9,10,11


In [None]:
data.rename(index={'CA': 'ES'}, inplace=True) # use the inplace parameter to save changes
data

Unnamed: 0,one,two,three,four
ES,0,1,2,3
UK,4,5,6,7
IT,8,9,10,11


In [None]:
data.rename(index=str.title, columns=str.upper) # convert indexes of rows and columns based on given functions

Unnamed: 0,ONE,TWO,THREE,FOUR
Es,0,1,2,3
Uk,4,5,6,7
It,8,9,10,11


In [None]:
# instead of using the "index" and "column" parameters- we can use the "axis" parameter
data.rename({'one':'1','two':'2','three': '3','four':'4'}, axis=1) # rename columns

Unnamed: 0,1,2,3,4
ES,0,1,2,3
UK,4,5,6,7
IT,8,9,10,11


In [None]:
data.rename({'ES':'e','UK':'u','IT': 'i'}, axis=0) # rename index

Unnamed: 0,one,two,three,four
e,0,1,2,3
u,4,5,6,7
i,8,9,10,11


#### Create dataframe from dictionary

In [None]:
# create a dataframe from a dictionary of equal-length lists:
# each key becomes a column name and each list supplies that column's values
raw_data = {
    'Country': ['Cayman Islands', 'Tunisia', 'Liberia', 'Guam'],
    'Code': [345, 216, 231, 671],
}

calling_codes = DataFrame(data=raw_data)
calling_codes

Unnamed: 0,Country,Code
0,Cayman Islands,345
1,Tunisia,216
2,Liberia,231
3,Guam,671


In [None]:
# For the full list of ways to create DataFrames from various sources, see the pandas documentation:
import webbrowser  # imported here so this cell also works when the optional clipboard cell was skipped

website = 'https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html'
webbrowser.open(website)

## Working with Series and DataFrames

### Indexing

In [None]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

#### Series indexing

In [None]:
idx_series = Series(np.array([10,20,30,40]),index=['R1','R2','R3', 'R4']) # create series
idx_series

R1    10
R2    20
R3    30
R4    40
dtype: int64

In [None]:
idx_series['R1'] # get value by index name

10

In [None]:
idx_series.iloc[0] # get value by position (.iloc is explicit; a plain integer key on a label index is deprecated)

10

In [None]:
idx_series[0:3] # get value by index range

R1    10
R2    20
R3    30
dtype: int64

In [None]:
idx_series.iloc[[0, 2]] # get values by list of positions (.iloc is explicit; integer keys on a label index are deprecated)

R1    10
R3    30
dtype: int64

In [None]:
idx_series[['R1','R2']] # get value by list of index names

R1    10
R2    20
dtype: int64

#### DataFrame indexing

In [None]:
idx_df = DataFrame(np.arange(25).reshape((5,5)),index=['IT','CA','ES','UK','MY'],columns=['C1','C2','C3','C4','C5']) # create dataframe
idx_df

Unnamed: 0,C1,C2,C3,C4,C5
IT,0,1,2,3,4
CA,5,6,7,8,9
ES,10,11,12,13,14
UK,15,16,17,18,19
MY,20,21,22,23,24


In [None]:
idx_df['C5'] # get a column

IT     4
CA     9
ES    14
UK    19
MY    24
Name: C5, dtype: int64

In [None]:
idx_df[['C5']] # get a column as df and not as a series

Unnamed: 0,C5
IT,4
CA,9
ES,14
UK,19
MY,24


In [None]:
idx_df[['C1','C3']] # get multiple columns (by list of column names)

Unnamed: 0,C1,C3
IT,0,2
CA,5,7
ES,10,12
UK,15,17
MY,20,22


Use iloc and loc to get specific rows and columns from a dataframe.<br>
iloc- integer indexing (position) <br>
loc- label indexing (names)

In [None]:
highest_points

Unnamed: 0,Rank,Country,Highest point,Elevation
0,6.0,Afghanistan,Noshaq,"7,492 m (24,580 ft)"
1,,Armenia,Aragats,"4,090 m (13,419 ft)"
2,,Azerbaijan,Mount Bazardüzü,"4,466 m (14,652 ft)"
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)"
5,4.0,Bhutan,Gangkhar Puensum,"7,570 m (24,836 ft)"
6,37.0,Brunei,Pagon Hill,"1,850 m (6,070 ft)"
7,38.0,Cambodia,Phnom Aural,"1,810 m (5,938 ft)"
8,1.0,China,Mount Everest[1],"8,848 m (29,029 ft)"
9,,Cyprus,Mount Olympus,"1,951 m (6,401 ft)"


In [None]:
highest_points.iloc[3] # get a single row by index position

Rank                          44.0
Country                    Bahrain
Highest point    Mountain of Smoke
Elevation           122 m (400 ft)
Name: 3, dtype: object

In [None]:
highest_points.iloc[3,:] # equivalent to the above example (':' = all columns)

Rank                          44.0
Country                    Bahrain
Highest point    Mountain of Smoke
Elevation           122 m (400 ft)
Name: 3, dtype: object

In [None]:
highest_points.iloc[[3]] # double brackets will return a dataframe instead of a series

Unnamed: 0,Rank,Country,Highest point,Elevation
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)


In [None]:
highest_points.iloc[3:5] # get multiple rows by range of index position

Unnamed: 0,Rank,Country,Highest point,Elevation
3,44.0,Bahrain,Mountain of Smoke,122 m (400 ft)
4,40.0,Bangladesh,Saka Haphong,"1,052 m (3,451 ft)"


In [None]:
highest_points.iloc[[0,9],:] # get multiple rows by list of index positions

In [None]:
highest_points.iloc[3,2] # get a single value [row,column]

'Mountain of Smoke'

In [None]:
highest_points.iloc[3:5,0:2] # get values of specific rows and columns

Unnamed: 0,Rank,Country
3,44.0,Bahrain
4,40.0,Bangladesh


In [None]:
highest_points.iloc[:,[2,3]] # get all rows and specific columns

Unnamed: 0,Highest point,Elevation
0,Noshaq,"7,492 m (24,580 ft)"
1,Aragats,"4,090 m (13,419 ft)"
2,Mount Bazardüzü,"4,466 m (14,652 ft)"
3,Mountain of Smoke,122 m (400 ft)
4,Saka Haphong,"1,052 m (3,451 ft)"
5,Gangkhar Puensum,"7,570 m (24,836 ft)"
6,Pagon Hill,"1,850 m (6,070 ft)"
7,Phnom Aural,"1,810 m (5,938 ft)"
8,Mount Everest[1],"8,848 m (29,029 ft)"
9,Mount Olympus,"1,951 m (6,401 ft)"


In [None]:
random_df = DataFrame(np.arange(25).reshape((5,5)),index=['IT','CA','ES','UK','MY'],columns=['C1','C2','C3','C4','C5']) # create dataframe
random_df

Unnamed: 0,C1,C2,C3,C4,C5
IT,0,1,2,3,4
CA,5,6,7,8,9
ES,10,11,12,13,14
UK,15,16,17,18,19
MY,20,21,22,23,24


In [None]:
random_df.loc['CA'] # get values by index name

C1    5
C2    6
C3    7
C4    8
C5    9
Name: CA, dtype: int64

In [None]:
random_df.loc[['CA','IT']] # get values for multiple index

Unnamed: 0,C1,C2,C3,C4,C5
CA,5,6,7,8,9
IT,0,1,2,3,4


In [None]:
random_df.loc['CA','C2'] # get a single value by row name and column name

6

In [None]:
random_df.loc[:,'C3'] # get all rows of a single column

IT     2
CA     7
ES    12
UK    17
MY    22
Name: C3, dtype: int64

In [None]:
random_df.loc[['ES','UK'],['C2','C3','C4']] # get multiple rows and columns

Unnamed: 0,C2,C3,C4
ES,11,12,13
UK,16,17,18


### Reindexing <br>
We can't change individual index values, but we can reindex our series/dataframe.

In [None]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn, randint # to get random numbers

In [None]:
sample_series = Series(['cat', 'bat', 'rat', 'mat'],index=['R1','R2','R3','R4']) # create a series
sample_series

R1    cat
R2    bat
R3    rat
R4    mat
dtype: object

In [None]:
sample_index = sample_series.index # get the indexes
sample_index

Index(['R1', 'R2', 'R3', 'R4'], dtype='object')

In [None]:
print(sample_index[0])
sample_index[0] = 'a1' # trying to rename an index will throw error as index are immutable

R1


TypeError: Index does not support mutable operations

#### Reindex series

In [None]:
# create series
series_test1 = Series(np.random.randint(1,10,4),index=['R1','R2','R3','R4'])
series_test1

R1    3
R2    5
R3    3
R4    2
dtype: int64

In [None]:
series_test1.reindex(['R4','R2','R1','R3']) # change index order

R4    2
R2    5
R1    3
R3    3
dtype: int64

In [None]:
# Call reindex to rearrange the data to a new index
series_test2 = series_test1.reindex(['R1','R2','R3','R4','R5','R6'])
series_test2

R1    3.0
R2    5.0
R3    3.0
R4    2.0
R5    NaN
R6    NaN
dtype: float64

#### Reindex dataframe

In [None]:
# create dataframe with random values
random_df = DataFrame(randn(25).reshape((5,5)),index=['R1','R2','R4','R5', 'R6'],columns=['col1','col2','col3','col4','col5'])
random_df

Unnamed: 0,col1,col2,col3,col4,col5
R1,-1.412056,0.727759,-1.57573,0.874166,-1.218184
R2,-1.223686,1.175178,-0.481563,1.160548,-1.124018
R4,-0.187984,-0.990435,0.527858,-1.007515,0.875331
R5,-0.903443,0.94477,-0.737501,0.593536,1.283798
R6,0.13162,2.310225,-0.606795,0.308304,0.860939


In [None]:
random_df2 = random_df.reindex(['R1','R2','R3','R4', 'R5', 'R6']) # Call reindex to rearrange the data to a new index
random_df2

Unnamed: 0,col1,col2,col3,col4,col5
R1,-1.412056,0.727759,-1.57573,0.874166,-1.218184
R2,-1.223686,1.175178,-0.481563,1.160548,-1.124018
R3,,,,,
R4,-0.187984,-0.990435,0.527858,-1.007515,0.875331
R5,-0.903443,0.94477,-0.737501,0.593536,1.283798
R6,0.13162,2.310225,-0.606795,0.308304,0.860939


In [None]:
new_columns = ['col1','col2','col3','col4','col5','col6'] # rearrange columns index
random_df2.reindex(new_columns, axis=1)

Unnamed: 0,col1,col2,col3,col4,col5,col6
R1,-1.412056,0.727759,-1.57573,0.874166,-1.218184,
R2,-1.223686,1.175178,-0.481563,1.160548,-1.124018,
R3,,,,,,
R4,-0.187984,-0.990435,0.527858,-1.007515,0.875331,
R5,-0.903443,0.94477,-0.737501,0.593536,1.283798,
R6,0.13162,2.310225,-0.606795,0.308304,0.860939,


The reset_index method resets the index of a DataFrame to the default integer-based index. <br>
It is often used after operations that change the structure of the DataFrame, such as grouping or filtering, which may leave it with a hierarchical or non-contiguous index.

In [None]:
# small dataframe with string row labels, used below to demonstrate reset_index
original_df = pd.DataFrame(
    data={'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
    index=['row1', 'row2', 'row3'],
)
original_df


Unnamed: 0,A,B,C
row1,1,4,7
row2,2,5,8
row3,3,6,9


In [None]:
original_df.reset_index() # Reset the index

Unnamed: 0,index,A,B,C
0,row1,1,4,7
1,row2,2,5,8
2,row3,3,6,9


In [None]:
original_df.reset_index(drop=True) # use the drop parameter to avoid the old index being added as a column

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


### Filtering

#### Filtering Series

In [None]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [None]:
random_series = Series(np.arange(4),index=['R1','R2','R3', 'R4']) # create series
random_series

R1    0
R2    1
R3    2
R4    3
dtype: int64

In [None]:
random_series[random_series >= 2] # get values that satisfy a condition

R3    2
R4    3
dtype: int64

In [None]:
random_series[(random_series >= 1) & (random_series < 4)] # get values that satisfy 2 conditions

R2    1
R3    2
dtype: int64

In [None]:
random_series[random_series > 2] = 10 # update values for rows that satisfy a condition
random_series

R1     0
R2     1
R3     2
R4    10
dtype: int64

In [None]:
cars_prod = Series([7000,3000,4000,5000,4000],index=['Ford','Lexus','Volvo','Chervolet','Kia']) # create series
cars_prod

Ford         7000
Lexus        3000
Volvo        4000
Chervolet    5000
Kia          4000
dtype: int64

In [None]:
cars_prod.where(cars_prod > 5000) # filter with where (returns nan for values that do not satisfy the condition)

Ford         7000.0
Lexus           NaN
Volvo           NaN
Chervolet       NaN
Kia             NaN
dtype: float64

In [None]:
cars_prod.where(cars_prod > 5000, 0) # filter with where and specifying a value to assign instead of nan

Ford         7000
Lexus           0
Volvo           0
Chervolet       0
Kia             0
dtype: int64

In [None]:
car_order = ['Chervolet','Ford','Kia','Lexus', 'Bentley'] # using list of keys to order and filter series

In [None]:
cars_prod2 = Series(cars_prod,index=car_order) # New series based on existing series
cars_prod2

Chervolet    5000.0
Ford         7000.0
Kia          4000.0
Lexus        3000.0
Bentley         NaN
dtype: float64

#### Filtering dataframe

In [None]:
random_df = DataFrame(np.arange(25).reshape((5,5)),index=['IT','CA','ES','UK','MY'],columns=['C1','C2','C3','C4','C5']) # create dataframe
random_df

Unnamed: 0,C1,C2,C3,C4,C5
IT,0,1,2,3,4
CA,5,6,7,8,9
ES,10,11,12,13,14
UK,15,16,17,18,19
MY,20,21,22,23,24


In [None]:
random_df[random_df['C2'] > 5] # get values by a condition

Unnamed: 0,C1,C2,C3,C4,C5
CA,5,6,7,8,9
ES,10,11,12,13,14
UK,15,16,17,18,19
MY,20,21,22,23,24


In [None]:
random_df[(random_df['C2'] > 5) & (random_df['C5'] < 20)] # get values by 2 conditions

Unnamed: 0,C1,C2,C3,C4,C5
CA,5,6,7,8,9
ES,10,11,12,13,14
UK,15,16,17,18,19


In [None]:
random_df % 2 == 0 # check which values satisfy a given condition

Unnamed: 0,C1,C2,C3,C4,C5
IT,True,False,True,False,True
CA,False,True,False,True,False
ES,True,False,True,False,True
UK,False,True,False,True,False
MY,True,False,True,False,True


In [None]:
random_df.where(random_df % 2 == 0, -1) # filtering with where

Unnamed: 0,C1,C2,C3,C4,C5
IT,0,-1,2,-1,4
CA,-1,6,-1,8,-1
ES,10,-1,12,-1,14
UK,-1,16,-1,18,-1
MY,20,-1,22,-1,24


### Sorting and ranking

In [None]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd
from numpy.random import randn

#### Sort and rank Series

In [None]:
pseries = Series([9, 3, 7,1],index=['P1','P3','P0','P2']) # create series
pseries

P1    9
P3    3
P0    7
P2    1
dtype: int64

In [None]:
pseries.sort_index() # sort by index- using built-in function

P0    7
P1    9
P2    1
P3    3
dtype: int64

In [None]:
pseries.sort_index(ascending=False) # sort by index in descending order

P3    3
P2    1
P1    9
P0    7
dtype: int64

In [None]:
pseries # note the original series kept unchanged

P1    9
P3    3
P0    7
P2    1
dtype: int64

In [None]:
pseries.sort_index(ascending=False, inplace=True) # use the inplace parameter to modify the original series
pseries

P3    3
P2    1
P1    9
P0    7
dtype: int64

In [None]:
pseries.sort_values() # order by value

P2    1
P3    3
P0    7
P1    9
dtype: int64

In [None]:
pseries.sort_values(ascending=False) # order by value - descending

P1    9
P0    7
P3    3
P2    1
dtype: int64

In [None]:
rand_series = Series(randn(10)) # create series
rand_series

0    0.157500
1   -1.206131
2    1.380485
3   -0.123302
4   -1.266532
5    0.882462
6    0.399715
7   -1.788011
8    0.343154
9    0.026532
dtype: float64

In [None]:
rand_series.rank() # assign rank to each value based on low to high value

0     6.0
1     3.0
2    10.0
3     4.0
4     2.0
5     9.0
6     8.0
7     1.0
8     7.0
9     5.0
dtype: float64

In [None]:
sorted_series = rand_series.sort_values() # assign sorted series to a new variable (note that the original series is unchanged)
sorted_series

7   -1.788011
4   -1.266532
1   -1.206131
3   -0.123302
9    0.026532
0    0.157500
8    0.343154
6    0.399715
5    0.882462
2    1.380485
dtype: float64

In [None]:
sorted_series.rank() # rank sorted series

7     1.0
4     2.0
1     3.0
3     4.0
9     5.0
0     6.0
8     7.0
6     8.0
5     9.0
2    10.0
dtype: float64

In [None]:
sorted_series.rank(ascending=False) # rank descending

7    10.0
4     9.0
1     8.0
3     7.0
9     6.0
0     5.0
8     4.0
6     3.0
5     2.0
2     1.0
dtype: float64

#### Sort and rank DataFrames

In [None]:
# dataframe whose row labels and column names are deliberately out of order,
# so that the sorting examples below have something to rearrange
df = pd.DataFrame(
    data={'B': [4, 2, 3], 'A': [1, 6, 1], 'C': [5, 8, 1]},
    index=['row2', 'row1', 'row3'],
)
df

Unnamed: 0,B,A,C
row2,4,1,5
row1,2,6,8
row3,3,1,1


In [None]:
sorted_rows = df.sort_index() # sort rows
sorted_rows

Unnamed: 0,B,A,C
row1,2,6,8
row2,4,1,5
row3,3,1,1


In [None]:
df.sort_index(axis=1, inplace=True) # sort columns (axis=1), change original df
df

Unnamed: 0,A,B,C
row2,1,4,5
row1,6,2,8
row3,1,3,1


In [None]:
sorted_columns_desc = df.sort_index(axis=1, ascending=False) # sort columns in descending order
sorted_columns_desc

Unnamed: 0,C,B,A
row2,5,4,1
row1,8,2,6
row3,1,3,1


In [None]:
df.sort_values(by='A') # sort rows by a single column

Unnamed: 0,A,B,C
row2,1,4,5
row3,1,3,1
row1,6,2,8


In [None]:
df.sort_values(by=['A','B']) # sort rows by multiple column

Unnamed: 0,A,B,C
row3,1,3,1
row2,1,4,5
row1,6,2,8


In [None]:
df.sort_values(by='C', ascending=False) # sort rows by a single column- descending

Unnamed: 0,A,B,C
row1,6,2,8
row2,1,4,5
row3,1,3,1


In [None]:
df

Unnamed: 0,A,B,C
row2,1,4,5
row1,6,2,8
row3,1,3,1


In [None]:
df.sort_values(by='row1',axis=1) # sort columns by a single row

Unnamed: 0,B,A,C
row2,4,1,5
row1,2,6,8
row3,3,1,1


In [None]:
df.sort_values(by='row1',ascending=False, axis=1) # sort columns by a single row- descending

Unnamed: 0,C,A,B
row2,5,1,4
row1,8,6,2
row3,1,1,3


In [None]:
df

Unnamed: 0,A,B,C
row2,1,4,5
row1,6,2,8
row3,1,3,1


In [None]:
df.rank() # rank values in every column

Unnamed: 0,A,B,C
row2,1.5,3.0,2.0
row1,3.0,1.0,3.0
row3,1.5,2.0,1.0


In [None]:
df.rank(axis=1) # rank values in every row

Unnamed: 0,A,B,C
row2,1.0,2.0,3.0
row1,2.0,1.0,3.0
row3,1.5,3.0,1.5


In [None]:
df.rank(method='min') # the method parameter defines how to deal with ties (default- average)

Unnamed: 0,A,B,C
row2,1.0,3.0,2.0
row1,3.0,1.0,3.0
row3,1.0,2.0,1.0


In [None]:
df.rank(method='first') # the method parameter defines how to deal with ties

Unnamed: 0,A,B,C
row2,1.0,3.0,2.0
row1,3.0,1.0,3.0
row3,2.0,2.0,1.0


### Aggregation functions

In [None]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [None]:
series1 = Series([1,3,2,4,1,3,2,1]) # create series
series1

0    1
1    3
2    2
3    4
4    1
5    3
6    2
7    1
dtype: int64

In [None]:
# 2x3 float array with one missing value (NaN) in the second row
ndarr = np.array([[20, 30, 10], [np.nan, 50, 40]])
# wrap the array in a dataframe, labelling its rows and columns
df1 = DataFrame(
    data=ndarr,
    index=['P1', 'P2'],
    columns=['C1', 'C2', 'C3'],
)
df1

Unnamed: 0,C1,C2,C3
P1,20.0,30.0,10.0
P2,,50.0,40.0


In [None]:
series1.sum() # sum all values in series

17

In [None]:
df1.sum() # sum all values in df row-wise

C1    20.0
C2    80.0
C3    50.0
dtype: float64

In [None]:
df1.sum(axis=1) # sum all values in df column-wise

P1    60.0
P2    90.0
dtype: float64

In [None]:
series1.mean() # average value of series

2.125

In [None]:
df1.mean(axis=1) # average value of df, column wise

P1    20.0
P2    45.0
dtype: float64

In [None]:
series1.min() # min value of series

1

In [None]:
df1.max() # max value of df, row-wise

C1    20.0
C2    50.0
C3    40.0
dtype: float64

In [None]:
series1.count() # get number of items in series

8

In [None]:
df1.count() # number of non-null values in each column

C1    1
C2    2
C3    2
dtype: int64

In [None]:
df1.count(axis=1) # number of non-null values in each row

P1    3
P2    2
dtype: int64

In [None]:
series1.value_counts() # get frequency of each value in series

1    3
3    2
2    2
4    1
dtype: int64

In [None]:
df1['C2'].value_counts() # get frequency of each value in a dataframe column

30.0    1
50.0    1
Name: C2, dtype: int64

In [None]:
series1.idxmax() # index of the maximal value in series

3

In [None]:
df1.idxmin() # index of the minimal value in dataframe, row-wise

C1    P1
C2    P1
C3    P1
dtype: object

In [None]:
series1.unique() # get unique values of series

array([1, 3, 2, 4])

In [None]:
series1.cumsum() # cumulative sum of series

0     1
1     4
2     6
3    10
4    11
5    14
6    16
7    17
dtype: int64

In [None]:
df1.cumsum(axis=1) # cumulative sum of a dataframe, column-wise

Unnamed: 0,C1,C2,C3
P1,20.0,50.0,60.0
P2,,50.0,90.0


The apply function is used to perform an operation on each value of a series or each row/column of a dataframe.<br>
You can apply built-in functions or custom functions (def, lambda).

In [None]:
series2 = Series(np.array([1,4,9,16,15]))
series2

0     1
1     4
2     9
3    16
4    15
dtype: int64

In [None]:
series2.apply(np.sqrt) # apply built-in function to every element

0    1.000000
1    2.000000
2    3.000000
3    4.000000
4    3.872983
dtype: float64

In [None]:
def double(x):
    """Return x multiplied by 2."""
    doubled = x * 2
    return doubled

series2.apply(double) # apply custom function

0     2
1     8
2    18
3    32
4    30
dtype: int64

In [None]:
series2.apply(lambda x: x ** 2) # apply lambda function

0      1
1     16
2     81
3    256
4    225
dtype: int64

In [None]:
# dataframe with two columns of 10 random integers each, drawn from 1..4 (the upper bound 5 is exclusive)
df2 = pd.DataFrame(
    data={
        'col1': np.random.randint(low=1, high=5, size=10),
        'col2': np.random.randint(low=1, high=5, size=10),
    }
)
df2

Unnamed: 0,col1,col2
0,1,1
1,2,4
2,2,4
3,4,2
4,1,3
5,4,4
6,3,1
7,2,3
8,3,4
9,4,4


In [None]:
sum_col = df2.apply(np.sum) # apply built-in function on columns
sum_col

col1    26
col2    30
dtype: int64

In [None]:
# NOTE(review): despite its name, sum_row holds the per-row mean (np.mean), not a sum
sum_row = df2.apply(np.mean, axis=1) # apply built-in function on rows
sum_row

0    1.0
1    3.0
2    3.0
3    3.0
4    2.0
5    4.0
6    2.0
7    2.5
8    3.5
9    4.0
dtype: float64

In [None]:
column_value_counts = df2.apply(lambda col: col.value_counts()) # apply lambda function on each column
column_value_counts

Unnamed: 0,col1,col2
1,2,2
2,3,1
3,2,2
4,3,5
