# Pandas Basics

In [1]:
import pandas as pd
import numpy as np

## What Is DataFrame? How To Create It?

In [2]:
data_dict = {
    'car_name': ['pride', 'peykan', 'bmw', 'lexus', 'cerato', 'l90'],
    'price': [100, 50, 2000, 1600, 1200, 900],
    'quality': [-1, -2, 5, 4, 3, 2]
}

df = pd.DataFrame(data_dict)
df  # index is automatically generated

Unnamed: 0,car_name,price,quality
0,pride,100,-1
1,peykan,50,-2
2,bmw,2000,5
3,lexus,1600,4
4,cerato,1200,3
5,l90,900,2


In [5]:
car_names = ['pride', 'peykan', 'bmw', 'lexus', 'cerato', 'l90']
price = [100, 50, 2000, 1600, 1200, 900]
quality = [-1, -2, 5, 4, 3, 2]
# data_list = list(zip(car_names, price, quality))
data_list = list(zip(car_names, price, quality))
print(data_list)

df = pd.DataFrame(data_list)
df.head()  # auto-generated index and improper column names

[('pride', 100, -1), ('peykan', 50, -2), ('bmw', 2000, 5), ('lexus', 1600, 4), ('cerato', 1200, 3), ('l90', 900, 2)]


Unnamed: 0,0,1,2
0,pride,100,-1
1,peykan,50,-2
2,bmw,2000,5
3,lexus,1600,4
4,cerato,1200,3


In [12]:
df = pd.DataFrame(data_list, columns=['car_name', 'price', 'quality'])
df = df.set_index('car_name')
df.head(3)

Unnamed: 0_level_0,price,quality
car_name,Unnamed: 1_level_1,Unnamed: 2_level_1
pride,100,-1
peykan,50,-2
bmw,2000,5


## Pandas Vs. Numpy

In [14]:
price_np = np.array(price)
print(price_np)     # No index

df['price']         # Each element has an index which is the name of a car
                    # This makes it much easier to work with data.

[ 100   50 2000 1600 1200  900]


car_name
pride      100
peykan      50
bmw       2000
lexus     1600
cerato    1200
l90        900
Name: price, dtype: int64

## Loading A Dataset

In [15]:
df = pd.read_excel('covid.xlsx')        # read_csv() for csv files
df.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,2020-12-14,14,12,2020,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.013779
1,2020-12-13,13,12,2020,298,9,Afghanistan,AF,AFG,38041757.0,Asia,7.052776
2,2020-12-12,12,12,2020,113,11,Afghanistan,AF,AFG,38041757.0,Asia,6.868768
3,2020-12-11,11,12,2020,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.134266
4,2020-12-10,10,12,2020,202,16,Afghanistan,AF,AFG,38041757.0,Asia,6.968658


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61900 entries, 0 to 61899
Data columns (total 12 columns):
 #   Column                                                      Non-Null Count  Dtype         
---  ------                                                      --------------  -----         
 0   dateRep                                                     61900 non-null  datetime64[ns]
 1   day                                                         61900 non-null  int64         
 2   month                                                       61900 non-null  int64         
 3   year                                                        61900 non-null  int64         
 4   cases                                                       61900 non-null  int64         
 5   deaths                                                      61900 non-null  int64         
 6   countriesAndTerritories                                     61900 non-null  object        
 7   geoId                 

## Selecting By Columns

In [23]:
# df.countriesAndTerritories
df['countriesAndTerritories']

0        Afghanistan
1        Afghanistan
2        Afghanistan
3        Afghanistan
4        Afghanistan
            ...     
61895       Zimbabwe
61896       Zimbabwe
61897       Zimbabwe
61898       Zimbabwe
61899       Zimbabwe
Name: countriesAndTerritories, Length: 61900, dtype: object

In [22]:
# selecting multiple columns
df_countries_cases = df[['countriesAndTerritories', 'cases', 'dateRep']]
df_countries_cases.tail()

Unnamed: 0,countriesAndTerritories,cases,dateRep
61895,Zimbabwe,0,2020-03-25
61896,Zimbabwe,0,2020-03-24
61897,Zimbabwe,0,2020-03-23
61898,Zimbabwe,1,2020-03-22
61899,Zimbabwe,1,2020-03-21


In [62]:
# selecting based on condition
df_iran = df[(df.countriesAndTerritories == 'Iran')]
print(df.countriesAndTerritories == 'Iran')
print('-----------------------------------')
df_iran

0        False
1        False
2        False
3        False
4        False
         ...  
61895    False
61896    False
61897    False
61898    False
61899    False
Name: countriesAndTerritories, Length: 61900, dtype: bool
-----------------------------------


Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
27437,2020-12-14,14,12,2020,7451,247,Iran,IR,IRN,82913893.0,Asia,192.392365
27438,2020-12-13,13,12,2020,8201,221,Iran,IR,IRN,82913893.0,Asia,199.024547
27439,2020-12-12,12,12,2020,9594,232,Iran,IR,IRN,82913893.0,Asia,205.297320
27440,2020-12-11,11,12,2020,10403,284,Iran,IR,IRN,82913893.0,Asia,210.672776
27441,2020-12-10,10,12,2020,10223,295,Iran,IR,IRN,82913893.0,Asia,214.963975
...,...,...,...,...,...,...,...,...,...,...,...,...
27782,2020-01-04,4,1,2020,0,0,Iran,IR,IRN,82913893.0,Asia,
27783,2020-01-03,3,1,2020,0,0,Iran,IR,IRN,82913893.0,Asia,
27784,2020-01-02,2,1,2020,0,0,Iran,IR,IRN,82913893.0,Asia,
27785,2020-01-01,1,1,2020,0,0,Iran,IR,IRN,82913893.0,Asia,


## Selecting By Index/Label

In [28]:
# syntax: df.loc[row selector, column selector]
df_iran.loc[27437:27441, ]    # or, df_iran.loc[27437:27441, :], or df_iran.loc[27437:27441, ]

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
27440,2020-12-11,11,12,2020,10403,284,Iran,IR,IRN,82913893.0,Asia,210.672776
27441,2020-12-10,10,12,2020,10223,295,Iran,IR,IRN,82913893.0,Asia,214.963975


In [33]:
df_iran.loc[27437:27441, 'dateRep':'deaths']       # in Pandas, both ends of ranges are included in selection

Unnamed: 0,dateRep,day,month,year,cases,deaths
27440,2020-12-11,11,12,2020,10403,284
27441,2020-12-10,10,12,2020,10223,295


In [36]:
# these indices are a bit hard to work with. we can use numerical index
print(df_iran.iloc[:5, 1:4])
# print(df_iran.iloc[:5, ['day', 'month', 'year']])      # iloc only accepts numerical indices!!

       day  month  year
27440   11     12  2020
27441   10     12  2020
27442    9     12  2020
27443    8     12  2020
27444    7     12  2020


## df.where() vs. Conditional Selection

In [37]:
# Notice the size of the resulting dataframe
df_iran[df_iran.cases > 3000]


Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
27440,2020-12-11,11,12,2020,10403,284,Iran,IR,IRN,82913893.0,Asia,210.672776
27441,2020-12-10,10,12,2020,10223,295,Iran,IR,IRN,82913893.0,Asia,214.963975
27442,2020-12-09,9,12,2020,11023,323,Iran,IR,IRN,82913893.0,Asia,235.878443
27443,2020-12-08,8,12,2020,10827,284,Iran,IR,IRN,82913893.0,Asia,222.583928
27444,2020-12-07,7,12,2020,11561,294,Iran,IR,IRN,82913893.0,Asia,224.553441
27445,2020-12-06,6,12,2020,12151,321,Iran,IR,IRN,82913893.0,Asia,226.352898
27446,2020-12-05,5,12,2020,13341,347,Iran,IR,IRN,82913893.0,Asia,227.293633
27447,2020-12-04,4,12,2020,13922,358,Iran,IR,IRN,82913893.0,Asia,243.143812
27448,2020-12-03,3,12,2020,13621,362,Iran,IR,IRN,82913893.0,Asia,226.352898
27449,2020-12-02,2,12,2020,13881,382,Iran,IR,IRN,82913893.0,Asia,226.111684


In [45]:
# df.where() preserves the size of dataframe. It fills every column of non-matching rows with 0.
df_iran.where(df_iran.cases > 13000, np.nan)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
27440,0,0,0,0,0,0,0,0,0,0.0,0,0.0
27441,0,0,0,0,0,0,0,0,0,0.0,0,0.0
27442,0,0,0,0,0,0,0,0,0,0.0,0,0.0
27443,0,0,0,0,0,0,0,0,0,0.0,0,0.0
27444,0,0,0,0,0,0,0,0,0,0.0,0,0.0
27445,0,0,0,0,0,0,0,0,0,0.0,0,0.0
27446,2020-12-05 00:00:00,5,12,2020,13341,347,Iran,IR,IRN,82913893.0,Asia,227.293633
27447,2020-12-04 00:00:00,4,12,2020,13922,358,Iran,IR,IRN,82913893.0,Asia,243.143812
27448,2020-12-03 00:00:00,3,12,2020,13621,362,Iran,IR,IRN,82913893.0,Asia,226.352898
27449,2020-12-02 00:00:00,2,12,2020,13881,382,Iran,IR,IRN,82913893.0,Asia,226.111684


## Adding A New Column

In [50]:
# calculating death/cases ratio
df_iran['ratio'] = df_iran.deaths / df_iran.cases
df_iran.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_iran['ratio'] = df_iran.deaths / df_iran.cases


Unnamed: 0,dateRep,ratio_percent,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000,ratio
27440,2020-12-11,2.729982,11,12,2020,10403,284,Iran,IR,IRN,82913893.0,Asia,210.672776,0.0273
27441,2020-12-10,2.88565,10,12,2020,10223,295,Iran,IR,IRN,82913893.0,Asia,214.963975,0.028857
27442,2020-12-09,2.930237,9,12,2020,11023,323,Iran,IR,IRN,82913893.0,Asia,235.878443,0.029302
27443,2020-12-08,2.623072,8,12,2020,10827,284,Iran,IR,IRN,82913893.0,Asia,222.583928,0.026231
27444,2020-12-07,2.543033,7,12,2020,11561,294,Iran,IR,IRN,82913893.0,Asia,224.553441,0.02543


In [51]:
# method 2
df_iran.insert(1, 'ratio_percent', df_iran.ratio * 100)
df_iran.head()

ValueError: cannot insert ratio_percent, already exists

## Deleting A Column

In [57]:
df_iran.drop(columns=['ratio', 'ratio_percent'])
df_iran.head()  # nothing is dropped

df_iran.drop(columns=['ratio', 'ratio_percent'], inplace=True)      # Notice the inplace argument
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,2020-12-14,14,12,2020,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.013779
1,2020-12-13,13,12,2020,298,9,Afghanistan,AF,AFG,38041757.0,Asia,7.052776
2,2020-12-12,12,12,2020,113,11,Afghanistan,AF,AFG,38041757.0,Asia,6.868768
3,2020-12-11,11,12,2020,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.134266
4,2020-12-10,10,12,2020,202,16,Afghanistan,AF,AFG,38041757.0,Asia,6.968658


## Renaming A Column

In [63]:
df_iran.rename(columns={
    'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000': 'cum_num_14_per_100000'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


## Adding A Row

In [55]:
df_iran.head()

Unnamed: 0,dateRep,ratio_percent,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,cum_num_14_per_100000,ratio
27440,2020-12-11,2.729982,11,12,2020,10403,284,Iran,IR,IRN,82913893.0,Asia,210.672776,0.0273
27441,2020-12-10,2.88565,10,12,2020,10223,295,Iran,IR,IRN,82913893.0,Asia,214.963975,0.028857
27442,2020-12-09,2.930237,9,12,2020,11023,323,Iran,IR,IRN,82913893.0,Asia,235.878443,0.029302
27443,2020-12-08,2.623072,8,12,2020,10827,284,Iran,IR,IRN,82913893.0,Asia,222.583928,0.026231
27444,2020-12-07,2.543033,7,12,2020,11561,294,Iran,IR,IRN,82913893.0,Asia,224.553441,0.02543


In [58]:
df_iran.loc[len(df)] = ['2020-12-14 00:00:00',	14,	12,	2020,	7451,	247,	'Iran',	'IR',	'IRN',	82913893.0,	'Asia',	192.392365]
df_iran.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,cum_num_14_per_100000
27468,2020-11-13 00:00:00,13,11,2020,11517,457,Iran,IR,IRN,82913893.0,Asia,156.359803
27469,2020-11-12 00:00:00,12,11,2020,11780,462,Iran,IR,IRN,82913893.0,Asia,152.471432
27470,2020-11-11 00:00:00,11,11,2020,10339,453,Iran,IR,IRN,82913893.0,Asia,146.494147
27471,2020-11-10 00:00:00,10,11,2020,10463,458,Iran,IR,IRN,82913893.0,Asia,142.428483
61900,2020-12-14 00:00:00,14,12,2020,7451,247,Iran,IR,IRN,82913893.0,Asia,192.392365


In [59]:
df_iran.loc[61900]

dateRep                    2020-12-14 00:00:00
day                                         14
month                                       12
year                                      2020
cases                                     7451
deaths                                     247
countriesAndTerritories                   Iran
geoId                                       IR
countryterritoryCode                       IRN
popData2019                        8.29139e+07
continentExp                              Asia
cum_num_14_per_100000                  192.392
Name: 61900, dtype: object

## Saving A DataFrame

In [64]:
df_iran.to_csv('iran_covid.csv', index=False)

## Dealing With Categorical Data

In [None]:
df.dtypes

dateRep                                                       datetime64[ns]
day                                                                    int64
month                                                                  int64
year                                                                   int64
cases                                                                  int64
deaths                                                                 int64
countriesAndTerritories                                               object
geoId                                                                 object
countryterritoryCode                                                  object
popData2019                                                          float64
continentExp                                                          object
Cumulative_number_for_14_days_of_COVID-19_cases_per_100000           float64
dtype: object

In [None]:
df.memory_usage(deep=True).sum()/1e6    # Megabytes

19.311064

In [None]:
df.countriesAndTerritories = df.countriesAndTerritories.astype('category')
df.geoId = df.geoId.astype('category')
df.countryterritoryCode = df.countryterritoryCode.astype('category')
df.continentExp = df.continentExp.astype('category')
print(df.dtypes)
df.memory_usage(deep=True).sum()/1e6    # How much less memory is being occupied!!!!

dateRep                                                       datetime64[ns]
day                                                                    int64
month                                                                  int64
year                                                                   int64
cases                                                                  int64
deaths                                                                 int64
countriesAndTerritories                                             category
geoId                                                               category
countryterritoryCode                                                category
popData2019                                                          float64
continentExp                                                        category
Cumulative_number_for_14_days_of_COVID-19_cases_per_100000           float64
dtype: object


4.460027