In [1]:
# A pandas data frame can be pictured as a dict of columns:
# each key is a column name and each list holds that column's values.
# NOTE(review): 'new_tests' has 4 entries while 'date' and 'new_cases' have 2;
# columns of unequal length cannot be turned into a DataFrame as-is — confirm the intended values.
covid_d = dict(
    date=['2020-01-24', '2023-05-27'],
    new_cases=[1444, 3554],
    new_tests=[53541, 425683, None, None],
)

In [2]:
# Echo the dictionary so the notebook displays its contents.
covid_d

{'date': ['2020-01-24', '2023-05-27'],
 'new_cases': [1444, 3554],
 'new_tests': [53541, 425683, None, None]}

In [3]:
# The container itself is a plain Python dict.
type(covid_d)

dict

In [4]:
# Each value stored under a key is a plain Python list.
type(covid_d['date'])

list

In [5]:
# The list containing None entries is still an ordinary list.
type(covid_d['new_tests'])

list

#### Importing a CSV file using Pandas
hint: similar but not the same as numpy

numpy - `np.genfromtxt(filename, delimiter=',', skip_header=1)`   
- creates NumPy arrays from tabular data

pandas - `pd.read_csv(filename)`   
- reads data from a CSV file into a Pandas `DataFrame` object



`urlretrieve(url, filename)` - downloads the file at `url` and saves it locally under `filename` (here, the CSV data)

In [2]:
import pandas as pd
from urllib.request import urlretrieve


Summary of the functions & methods we looked at in this section:


* `pd.read_csv` - Read data from a CSV file into a Pandas `DataFrame` object
* `.info()` - View basic information about rows, columns & data types
* `.describe()` - View statistical information about numeric columns
* `.columns` - Get the list of column names
* `.shape` - Get the number of rows & columns as a tuple


- `covid_df['new_cases']` - Retrieving columns as a `Series` using the column name
- `new_cases[243]` - Retrieving values from a `Series` using an index
- `covid_df.at[243, 'new_cases']` - Retrieving a single value from a data frame
- `covid_df.copy()` - Creating a deep copy of a data frame
- `covid_df.loc[243]` - Retrieving a row or range of rows of data from the data frame
- `head`, `tail`, and `sample` - Retrieving multiple rows of data from the data frame
- `covid_df.new_tests.first_valid_index()` - Finding the index of the first non-empty (non-NaN) value in a series


In [3]:
# Raw GitHub gist URL of the Italy day-wise COVID-19 CSV file.
italy_covid_url = 'https://gist.githubusercontent.com/aakashns/f6a004fa20c84fec53262f9a8bfee775/raw/f309558b1cf5103424cef58e2ecb8704dcd4d74c/italy-covid-daywise.csv'

# Download the file and save it under the given local name;
# urlretrieve returns a (local filename, headers) tuple.
urlretrieve(italy_covid_url, "italy_covid_daywise.csv")

('italy_covid_daywise.csv', <http.client.HTTPMessage at 0x1d8160a3940>)

In [4]:
# Read the downloaded CSV file and store its contents in a DataFrame object.
# A DataFrame is the core data structure in pandas for storing and working with tabular data.
covid_df = pd.read_csv('italy_covid_daywise.csv')

In [5]:
# Confirm that read_csv produced a pandas DataFrame.
type(covid_df)

pandas.core.frame.DataFrame

In [6]:
# Basic info about the data frame: number of rows, column names,
# non-null counts per column and the data type of each column.
covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        248 non-null    object 
 1   new_cases   248 non-null    float64
 2   new_deaths  248 non-null    float64
 3   new_tests   135 non-null    float64
dtypes: float64(3), object(1)
memory usage: 7.9+ KB


In [7]:
# Check the object's type again: it is a pandas DataFrame.
type(covid_df)

pandas.core.frame.DataFrame

In [8]:
covid_df.describe() # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,new_cases,new_deaths,new_tests
count,248.0,248.0,135.0
mean,1094.818548,143.133065,31699.674074
std,1554.508002,227.105538,11622.209757
min,-148.0,-31.0,7841.0
25%,123.0,3.0,25259.0
50%,342.0,17.0,29545.0
75%,1371.75,175.25,37711.0
max,6557.0,971.0,95273.0


In [9]:
covid_df.columns # Index object holding the column names

Index(['date', 'new_cases', 'new_deaths', 'new_tests'], dtype='object')

In [10]:
# .shape is an attribute holding a (rows, columns) tuple, not a method,
# so it is read without parentheses; covid_df.shape() is wrong.

covid_df.shape

(248, 4)

In [11]:
# Access a column as an attribute of the data frame;
# this is the same as
# covid_df['new_tests']
covid_df.new_tests

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
        ...   
243    53541.0
244    42583.0
245    54395.0
246        NaN
247        NaN
Name: new_tests, Length: 248, dtype: float64

- Each column is represented using the `Series` data structure

pandas.core.series.Series

In [12]:
# A single column of the data frame is a pandas Series.
type(covid_df.new_cases)

pandas.core.series.Series

#### To retrieve the element at a specific row and column directly

In [13]:
# Select the 'new_cases' column (a Series), then index it with row label 234.
covid_df['new_cases'][234]

840.0

In [14]:
# df.at[row, col] retrieves a single value by row label and column name.
# .at is indexed with square brackets; covid_df.at(234 , 'new_cases') is wrong.

covid_df.at[234 , 'new_cases']

840.0

In [15]:
# Select a subset of columns into a new data frame:
# new_df = old_df[['col1','col2',...]]
# NOTE(review): selecting with a list of column names produces a new DataFrame
# (a copy), not a view of the original — confirm against the pandas indexing docs.

covid_copy = covid_df[['date','new_tests']]
covid_copy

Unnamed: 0,date,new_tests
0,2019-12-31,
1,2020-01-01,
2,2020-01-02,
3,2020-01-03,
4,2020-01-04,
...,...,...
243,2020-08-30,53541.0
244,2020-08-31,42583.0
245,2020-09-01,54395.0
246,2020-09-02,


In [16]:
# Use the .copy() method to duplicate a data frame into a separate object,
# then display the copy.
new_df = covid_df.copy()
new_df



Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,
...,...,...,...,...
243,2020-08-30,1444.0,1.0,53541.0
244,2020-08-31,1365.0,4.0,42583.0
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,


#### To access a specific row of data
Use `.loc[row_label]`, e.g. `covid_df.loc[244]`.

Each retrieved row is also a `Series` object.


In [17]:
# Retrieve the row whose index label is 244.
covid_df.loc[244]

date          2020-08-31
new_cases           1365
new_deaths             4
new_tests          42583
Name: 244, dtype: object

In [18]:
# A single row retrieved with .loc is also a Series.
type(covid_df.loc[244])

pandas.core.series.Series

In [19]:
# First 2 rows of the data frame.
covid_df.head(2)

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,


In [20]:
# Last 3 rows of the data frame.
covid_df.tail(3)

Unnamed: 0,date,new_cases,new_deaths,new_tests
245,2020-09-01,996.0,6.0,54395.0
246,2020-09-02,975.0,8.0,
247,2020-09-03,1326.0,6.0,


In [21]:
# Inspect the type of an individual value taken from the 'new_tests' column.
type(covid_df.at[234,"new_tests"])

numpy.float64

In [22]:
# The first row has no test count recorded, so the missing value shows up as nan.
covid_df.at[0,"new_tests"]

nan

In [23]:
# Plain slicing on the data frame selects a range of rows; the end (9) is excluded.
covid_df[2:9]

Unnamed: 0,date,new_cases,new_deaths,new_tests
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,
5,2020-01-05,0.0,0.0,
6,2020-01-06,0.0,0.0,
7,2020-01-07,0.0,0.0,
8,2020-01-08,0.0,0.0,


In [24]:
# .loc slices by index label and includes both endpoints (rows 2 through 5).
covid_df.loc[2:5]

Unnamed: 0,date,new_cases,new_deaths,new_tests
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,
5,2020-01-05,0.0,0.0,


In [25]:
# Retrieve a random sample of 3 rows.
# NOTE(review): the rows picked will differ between runs unless a random_state
# is passed — consider pinning it if the output needs to be reproducible.
covid_df.sample(3)

Unnamed: 0,date,new_cases,new_deaths,new_tests
155,2020-06-03,318.0,55.0,20035.0
180,2020-06-28,175.0,8.0,21183.0
204,2020-07-22,128.0,15.0,29288.0


In [26]:
# Summary statistics for the numeric columns (count, mean, std, min, quartiles, max).
covid_df.describe()



Unnamed: 0,new_cases,new_deaths,new_tests
count,248.0,248.0,135.0
mean,1094.818548,143.133065,31699.674074
std,1554.508002,227.105538,11622.209757
min,-148.0,-31.0,7841.0
25%,123.0,3.0,25259.0
50%,342.0,17.0,29545.0
75%,1371.75,175.25,37711.0
max,6557.0,971.0,95273.0


In [30]:
# Total number of reported cases and deaths over the whole period.
# Each daily column is summed under a name that matches what it holds
# (previously `total_cases` was assigned the sum of the deaths column).
total_cases = covid_df.new_cases.sum()
total_deaths = covid_df.new_deaths.sum()
# Display the death total.
total_deaths

35497.0

In [31]:
# print('{:.2f}% of tests in Italy led to a positive diagnosis.'.format(positive_rate*100))

# To print only two digits after the decimal point, use the format spec {:.2f}


#### Querying and sorting rows


- `covid_df.new_cases.sum()` - Computing the sum of values in a column or series
- `covid_df[covid_df.new_cases > 1000]` - Querying a subset of rows satisfying the chosen criteria using boolean expressions
- `df['pos_rate'] = df.new_cases/df.new_tests` - Adding new columns by combining data from existing columns
- `covid_df.drop(columns=['positive_rate'])` - Removing one or more columns from the data frame
- `sort_values` - Sorting the rows of a data frame using column values
- `covid_df.at[172, 'new_cases'] = ...` - Replacing a value within the data frame
