In [None]:
import pandas as pd

In [None]:
# load covid19.csv into a DataFrame and show its (rows, columns) shape
df = pd.read_csv(filepath_or_buffer='covid19.csv')
df.shape

In [None]:
# preview the first 5 rows of the table
df.head(n=5)

## Clean data

In [None]:
# count the null values in each column
df.isna().sum()

In [None]:
# drop every row that has at least one null value (axis='index' is the default, same as axis=0)
clean_df = df.dropna(axis='index')
clean_df.shape

In [None]:
# axis='columns' (same as axis=1) erases the columns that contain null data
col_clean_df = df.dropna(axis='columns')
col_clean_df.shape

In [None]:
# inplace=True makes dropna modify the original DataFrame itself
# instead of returning a new one
df.dropna(axis='columns', inplace=True)
df.shape

In [None]:
# the Province/State column was deleted by the in-place column drop
df.head(n=5)

In [None]:
# re-count the null values per column
df.isna().sum()

In [None]:
# small demo frame in which rows P001 and P003 hold identical values
data = dict(
    Product=['Milk', 'Tea', 'Milk'],
    Price=[500, 300, 500],
)

test_df = pd.DataFrame(data=data, index=['P001', 'P002', 'P003'])
test_df

In [None]:
# show test_df without repeated rows, keeping the first occurrence of each
test_df.drop_duplicates(keep='first')

## Filling null data

In [None]:
# demo frame with one missing Price (None) to practise filling nulls
data = dict(
    Product=['Milk', 'Tea', 'Milo', 'Bubble Tea'],
    Price=[500, None, 500, 1200],
)

tdf = pd.DataFrame(data=data)
tdf

In [None]:
# check each column for null values; there is a null value in the Price column
tdf.isna().sum()

In [None]:
# select the Price column; a single-bracket selection yields a Series (not a DataFrame),
# even though the variable name ends in _df
price_df = tdf['Price']
price_df

In [None]:
# describe() shows summary statistics for the Price column
price_df.describe()

In [None]:
# mean of the non-null prices; this is the value later passed to fillna
price_mean = price_df.mean(skipna=True)
price_mean

In [None]:
# Replace the missing prices with the mean and write the result back to tdf.
# Assigning the filled column explicitly (instead of calling
# fillna(inplace=True) on the extracted Series) updates tdf whether pandas
# handed out a view or a copy for tdf['Price'] (e.g. with Copy-on-Write enabled).
price_df = price_df.fillna(price_mean)
tdf['Price'] = price_df

In [None]:
# display tdf again to inspect the Price column after the fill
tdf

In [None]:
# Work on an independent copy: a plain `cv19df = df` only creates a second name
# for the same object, so columns added to cv19df later (e.g. 'Severity')
# would also appear on df.
cv19df = df.copy()
cv19df.head()

In [None]:
# summary statistics for cv19df
cv19df.describe()

In [None]:
# double brackets select a one-column DataFrame, so describe() covers only Confirmed
cv19df[['Confirmed']].describe()

In [None]:
# value_counts() on a Series counts how many times each distinct value occurs
country = cv19df['Country/Region'].value_counts()
country

In [None]:
# number of rows whose Country/Region is 'US'
country['US']

In [None]:
# boolean mask: True for the rows whose Country/Region is 'Australia'
is_australia = cv19df['Country/Region'] == 'Australia'
australia_df = cv19df[is_australia]
australia_df.isna().sum()

In [None]:
# show australia_df without the rows that contain nulls (the result is not assigned)
australia_df.dropna(axis='index')

In [None]:
# custom function
def situation(x, threshold=20):
    """Classify a value as 'bad' or 'normal'.

    Parameters
    ----------
    x : number
        Value to classify (the notebook passes each 'Confirmed' count).
    threshold : number, default 20
        Smallest value that is labelled 'bad'.

    Returns
    -------
    str
        'bad' when ``x >= threshold``, otherwise 'normal'. A value for which
        the comparison is False (NaN included) is labelled 'normal'.
    """
    return 'bad' if x >= threshold else 'normal'

# label every row by running situation() on its Confirmed value
severity = cv19df['Confirmed'].apply(situation)
cv19df['Severity'] = severity
cv19df.head(n=20)

In [53]:
# group the rows by Country/Region and show summary statistics for each group
cv19df.groupby(by=['Country/Region']).describe()

Unnamed: 0_level_0,SNo,SNo,SNo,SNo,SNo,SNo,SNo,SNo,Confirmed,Confirmed,...,Deaths,Deaths,Recovered,Recovered,Recovered,Recovered,Recovered,Recovered,Recovered,Recovered
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Country/Region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Azerbaijan,1.0,2664.000000,,2664.0,2664.00,2664.0,2664.00,2664.0,1.0,1.000000,...,0.00,0.0,1.0,0.000000,,0.0,0.0,0.0,0.0,0.0
"('St. Martin',)",1.0,4675.000000,,4675.0,4675.00,4675.0,4675.00,4675.0,1.0,2.000000,...,0.00,0.0,1.0,0.000000,,0.0,0.0,0.0,0.0,0.0
Afghanistan,26.0,4289.846154,1545.203875,2259.0,2943.75,4018.0,5496.00,7197.0,26.0,7.115385,...,0.00,0.0,26.0,0.192308,0.401918,0.0,0.0,0.0,0.0,1.0
Albania,12.0,5680.416667,904.846191,4403.0,4978.25,5613.0,6355.25,7158.0,12.0,38.250000,...,1.25,2.0,12.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
Algeria,25.0,4329.400000,1499.876773,2353.0,3023.00,4070.0,5489.00,7138.0,25.0,26.040000,...,3.00,11.0,25.0,5.600000,9.380832,0.0,0.0,0.0,12.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela,7.0,6344.142857,576.159535,5595.0,5928.50,6306.0,6733.50,7184.0,7.0,26.000000,...,0.00,0.0,7.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
Vietnam,58.0,2466.344828,1967.457916,80.0,894.00,1945.0,3599.00,7137.0,58.0,21.241379,...,0.00,0.0,58.0,9.206897,6.935185,0.0,1.0,10.5,16.0,16.0
Zambia,3.0,6980.333333,296.002252,6685.0,6832.00,6979.0,7128.00,7277.0,3.0,2.000000,...,0.00,0.0,3.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
Zimbabwe,1.0,7300.000000,,7300.0,7300.00,7300.0,7300.00,7300.0,1.0,1.000000,...,0.00,0.0,1.0,0.000000,,0.0,0.0,0.0,0.0,0.0


In [39]:
# rows ordered from the highest to the lowest Confirmed count
cv19df.sort_values(by='Confirmed', ascending=False)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Severity
6722,6723,03/19/2020,Hubei,Mainland China,2020-03-19T10:13:14,67800.0,3130.0,57682.0,bad
7014,7015,03/20/2020,Hubei,Mainland China,2020-03-20T07:43:02,67800.0,3133.0,58382.0,bad
6438,6439,03/18/2020,Hubei,Mainland China,2020-03-18T12:13:09,67800.0,3122.0,56927.0,bad
6162,6163,03/17/2020,Hubei,Mainland China,2020-03-17T11:53:10,67799.0,3111.0,56003.0,bad
5890,5891,03/16/2020,Hubei,Mainland China,2020-03-16T14:38:45,67798.0,3099.0,55142.0,bad
...,...,...,...,...,...,...,...,...,...
6433,6434,03/17/2020,,Guernsey,2020-03-17T18:33:03,0.0,0.0,0.0,normal
2582,2583,02/27/2020,"Lackland, TX (From Diamond Princess)",US,2020-02-24T23:33:02,0.0,0.0,0.0,normal
2583,2584,02/27/2020,"Omaha, NE (From Diamond Princess)",US,2020-02-24T23:33:02,0.0,0.0,0.0,normal
2584,2585,02/27/2020,"Travis, CA (From Diamond Princess)",US,2020-02-24T23:33:02,0.0,0.0,0.0,normal
