In [1]:
import pandas as pd

### Import Datasets

In [2]:
df = pd.read_csv("./other_data/covid19_indonesia.csv")

selected_columns = ["Date", "Location", "New Cases", "New Deaths", "New Recovered", "Longitude", "Latitude"]
covid_df = df[selected_columns]
covid_df.head()

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude
0,3/1/2020,DKI Jakarta,2,0,0,106.836118,-6.204699
1,3/2/2020,DKI Jakarta,2,0,0,106.836118,-6.204699
2,3/2/2020,Indonesia,2,0,0,113.921327,-0.789275
3,3/2/2020,Riau,1,0,0,101.805109,0.511648
4,3/3/2020,DKI Jakarta,2,0,0,106.836118,-6.204699


### Dataset Cleaning

In [3]:
covid_df.describe(include="all")

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude
count,31822,31822,31822.0,31822.0,31822.0,31822.0,31822.0
unique,930,35,,,,,
top,6/9/2021,DKI Jakarta,,,,,
freq,35,929,,,,,
mean,,,402.311388,9.920652,390.398498,113.700478,-2.725681
std,,,2320.629838,64.13908,2199.878802,9.862068,3.608065
min,,,0.0,0.0,0.0,96.910522,-8.682205
25%,,,3.0,0.0,2.0,106.109004,-6.204699
50%,,,27.0,0.0,20.0,113.417654,-2.461746
75%,,,130.0,3.0,123.0,121.201093,0.212037


In [4]:
covid_df[covid_df.isna().any(axis=1)].reset_index(drop=True)

# Replace NaN value with mean value
# value = covid_df["New Cases"].mean()
# covid_df["New Cases"].fillna(value, inplace=True)

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude


In [5]:
covid_df[covid_df.duplicated()]

# covid_df = covid_df.drop_duplicates().reset_index(drop=True)

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude


In [6]:
covid_df.loc[covid_df["New Cases"] == covid_df["New Cases"].max()]

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude
24410,2/16/2022,Indonesia,64718,167,25386,113.921327,-0.789275


In [7]:
covid_df = covid_df.drop(covid_df[covid_df["Location"] == "Indonesia"].index, axis=0).reset_index(drop=True)
covid_df.head()

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude
0,3/1/2020,DKI Jakarta,2,0,0,106.836118,-6.204699
1,3/2/2020,DKI Jakarta,2,0,0,106.836118,-6.204699
2,3/2/2020,Riau,1,0,0,101.805109,0.511648
3,3/3/2020,DKI Jakarta,2,0,0,106.836118,-6.204699
4,3/3/2020,Jawa Barat,1,1,0,107.603708,-6.920432


In [8]:
covid_df.describe(include="all")

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude
count,30893,30893,30893.0,30893.0,30893.0,30893.0,30893.0
unique,929,34,,,,,
top,6/8/2021,DKI Jakarta,,,,,
freq,34,929,,,,,
mean,,,207.079565,5.108568,200.840093,113.693837,-2.783911
std,,,771.874452,22.344819,813.941207,10.009183,3.646021
min,,,0.0,0.0,0.0,96.910522,-8.682205
25%,,,3.0,0.0,2.0,106.109004,-6.204699
50%,,,24.0,0.0,18.0,112.732941,-2.993595
75%,,,113.0,3.0,106.0,121.201093,0.212037


In [9]:
covid_df.loc[covid_df["New Cases"] == covid_df["New Cases"].max()]

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude
23729,2/17/2022,Jawa Barat,16251,13,3130,107.603708,-6.920432


### Data Exploration

In [10]:
pd.set_option('display.max_rows', 12)

In [11]:
min_value = covid_df["Date"].min()
max_value = covid_df["Date"].max()

print(f"The range date for this Analysis is {min_value} - {max_value}")

The range date for this Analysis is 1/1/2021 - 9/9/2022


In [12]:
covid_df.groupby(by="Location").Date.count()

Location
Aceh                 904
Bali                 919
Banten               924
Bengkulu             899
DKI Jakarta          929
                    ... 
Sulawesi Tenggara    920
Sulawesi Utara       904
Sumatera Barat       904
Sumatera Selatan     907
Sumatera Utara       913
Name: Date, Length: 34, dtype: int64

In [13]:
covid_df.sort_values(by="New Cases", ascending=False).head(5).reset_index(drop=True)

Unnamed: 0,Date,Location,New Cases,New Deaths,New Recovered,Longitude,Latitude
0,2/17/2022,Jawa Barat,16251,13,3130,107.603708,-6.920432
1,2/6/2022,DKI Jakarta,15825,28,8309,106.836118,-6.204699
2,2/16/2022,Jawa Barat,15196,15,5413,107.603708,-6.920432
3,7/12/2021,DKI Jakarta,14622,134,20477,106.836118,-6.204699
4,2/9/2022,DKI Jakarta,14353,17,8893,106.836118,-6.204699


In [14]:
covid_df["Emergency Level"] = "Green"
covid_df.loc[(covid_df["New Cases"] >= 3000) & (covid_df["New Cases"] < 6000), "Emergency Level"] = "Yellow"
covid_df.loc[(covid_df["New Cases"] >= 6000) & (covid_df["New Cases"] < 12000), "Emergency Level"] = "Orange"
covid_df.loc[covid_df["New Cases"] >= 12000, "Emergency Level"] = "Red"

covid_df.groupby(by="Emergency Level").Date.count().loc[["Green", "Yellow", "Orange", "Red"]]

Emergency Level
Green     30528
Yellow      238
Orange      102
Red          25
Name: Date, dtype: int64

In [15]:
covid_df["Date"] = pd.to_datetime(covid_df["Date"])

# Create new columns for year and month
covid_df["Year"] = covid_df["Date"].dt.year
covid_df["Month"] = covid_df["Date"].dt.month

# Group month by Quarter
covid_df.loc[(covid_df["Month"] >= 1) & (covid_df["Month"] < 4), "Quarter"] = "Q1"
covid_df.loc[(covid_df["Month"] >= 4) & (covid_df["Month"] < 7), "Quarter"] = "Q2"
covid_df.loc[(covid_df["Month"] >= 7) & (covid_df["Month"] < 10), "Quarter"] = "Q3"
covid_df.loc[(covid_df["Month"] >= 10) & (covid_df["Month"] < 13), "Quarter"] = "Q4"

covid_df.groupby(by=["Year", "Quarter"]).agg({
    "New Cases": "sum",
    "New Deaths": "sum",
    "New Recovered": "sum"
})

Unnamed: 0_level_0,Unnamed: 1_level_0,New Cases,New Deaths,New Recovered
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020,Q1,1020,186,191
2020,Q2,56586,3136,28043
2020,Q3,233326,8008,186237
2020,Q4,459916,11663,409289
2021,Q1,762803,18282,729770
2021,Q2,669353,17861,530259
2021,Q3,2031992,82916,2152458
2021,Q4,47653,2084,77825
2022,Q1,1750101,11018,1635253
2022,Q2,75649,1623,164878
