# COVID-19 in India - Analysis, Visualization & Comparisons

In [25]:
# Table of Content
# Libraries
# Dataset
# Preprocessing
# EDA- Exploratory Data Analysis
# Visualizations

# Libraries

In [26]:
#importing libraries
#process data in the form of dataframes
import pandas as pd
#numerical analysis
import numpy as np
#for vizualization
import matplotlib.pyplot as plt
#advanced vizualization
import seaborn as sns

In [27]:
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
import gc
import warnings
warnings.filterwarnings("ignore")


# Dataset

In [28]:
#Dataset from https://www.kaggle.com/sudalairajkumar/covid19-in-india
#Data from 2020-01-30 till 2021-08-11
df = pd.read_csv('C:\\Users\\khushi\\Desktop\\py\\India Covid Analysis\\archive\\covid_19_india.csv')
df.head()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,4,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,5,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


In [29]:
df.isnull().sum()

Sno                         0
Date                        0
Time                        0
State/UnionTerritory        0
ConfirmedIndianNational     0
ConfirmedForeignNational    0
Cured                       0
Deaths                      0
Confirmed                   0
dtype: int64

# Data Preprocessing

In [30]:
df = df.rename(columns={'Cured': 'Recovered','ConfirmedIndianNational':'Confirmed_Indian_National', 'ConfirmedForeignNational': 'Confirmed_Foreign_National'})
df.head()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,Confirmed_Indian_National,Confirmed_Foreign_National,Recovered,Deaths,Confirmed
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,4,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,5,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


In [31]:
df.drop(['Sno', 'Time'], inplace = True, axis = 1)
df.head()

Unnamed: 0,Date,State/UnionTerritory,Confirmed_Indian_National,Confirmed_Foreign_National,Recovered,Deaths,Confirmed
0,2020-01-30,Kerala,1,0,0,0,1
1,2020-01-31,Kerala,1,0,0,0,1
2,2020-02-01,Kerala,2,0,0,0,2
3,2020-02-02,Kerala,3,0,0,0,3
4,2020-02-03,Kerala,3,0,0,0,3


In [32]:
df['Active'] = df['Confirmed'] - (df['Recovered'] +df['Deaths'])
df.tail()

Unnamed: 0,Date,State/UnionTerritory,Confirmed_Indian_National,Confirmed_Foreign_National,Recovered,Deaths,Confirmed,Active
18105,2021-08-11,Telangana,-,-,638410,3831,650353,8112
18106,2021-08-11,Tripura,-,-,77811,773,80660,2076
18107,2021-08-11,Uttarakhand,-,-,334650,7368,342462,444
18108,2021-08-11,Uttar Pradesh,-,-,1685492,22775,1708812,545
18109,2021-08-11,West Bengal,-,-,1506532,18252,1534999,10215


In [33]:
df = df.replace('-', 0)

In [34]:
#print (df['Date'].dtype)
#df['Date'] = pd.to_datetime(df['Date'])
#print (df['Date'].dtype)

In [35]:
total = ['All', 'All', 'All', 'All', df['Recovered'].sum(), df['Deaths'].sum(), df['Confirmed'].sum(), df['Active'].sum()]
df.loc[len(df.index)] = total
df.tail()

Unnamed: 0,Date,State/UnionTerritory,Confirmed_Indian_National,Confirmed_Foreign_National,Recovered,Deaths,Confirmed,Active
18106,2021-08-11,Tripura,0,0,77811,773,80660,2076
18107,2021-08-11,Uttarakhand,0,0,334650,7368,342462,444
18108,2021-08-11,Uttar Pradesh,0,0,1685492,22775,1708812,545
18109,2021-08-11,West Bengal,0,0,1506532,18252,1534999,10215
18110,All,All,All,All,5046125452,73389005,5451678687,332164230


# Exploratory Data Analysis

In [36]:
cases = df[['Active', 'Recovered', 'Deaths']].loc[len(df.index)-1]
cases_df = pd.DataFrame(cases).reset_index()

cases_df.columns = ['Type', 'Total']

cases_df['Percentage'] = np.round(100*cases_df['Total']/np.sum(cases_df['Total']), 2)
cases_df

Unnamed: 0,Type,Total,Percentage
0,Active,332164230,6.09
1,Recovered,5046125452,92.56
2,Deaths,73389005,1.35


In [37]:
#sns.barplot(x="Type", y="Total", data=cases_df)

In [38]:
fig = go.Figure()

fig.add_trace(go.Bar(x = cases_df['Type'], y = cases_df['Percentage'], marker_color = ['Cyan', 'Pink', 'Purple']))
fig.update_xaxes(title_text="Type")
fig.update_yaxes(title_text="Percentage")
fig.show()

In [39]:
#cleaning dataset
df['State/UnionTerritory'] = df['State/UnionTerritory'].replace(['Bihar****','Himachal Pradesh', 'Dadra and Nagar Haveli', 'Daman & Diu', 'Karanataka', 'Madhya Pradesh***', 'Maharashtra***', 'Telengana', 'Cases being reassigned to states'], 
                 ['Bihar', 'Himanchal Pradesh', 'Dadra and Nagar Haveli and Daman and Diu', 'Dadra and Nagar Haveli and Daman and Diu', 'Karnataka','Madhya Pradesh', 'Maharashtra', 'Telangana', 'Unassigned'])

In [40]:
state_df = df.groupby('State/UnionTerritory').sum().drop('All')

In [41]:
state_df = state_df.reset_index()
state_df

Unnamed: 0,State/UnionTerritory,Recovered,Deaths,Confirmed,Active
0,Andaman and Nicobar Islands,1848286,27136,1938498,63076
1,Andhra Pradesh,370426530,2939367,392432753,19066856
2,Arunachal Pradesh,6588149,26799,7176907,561959
3,Assam,92678680,638323,99837011,6520008
4,Bihar,126525370,1112347,133662075,6024358
5,Chandigarh,10117035,147694,10858627,593898
6,Chhattisgarh,151609364,2063920,163776262,10102978
7,Dadra and Nagar Haveli and Daman and Diu,1862102,1022,1959356,96232
8,Delhi,273419887,4943294,287227765,8864584
9,Goa,26027201,447801,28240159,1765157


In [42]:
def state_vizualization(state_list):
    for label in state_list:
        s_df = state_df[['State/UnionTerritory', label]]
        s_df['Percentage'] = np.round(100*s_df[label]/ np.sum(s_df[label]), 2)
        s_df['Virus'] = ['Covid-19' for i in range(len(s_df))]
        
        fig = px.bar(s_df, x = 'Virus', y = 'Percentage', color = 'State/UnionTerritory', hover_data = [label])
        fig.update_layout(title = {"text":f"{label}" })
        fig.show()
        gc.collect

In [43]:
cases_list = ["Confirmed", "Active"]
deaths_list = ["Deaths"]
recovered_list = ["Recovered"]

# Visualization

In [44]:
state_vizualization(cases_list)

In [45]:
state_vizualization(deaths_list)

In [46]:
state_vizualization(recovered_list)

In [47]:
top_state = 10
state = state_df.columns[1:5]

fig = go.Figure()
s = 0
for i in state_df.index:
    if s < top_state:
        fig.add_trace(go.Bar(name = state_df['State/UnionTerritory'][i], x = state, y = state_df.loc[i][1:5]))
    else:
        break
    s +=1
    
fig.update_layout(title = {"text":f'Top {top_state} States/UnionTerritories affected '}, yaxis_type = "log")
fig.show()