In [20]:
# libraries

from datetime import datetime
import os
import glob
import requests 
import pandas as pd
from bs4 import BeautifulSoup

# Web Scrapping

In [21]:
# web scrapping

link = 'https://www.mohfw.gov.in/'
req = requests.get(link)
soup = BeautifulSoup(req.content, "html.parser")

thead = soup.find_all('thead')[1]
head = thead.find_all('tr')

tbody = soup.find_all('tbody')[1]
body = tbody.find_all('tr')

# print(rows)

head_rows = []
body_rows = []

for tr in head:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    head_rows.append(row)
    
for tr in body:
    td = tr.find_all(['th', 'td'])
    row = [i.text for i in td]
    body_rows.append(row)
    
print(head_rows)
    
df_bs = pd.DataFrame(body_rows[:len(body_rows)-1], columns=head_rows[0])
    
df_bs.drop('S. No.', axis=1, inplace=True)
df_bs.head(36)

[['S. No.', 'Name of State / UT', 'Total Confirmed cases (Indian National)', 'Total Confirmed cases ( Foreign National )', 'Cured/Discharged/Migrated', 'Death']]


Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death
0,Andhra Pradesh,3,0,0,0
1,Chhattisgarh,1,0,0,0
2,Delhi,26,1,5,1
3,Gujarat,14,0,0,0
4,Haryana,3,14,0,0
5,Himachal Pradesh,2,0,0,0
6,Karnataka,20,0,2,1
7,Kerala,45,7,3,0
8,Madhya Pradesh,4,0,0,0
9,Maharashtra,60,3,0,2


# Data Cleaning

In [22]:
# date-time information
# ---------------------

now  = datetime.now()
df_bs['Date'] = now.strftime("%m/%d/%Y") 
df_bs['Date'] = pd.to_datetime(df_bs['Date'], format='%m/%d/%Y')
df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date
0,Andhra Pradesh,3,0,0,0,2020-03-22
1,Chhattisgarh,1,0,0,0,2020-03-22
2,Delhi,26,1,5,1,2020-03-22
3,Gujarat,14,0,0,0,2020-03-22
4,Haryana,3,14,0,0,2020-03-22
5,Himachal Pradesh,2,0,0,0,2020-03-22
6,Karnataka,20,0,2,1,2020-03-22
7,Kerala,45,7,3,0,2020-03-22
8,Madhya Pradesh,4,0,0,0,2020-03-22
9,Maharashtra,60,3,0,2,2020-03-22


In [23]:
df_bs['Name of State / UT'].unique()

array(['Andhra Pradesh', 'Chhattisgarh', 'Delhi', 'Gujarat', 'Haryana',
       'Himachal Pradesh', 'Karnataka', 'Kerala', 'Madhya Pradesh',
       'Maharashtra', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan',
       'Tamil Nadu', 'Telengana', 'Chandigarh', 'Jammu and Kashmir',
       'Ladakh', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal'],
      dtype=object)

In [24]:
# latitude and longitude information
# ----------------------------------

lat = {'Delhi':28.7041,
       'Haryana':29.0588,
       'Kerala':10.8505,
       'Rajasthan':27.0238,
       'Telengana':18.1124,
       'Uttar Pradesh':26.8467,
       'Ladakh':34.2996,
       'Tamil Nadu':11.1271,
       'Jammu and Kashmir':33.7782,
       'Punjab':31.1471,
       'Karnataka':15.3173,
       'Maharashtra':19.7515,
       'Andhra Pradesh':15.9129, 
       'Odisha':20.9517, 
       'Uttarakhand':30.0668, 
       'West Bengal':22.9868, 
       'Puducherry': 11.9416, 
       'Chandigarh': 30.7333, 
       'Chhattisgarh':21.2787, 
       'Gujarat': 22.2587, 
       'Himachal Pradesh': 31.1048, 
       'Madhya Pradesh':   22.9734}

long = {'Delhi':77.1025,
        'Haryana':76.0856,
        'Kerala':76.2711,
        'Rajasthan':74.2179,
        'Telengana':79.0193,
        'Uttar Pradesh':80.9462,
        'Ladakh':78.2932,
        'Tamil Nadu':78.6569,
        'Jammu and Kashmir':76.5762,
        'Punjab':75.3412,
        'Karnataka':75.7139,
        'Maharashtra':75.7139,
        'Andhra Pradesh':79.7400, 
        'Odisha':85.0985, 
        'Uttarakhand':79.0193, 
        'West Bengal':87.8550, 
        'Puducherry': 79.8083, 
        'Chandigarh': 76.7794, 
        'Chhattisgarh':81.8661, 
        'Gujarat': 71.1924, 
        'Himachal Pradesh': 77.1734, 
        'Madhya Pradesh':   78.6569}

df_bs['Latitude'] = df_bs['Name of State / UT'].map(lat)
df_bs['Longitude'] = df_bs['Name of State / UT'].map(long)

df_bs.head()

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,3,0,0,0,2020-03-22,15.9129,79.74
1,Chhattisgarh,1,0,0,0,2020-03-22,21.2787,81.8661
2,Delhi,26,1,5,1,2020-03-22,28.7041,77.1025
3,Gujarat,14,0,0,0,2020-03-22,22.2587,71.1924
4,Haryana,3,14,0,0,2020-03-22,29.0588,76.0856


In [25]:
df_bs.isna().sum()

Name of State / UT                            0
Total Confirmed cases (Indian National)       0
Total Confirmed cases ( Foreign National )    0
Cured/Discharged/Migrated                     0
Death                                         0
Date                                          0
Latitude                                      0
Longitude                                     0
dtype: int64

# Saving data

In [26]:
# saving data
# -----------

file_name = now.strftime("%Y_%m_%d")+'.csv'
file_loc = 'C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\'
df_bs.to_csv(file_loc + file_name, index=False)

df_bs.head(36)

Unnamed: 0,Name of State / UT,Total Confirmed cases (Indian National),Total Confirmed cases ( Foreign National ),Cured/Discharged/Migrated,Death,Date,Latitude,Longitude
0,Andhra Pradesh,3,0,0,0,2020-03-22,15.9129,79.74
1,Chhattisgarh,1,0,0,0,2020-03-22,21.2787,81.8661
2,Delhi,26,1,5,1,2020-03-22,28.7041,77.1025
3,Gujarat,14,0,0,0,2020-03-22,22.2587,71.1924
4,Haryana,3,14,0,0,2020-03-22,29.0588,76.0856
5,Himachal Pradesh,2,0,0,0,2020-03-22,31.1048,77.1734
6,Karnataka,20,0,2,1,2020-03-22,15.3173,75.7139
7,Kerala,45,7,3,0,2020-03-22,10.8505,76.2711
8,Madhya Pradesh,4,0,0,0,2020-03-22,22.9734,78.6569
9,Maharashtra,60,3,0,2,2020-03-22,19.7515,75.7139


In [27]:
df_bs.columns

Index(['Name of State / UT', 'Total Confirmed cases (Indian National)',
       'Total Confirmed cases ( Foreign National )',
       'Cured/Discharged/Migrated', 'Death', 'Date', 'Latitude', 'Longitude'],
      dtype='object')

# Combining data

In [28]:
! ls C:\Users\imdevskp\Desktop\covid_india\.day_by_day_data

2020_03_21.csv
2020_03_22.csv


In [29]:
# pd.read_csv?

In [30]:
# complete data

loc = "C:\\Users\\imdevskp\\Desktop\\covid_india\\.day_by_day_data\\"

files = glob.glob(loc+'2020*.csv')
dfs = []
for i in files:
    df_temp = pd.read_csv(i)
    df_temp = df_temp.rename(columns={'Cured':'Cured/Discharged'})
    df_temp = df_temp.rename(columns={'Cured/Discharged':'Cured/Discharged/Migrated'})
    dfs.append(df_temp)
    
print(dfs)

complete_data = pd.concat(dfs, ignore_index=True).sort_values(['Date'], ascending=True).reset_index(drop=True)
complete_data['Date'] = pd.to_datetime(complete_data['Date'])
complete_data = complete_data.sort_values(['Date', 'Name of State / UT']).reset_index(drop=True)

cols = ['Total Confirmed cases (Indian National)', 'Total Confirmed cases ( Foreign National )', 
              'Cured/Discharged/Migrated', 'Death']

complete_data[cols] = complete_data[cols].fillna(0).astype('int')

# complete_data.tail(50)

[          Date Name of State / UT  Total Confirmed cases (Indian National)  \
0    1/30/2020             Kerala                                        1   
1    1/31/2020             Kerala                                        1   
2     2/1/2020             Kerala                                        2   
3     2/2/2020             Kerala                                        3   
4     2/3/2020             Kerala                                        3   
..         ...                ...                                      ...   
265  3/21/2020         Tamil Nadu                                        3   
266  3/21/2020          Telengana                                       10   
267  3/21/2020      Uttar Pradesh                                       23   
268  3/21/2020        Uttarakhand                                        3   
269  3/21/2020        West Bengal                                        3   

     Total Confirmed cases ( Foreign National )  Cured/Dischar

In [31]:
complete_data.columns

Index(['Date', 'Name of State / UT', 'Total Confirmed cases (Indian National)',
       'Total Confirmed cases ( Foreign National )',
       'Cured/Discharged/Migrated', 'Latitude', 'Longitude', 'Death'],
      dtype='object')

In [32]:
complete_data['Name of State / UT'].replace('Chattisgarh', 'Chhattisgarh', inplace=True)
complete_data['Name of State / UT'].replace('Pondicherry', 'Puducherry', inplace=True) 

In [33]:
complete_data['Name of State / UT'].unique()

array(['Kerala', 'Delhi', 'Telengana', 'Rajasthan', 'Haryana',
       'Uttar Pradesh', 'Tamil Nadu', 'Union Territory of Ladakh',
       'Karnataka', 'Maharashtra', 'Punjab',
       'Union Territory of Jammu and Kashmir', 'Andhra Pradesh',
       'Uttarakhand', 'Odisha', 'Puducherry', 'West Bengal',
       'Chhattisgarh', 'Union Territory of Chandigarh', 'Gujarat',
       'Chandigarh', 'Himachal Pradesh', 'Jammu and Kashmir', 'Ladakh',
       'Madhya Pradesh'], dtype=object)

In [34]:
# sorted(complete_data['Name of State / UT'].unique())

In [35]:
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 8 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Date                                        292 non-null    datetime64[ns]
 1   Name of State / UT                          292 non-null    object        
 2   Total Confirmed cases (Indian National)     292 non-null    int32         
 3   Total Confirmed cases ( Foreign National )  292 non-null    int32         
 4   Cured/Discharged/Migrated                   292 non-null    int32         
 5   Latitude                                    292 non-null    float64       
 6   Longitude                                   292 non-null    float64       
 7   Death                                       292 non-null    int32         
dtypes: datetime64[ns](1), float64(2), int32(4), object(1)
memory usage: 13.8+ KB


In [39]:
complete_data.to_csv('complete.csv', index=False)