# Air Quality Control
#### India 2020

Data source: https://data.gov.in/resources/real-time-air-quality-index-various-locations/api

In [1]:
# Used for fetching Data from the source
import requests

# Used for Data Wrangling & Analysis Task 
import pandas as pd

## Fetching data ...

In [2]:
# Query parameters shared by every request to the data.gov.in AQI resource.
# NOTE(review): the API key is committed in plain text. If it is a personal key,
# rotate it and load it from an environment variable or a secrets store instead.
apiVal = {
    'api-key' : '579b464db66ec23bdd000001cdd3946e44ce4aad7209ff7b23ac571b',
    'format' : 'json',
    'limit' : '10'
}

apiUrl = 'https://api.data.gov.in/resource/3b01bcb8-0b14-4abf-b6f2-c1bfd384ba69'
pageSize = int(apiVal['limit'])  # the offset step has to match the page size requested
maxOffset = 1500                 # Values are set based on the API requirements.
requestTimeout = 30              # seconds; without a timeout a stalled request blocks the cell forever

print("Data Fetch - Initiated")

listOfRecords = []       # one parsed JSON payload per successful request
airQualityRecords = []   # flattened "records" entries from every payload
for offset in range(0, maxOffset, pageSize):
  # Let requests build and URL-encode the query string instead of formatting it by hand.
  responseObject = requests.get(apiUrl, params=dict(apiVal, offset=offset), timeout=requestTimeout)
  if responseObject.status_code != 200:
    # Say why paging stopped rather than ending silently with partial data.
    print("Data Fetch - Stopped at offset {} (HTTP status {})".format(offset, responseObject.status_code))
    break
  payload = responseObject.json()  # parse the body once and reuse it
  listOfRecords.append(payload)
  airQualityRecords.extend(payload["records"])

print("Data Fetch - Complete")

Data Fetch - Initiated
Data Fetch - Complete


In [3]:
# Report how many payloads were collected and how many records they contained
requestCount = len(listOfRecords)
recordCount = len(airQualityRecords)
print( "No of Requests Made :: {} \t Total Data Points Fetched:: {} \t ".format(requestCount, recordCount) )

No of Requests Made :: 150 	 Total Data Points Fetched:: 1416 	 


In [4]:
# Storing data into a structured format: the fetched records are loaded
# into a single DataFrame for the analysis cells below
dfData = pd.DataFrame(airQualityRecords)

In [5]:
# Looking at its summary: column names, non-null counts and dtypes
dfData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1416 entries, 0 to 1415
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              1416 non-null   object
 1   country         1416 non-null   object
 2   state           1416 non-null   object
 3   city            1416 non-null   object
 4   station         1416 non-null   object
 5   last_update     1416 non-null   object
 6   pollutant_id    1416 non-null   object
 7   pollutant_min   1416 non-null   object
 8   pollutant_max   1416 non-null   object
 9   pollutant_avg   1416 non-null   object
 10  pollutant_unit  1416 non-null   object
dtypes: object(11)
memory usage: 121.8+ KB


In [6]:
# Exploring the initial records (first two rows only)
dfData.head(2)

Unnamed: 0,id,country,state,city,station,last_update,pollutant_id,pollutant_min,pollutant_max,pollutant_avg,pollutant_unit
0,1,India,Andhra_Pradesh,Amaravati,"Secretariat, Amaravati - APPCB",20-10-2020 10:00:00,PM2.5,12,40,21,
1,2,India,Andhra_Pradesh,Amaravati,"Secretariat, Amaravati - APPCB",20-10-2020 10:00:00,PM10,13,45,26,


In [7]:
# Checking for irregular data: count the rows in which at least one of the
# three pollutant readings is the placeholder string 'NA'
readingCols = ['pollutant_min', 'pollutant_max', 'pollutant_avg']
hasPlaceholder = dfData[readingCols].eq('NA').any(axis=1)
len(dfData[hasPlaceholder])

111

In [8]:
# Cleaning the data: keep only the rows in which ALL three readings are present.
# A row is dropped as soon as any one reading is the placeholder 'NA'. The mask
# therefore negates the OR of the checks; OR-ing the negated checks instead would
# only drop rows where every reading is 'NA', and a row with a single 'NA' would
# then break the float conversion applied to these columns afterwards.
readingCols = ['pollutant_min', 'pollutant_max', 'pollutant_avg']
hasMissingReading = dfData[readingCols].eq('NA').any(axis=1)
cleanDf = dfData[~hasMissingReading].copy()

In [9]:
# (rows, columns) of the cleaned frame
cleanDf.shape

(1305, 11)

In [10]:
# Summary of the cleaned frame: column names, non-null counts and dtypes
cleanDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1305 entries, 0 to 1415
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              1305 non-null   object
 1   country         1305 non-null   object
 2   state           1305 non-null   object
 3   city            1305 non-null   object
 4   station         1305 non-null   object
 5   last_update     1305 non-null   object
 6   pollutant_id    1305 non-null   object
 7   pollutant_min   1305 non-null   object
 8   pollutant_max   1305 non-null   object
 9   pollutant_avg   1305 non-null   object
 10  pollutant_unit  1305 non-null   object
dtypes: object(11)
memory usage: 122.3+ KB


In [11]:
# Handling numerical data: the readings are stored as objects, so cast each
# of the three measurement columns to float
for readingCol in ('pollutant_min', 'pollutant_max', 'pollutant_avg'):
  cleanDf[readingCol] = cleanDf[readingCol].astype(float)

In [12]:
# Number of distinct cities in the cleaned data
cleanDf["city"].nunique()

119

### Storytelling

In [13]:
# Geographic coverage of the cleaned data: a distinct count followed by the
# distinct values for each level
print("Data Comprises of :: -\n\n ")

countryCol = cleanDf["country"]
stateCol = cleanDf["state"]
cityCol = cleanDf["city"]
stationCol = cleanDf["station"]

print("No of Countries :: {}".format( countryCol.nunique() ) )
print("\t Countries :: {}".format( countryCol.unique() ) )
print("No of States :: {}".format( stateCol.nunique() ) )
print("\t States :: {}".format( stateCol.unique() ) )

print("No of Cities :: {}".format( cityCol.nunique() ) )
print("\t Cities :: {}".format( cityCol.unique() ) )
print("No of Station :: {}".format( stationCol.nunique() ) )
# The individual station names are not printed because the list is long.

Data Comprises of :: -

 
No of Countries :: 1
	 Countries :: ['India']
No of States :: 20
	 States :: ['Andhra_Pradesh' 'Assam' 'Bihar' 'Chandigarh' 'Delhi' 'Gujarat' 'Haryana'
 'Karnataka' 'Kerala' 'Madhya Pradesh' 'Maharashtra' 'Meghalaya'
 'Nagaland' 'Odisha' 'Punjab' 'Rajasthan' 'TamilNadu' 'Telangana'
 'Uttar_Pradesh' 'West_Bengal']
No of Cities :: 119
	 Cities :: ['Amaravati' 'Rajamahendravaram' 'Tirupati' 'Visakhapatnam' 'Guwahati'
 'Gaya' 'Hajipur' 'Muzaffarpur' 'Patna' 'Chandigarh' 'Delhi' 'Ahmedabad'
 'Ankleshwar' 'Gandhinagar' 'Vapi' 'Vatva' 'Ambala' 'Bahadurgarh'
 'Ballabgarh' 'Bhiwani' 'Charkhi Dadri' 'Dharuhera' 'Faridabad'
 'Fatehabad' 'Gurugram' 'Hisar' 'Jind' 'Kaithal' 'Karnal' 'Kurukshetra'
 'Mandikhera' 'Manesar' 'Narnaul' 'Palwal' 'Panchkula' 'Panipat' 'Rohtak'
 'Sirsa' 'Sonipat' 'Yamunanagar' 'Bengaluru' 'Chikkaballapur'
 'Chikkamagaluru' 'Hubballi' 'Kalaburgi' 'Mysuru' 'Ramanagara'
 'Vijayapura' 'Yadgir' 'Eloor' 'Kannur' 'Kollam' 'Kozhikode'
 'Thiruvananthapuram'

In [14]:
# List every pollutant type present in the cleaned data
pollutantNames = cleanDf["pollutant_id"].unique()
print(" Different pollutants captured - {}".format(pollutantNames))

 Different pollutants captured - ['PM2.5' 'PM10' 'NO2' 'NH3' 'SO2' 'CO' 'OZONE']


In [15]:
# Further diving into the data: min / avg / max readings summed per
# (pollutant, city) pair.
# The columns are selected with a list: indexing a groupby with a bare tuple of
# column names is deprecated (FutureWarning) and raises in pandas >= 2.0.
# NOTE(review): readings are summed over all of a city's rows - confirm that a
# sum (rather than a mean or max) is the intended aggregate.
cleanDf.groupby(['pollutant_id', 'city'])[['pollutant_min', 'pollutant_avg', 'pollutant_max']].sum()

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,pollutant_min,pollutant_avg,pollutant_max
pollutant_id,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CO,Agra,12.0,46.0,91.0
CO,Ahmedabad,8.0,53.0,103.0
CO,Ajmer,5.0,46.0,81.0
CO,Alwar,4.0,56.0,108.0
CO,Amaravati,18.0,41.0,64.0
...,...,...,...,...
SO2,Udaipur,3.0,14.0,41.0
SO2,Vatva,12.0,24.0,45.0
SO2,Vijayapura,1.0,3.0,5.0
SO2,Visakhapatnam,2.0,11.0,21.0


In [16]:
# Same aggregate grouped city-first: min / avg / max readings summed per
# (city, pollutant) pair.
# The columns are selected with a list: indexing a groupby with a bare tuple of
# column names is deprecated (FutureWarning) and raises in pandas >= 2.0.
cleanDf.groupby(['city', 'pollutant_id'])[['pollutant_min', 'pollutant_avg', 'pollutant_max']].sum()

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,pollutant_min,pollutant_avg,pollutant_max
city,pollutant_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Agra,CO,12.0,46.0,91.0
Agra,NO2,25.0,56.0,116.0
Agra,OZONE,19.0,87.0,192.0
Agra,PM2.5,97.0,247.0,313.0
Agra,SO2,1.0,18.0,82.0
...,...,...,...,...
Yamunanagar,NO2,16.0,58.0,107.0
Yamunanagar,OZONE,6.0,68.0,110.0
Yamunanagar,PM10,83.0,220.0,404.0
Yamunanagar,PM2.5,69.0,254.0,378.0


In [17]:
# High Risk of Each Pollutant today

reportCols = ['state', 'city','station','pollutant_avg']
for pollutantName in cleanDf['pollutant_id'].unique():
  print("For pollutant - {} ".format(pollutantName))
  # Rank this pollutant's rows by average reading, highest first, and keep the top one
  pollutantRows = cleanDf[cleanDf["pollutant_id"] == pollutantName]
  topRow = pollutantRows.sort_values('pollutant_avg', ascending=False).head(1)
  print(" \t Place at risk - {}".format(topRow[reportCols].to_dict("records")[0]) )

For pollutant - PM2.5 
 	 Place at risk - {'state': 'Uttar_Pradesh', 'city': 'Lucknow', 'station': 'Talkatora District Industries Center, Lucknow - CPCB', 'pollutant_avg': 341.0}
For pollutant - PM10 
 	 Place at risk - {'state': 'Haryana', 'city': 'Ballabgarh', 'station': 'Nathu Colony, Ballabgarh - HSPCB', 'pollutant_avg': 292.0}
For pollutant - NO2 
 	 Place at risk - {'state': 'Uttar_Pradesh', 'city': 'Meerut', 'station': 'Jai Bhim Nagar, Meerut - UPPCB', 'pollutant_avg': 192.0}
For pollutant - NH3 
 	 Place at risk - {'state': 'Maharashtra', 'city': 'Nagpur', 'station': 'Opp GPO Civil Lines, Nagpur - MPCB', 'pollutant_avg': 60.0}
For pollutant - SO2 
 	 Place at risk - {'state': 'Haryana', 'city': 'Jind', 'station': 'Police Lines, Jind - HSPCB', 'pollutant_avg': 75.0}
For pollutant - CO 
 	 Place at risk - {'state': 'Delhi', 'city': 'Delhi', 'station': 'ITO, Delhi - CPCB', 'pollutant_avg': 185.0}
For pollutant - OZONE 
 	 Place at risk - {'state': 'Haryana', 'city': 'Bhiwani', 'st

In [18]:
# Historical High

peakCols = ['state', 'city','station','pollutant_max']
for pollutantName in cleanDf['pollutant_id'].unique():
  print("For pollutant - {} ".format(pollutantName))
  # Rank this pollutant's rows by maximum reading, highest first, and keep the top one
  isThisPollutant = cleanDf["pollutant_id"] == pollutantName
  rankedRows = cleanDf[isThisPollutant].sort_values('pollutant_max', ascending=False)
  peakRecord = rankedRows.head(1)[peakCols].to_dict("records")[0]
  print(" \t Place at risk - {}".format(peakRecord) )

For pollutant - PM2.5 
 	 Place at risk - {'state': 'Haryana', 'city': 'Manesar', 'station': 'Sector-2 IMT, Manesar - HSPCB', 'pollutant_max': 457.0}
For pollutant - PM10 
 	 Place at risk - {'state': 'Delhi', 'city': 'Delhi', 'station': 'Dwarka-Sector 8, Delhi - DPCC ', 'pollutant_max': 500.0}
For pollutant - NO2 
 	 Place at risk - {'state': 'Uttar_Pradesh', 'city': 'Meerut', 'station': 'Jai Bhim Nagar, Meerut - UPPCB', 'pollutant_max': 403.0}
For pollutant - NH3 
 	 Place at risk - {'state': 'Rajasthan', 'city': 'Jaipur', 'station': 'Police Commissionerate, Jaipur - RSPCB', 'pollutant_max': 86.0}
For pollutant - SO2 
 	 Place at risk - {'state': 'Haryana', 'city': 'Rohtak', 'station': 'MD University, Rohtak - HSPCB', 'pollutant_max': 133.0}
For pollutant - CO 
 	 Place at risk - {'state': 'Karnataka', 'city': 'Bengaluru', 'station': 'BWSSB Kadabesanahalli, Bengaluru - CPCB', 'pollutant_max': 196.0}
For pollutant - OZONE 
 	 Place at risk - {'state': 'Delhi', 'city': 'Delhi', 'statio

In [19]:
# Low Risk of Each Pollutant today: for every pollutant, the row with the
# lowest average reading

for singlePL in cleanDf['pollutant_id'].unique():
  print("For pollutant - {} ".format(singlePL))
  # Ascending sort puts the smallest average first; head(1) keeps that row
  tempDF = cleanDf[(cleanDf["pollutant_id"]== singlePL)].sort_values('pollutant_avg', ascending=True).head(1).copy()
  # This is the lowest reading, so the label says "lowest risk" rather than "at risk"
  print(" \t Place at lowest risk - {}".format(tempDF[['state', 'city','station','pollutant_avg']].to_dict("records")[0]) )

For pollutant - PM2.5 
 	 Place at risk - {'state': 'Kerala', 'city': 'Eloor', 'station': 'Udyogamandal, Eloor - Kerala PCB', 'pollutant_avg': 6.0}
For pollutant - PM10 
 	 Place at risk - {'state': 'Karnataka', 'city': 'Mysuru', 'station': 'Hebbal 1st Stage, Mysuru - KSPCB', 'pollutant_avg': 19.0}
For pollutant - NO2 
 	 Place at risk - {'state': 'Maharashtra', 'city': 'Mumbai', 'station': 'Bandra, Mumbai - MPCB', 'pollutant_avg': 1.0}
For pollutant - NH3 
 	 Place at risk - {'state': 'Kerala', 'city': 'Kollam', 'station': 'Polayathode, Kollam - Kerala PCB', 'pollutant_avg': 1.0}
For pollutant - SO2 
 	 Place at risk - {'state': 'Telangana', 'city': 'Hyderabad', 'station': 'Central University, Hyderabad - TSPCB', 'pollutant_avg': 1.0}
For pollutant - CO 
 	 Place at risk - {'state': 'Gujarat', 'city': 'Vapi', 'station': 'Phase-1 GIDC, Vapi - GPCB', 'pollutant_avg': 3.0}
For pollutant - OZONE 
 	 Place at risk - {'state': 'Meghalaya', 'city': 'Shillong', 'station': 'Lumpyngngad, Shillo

In [20]:
# Lowest maximum of Each Pollutant: for every pollutant, the row with the
# smallest pollutant_max reading

for singlePL in cleanDf['pollutant_id'].unique():
  print("For pollutant - {} ".format(singlePL))
  # Ascending sort puts the smallest maximum first; head(1) keeps that row
  tempDF = cleanDf[(cleanDf["pollutant_id"]== singlePL)].sort_values('pollutant_max', ascending=True).head(1).copy()
  # This is the lowest reading, so the label says "lowest risk" rather than "at risk"
  print(" \t Place at lowest risk - {}".format(tempDF[['state', 'city','station','pollutant_max']].to_dict("records")[0]) )

For pollutant - PM2.5 
 	 Place at risk - {'state': 'Kerala', 'city': 'Eloor', 'station': 'Udyogamandal, Eloor - Kerala PCB', 'pollutant_max': 9.0}
For pollutant - PM10 
 	 Place at risk - {'state': 'Kerala', 'city': 'Kannur', 'station': 'Thavakkara, Kannur - Kerala PCB', 'pollutant_max': 27.0}
For pollutant - NO2 
 	 Place at risk - {'state': 'Maharashtra', 'city': 'Mumbai', 'station': 'Bandra, Mumbai - MPCB', 'pollutant_max': 1.0}
For pollutant - NH3 
 	 Place at risk - {'state': 'Maharashtra', 'city': 'Mumbai', 'station': 'Worli, Mumbai - MPCB', 'pollutant_max': 1.0}
For pollutant - SO2 
 	 Place at risk - {'state': 'Kerala', 'city': 'Kozhikode', 'station': 'Palayam, Kozhikode - Kerala PCB', 'pollutant_max': 3.0}
For pollutant - CO 
 	 Place at risk - {'state': 'Gujarat', 'city': 'Vapi', 'station': 'Phase-1 GIDC, Vapi - GPCB', 'pollutant_max': 5.0}
For pollutant - OZONE 
 	 Place at risk - {'state': 'Bihar', 'city': 'Hajipur', 'station': 'Industrial Area, Hajipur - BSPCB', 'pollutan

In [21]:
# searching a particular city: case-insensitive substring match on the city name

searchTerm = "pur"  # change this to look for a different city-name fragment
# Vectorised string matching instead of a row-wise apply with a lambda.
# regex=False treats the term as plain text; na=False excludes rows without a city.
cityMatches = cleanDf["city"].str.lower().str.contains(searchTerm.lower(), regex=False, na=False)
cleanDf[cityMatches]

Unnamed: 0,id,country,state,city,station,last_update,pollutant_id,pollutant_min,pollutant_max,pollutant_avg,pollutant_unit
42,43,India,Bihar,Hajipur,"Industrial Area, Hajipur - BSPCB",20-10-2020 10:00:00,PM2.5,90.0,328.0,195.0,
43,44,India,Bihar,Hajipur,"Industrial Area, Hajipur - BSPCB",20-10-2020 10:00:00,PM10,88.0,203.0,141.0,
44,45,India,Bihar,Hajipur,"Industrial Area, Hajipur - BSPCB",20-10-2020 10:00:00,NO2,17.0,33.0,24.0,
45,46,India,Bihar,Hajipur,"Industrial Area, Hajipur - BSPCB",20-10-2020 10:00:00,NH3,1.0,2.0,1.0,
46,47,India,Bihar,Hajipur,"Industrial Area, Hajipur - BSPCB",20-10-2020 10:00:00,SO2,3.0,7.0,5.0,
...,...,...,...,...,...,...,...,...,...,...,...
1250,1251,India,Uttar_Pradesh,Hapur,"Anand Vihar, Hapur - UPPCB",20-10-2020 10:00:00,OZONE,1.0,2.0,2.0,
1251,1252,India,Uttar_Pradesh,Kanpur,"Nehru Nagar, Kanpur - UPPCB",20-10-2020 10:00:00,PM2.5,124.0,307.0,242.0,
1252,1253,India,Uttar_Pradesh,Kanpur,"Nehru Nagar, Kanpur - UPPCB",20-10-2020 10:00:00,NO2,37.0,277.0,85.0,
1253,1254,India,Uttar_Pradesh,Kanpur,"Nehru Nagar, Kanpur - UPPCB",20-10-2020 10:00:00,SO2,5.0,12.0,8.0,
