## Introduction to the Data

In [1]:
import pandas as pd
import numpy as np


In [2]:
df18 = pd.read_csv('2018.csv', parse_dates=['Date'])
df18.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,2018-04-19,HR,Zagreb,pm10,72,12.0,66.0,19.0,1034.64
1,2018-05-03,HR,Zagreb,pm10,72,5.0,46.0,20.0,740.53
2,2018-05-08,HR,Zagreb,pm10,69,7.0,33.0,17.0,286.35
3,2018-05-31,HR,Zagreb,pm10,48,15.0,60.0,25.0,704.61
4,2018-06-22,HR,Zagreb,pm10,62,1.0,60.0,7.0,670.06


In [3]:
df18['Specie'].unique()

array(['pm10', 'temperature', 'wind-speed', 'wind-gust', 'co', 'o3',
       'so2', 'no2', 'humidity', 'pressure', 'pm25', 'precipitation',
       'wd', 'd', 'uvi', 'aqi', 'pol', 'pm1', 'mepaqi'], dtype=object)

## Data Cleaning

### Data Cleaning 2018 DataFrame

In [4]:
df18 = pd.read_csv('2018.csv',parse_dates=['Date'])
df18.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,2018-04-19,HR,Zagreb,pm10,72,12.0,66.0,19.0,1034.64
1,2018-05-03,HR,Zagreb,pm10,72,5.0,46.0,20.0,740.53
2,2018-05-08,HR,Zagreb,pm10,69,7.0,33.0,17.0,286.35
3,2018-05-31,HR,Zagreb,pm10,48,15.0,60.0,25.0,704.61
4,2018-06-22,HR,Zagreb,pm10,62,1.0,60.0,7.0,670.06


In [5]:
df18['City'].unique()

array(['Zagreb', 'Rijeka', 'Split', 'Stockholm', 'Göteborg', 'Malmö',
       'Uppsala', 'Pristina', 'São José dos Campos', 'São Paulo',
       'Hamilton', 'Calgary', 'Winnipeg', 'Halifax', 'Kitchener',
       'Edmonton', 'Surrey', 'Mississauga', 'Québec', 'Vancouver',
       'Victoria', 'Montréal', 'Toronto', 'Ottawa', 'London', 'Bogotá',
       'Medellín', 'Copenhagen', 'Las Palmas de Gran Canaria',
       'Salamanca', 'Barcelona', 'Donostia / San Sebastián',
       'Gasteiz / Vitoria', 'Córdoba', 'Valladolid', 'Santander', 'Palma',
       'Málaga', 'Sevilla', 'Bilbao', 'Pamplona', 'Castelló de la Plana',
       'Huelva', 'Granada', 'Madrid', 'Valencia', 'Burgos', 'Murcia',
       'Santa Cruz de Tenerife', 'Oviedo', 'Utrecht', 'Nijmegen',
       'Haarlem', 'Eindhoven', 'Rotterdam', 'Amsterdam', 'Dordrecht',
       'Breda', 'Groningen', 'Maastricht', 'The Hague', 'Abu Dhabi',
       'Dubai', 'Baguio', 'Manila', 'Butuan', 'Zamboanga', 'Cochabamba',
       'Nicosia', 'Limassol', 'Ḩawallī

#### Extracting and Cleaning Data from Chennai

In [6]:
chennai = df18[df18['City'] == 'Chennai']
chennai.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
542590,2018-06-11,IN,Chennai,pressure,72,0.0,1005.0,0.0,2267540.0
542591,2018-05-19,IN,Chennai,pressure,69,0.0,1007.0,0.0,2325260.0
542592,2018-05-29,IN,Chennai,pressure,73,0.0,1003.0,0.0,2241080.0
542593,2018-05-24,IN,Chennai,pressure,71,0.0,1006.0,0.0,2291040.0
542594,2018-05-28,IN,Chennai,pressure,71,0.0,1016.6,0.0,2312180.0


In [7]:
chennai['Specie'].unique()

array(['pressure', 'co', 'o3', 'no2', 'humidity', 'wind-gust', 'so2',
       'pm25', 'wind-speed', 'temperature'], dtype=object)

In [8]:
#For Wind Speed
pm = chennai[chennai['Specie'] == 'co'].copy() #So we don't modify the original
pm.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
542637,2018-01-26,IN,Chennai,co,96,2.8,20.3,7.0,185.1
542638,2018-03-18,IN,Chennai,co,92,2.4,18.8,8.7,178.92
542639,2018-04-07,IN,Chennai,co,48,2.8,9.3,5.1,22.48
542640,2018-04-20,IN,Chennai,co,48,3.1,14.2,6.9,52.57
542641,2018-06-26,IN,Chennai,co,48,2.2,13.4,6.5,61.01


In [9]:
#Pivot table
pm = pd.pivot_table(data=pm, values=['min', 'max', 'median'], index='Date')
pm.head()

Unnamed: 0_level_0,max,median,min
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,21.7,9.3,2.9
2018-01-02,17.6,8.0,3.3
2018-01-03,18.8,10.0,2.4
2018-01-04,20.3,6.7,2.4
2018-01-05,19.6,6.7,2.4


In [10]:
pm.rename(columns={'min':'min_{}'.format('pm10'), 'max':'max_{}'.format('pm10'), 'median':'avg_{}'.format('pm10')}, inplace=True)
pm.head()

Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-01-01,21.7,9.3,2.9
2018-01-02,17.6,8.0,3.3
2018-01-03,18.8,10.0,2.4
2018-01-04,20.3,6.7,2.4
2018-01-05,19.6,6.7,2.4


After cleaning and transforming the data, we now know which steps the function has to contain. 

#### Writing a function to ease the cleaning process

In [11]:
def cleaning_function(df, city, parameters):
    #This function will make all the cleaning process
    #df is each DataFrame (df18, df19, df20)
    #City we will use
    #Parameters or environmental variables, we will take from each dataframe based
    #on it's available variables

    #Basic idea:
    #1. Extracts the city
    #2. From the list of parameters, extracts each parameter
    #3. From that parameter makes a pivot, and change columns names
    #4. Takes each parameter df and merges it with an empty df
    #5. Return that new DataFrame

    city_temp = df[df['City'] == city]
    city_temp_parameters = city_temp['Specie'].unique()
    final_df = pd.DataFrame()
    for parameter in parameters:
        if parameter not in city_temp_parameters:
            print("The parameter {} doesn't exist in the dataframe and will be ignored".format(parameter))
            continue
        else:
            temp_df = city_temp[city_temp['Specie'] == parameter].copy()
            temp_df_pivot = pd.pivot_table(data=temp_df, values=['min', 'max', 'median'], index='Date')
            temp_df_pivot.rename(columns={'min':'min_{}'.format(parameter), 'max':'max_{}'.format(parameter), 'median':'avg_{}'.format(parameter)}, inplace=True)
            final_df = pd.concat([final_df, temp_df_pivot], axis=1)
    return final_df

#### Extracting and Cleaning Data from Chennai

In [12]:
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
chennai_18 = cleaning_function(df18, 'Chennai', parameters)
chennai_18.head()

The parameter pm10 doesn't exist in the dataframe and will be ignored
The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,avg_o3,min_o3,max_so2,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,180.0,129.0,56.0,21.7,9.3,2.9,35.4,10.2,1.3,37.5,...,,,,,,,,,,
2018-01-02,834.0,145.0,13.0,17.6,8.0,3.3,33.4,4.5,1.3,37.5,...,,,,,,,,,,
2018-01-03,781.0,135.0,40.0,18.8,10.0,2.4,42.5,3.3,1.3,54.3,...,,,,,,,,,,
2018-01-04,834.0,157.0,111.0,20.3,6.7,2.4,57.4,13.0,1.3,51.5,...,,,,,,,,,,
2018-01-05,327.0,156.0,129.0,19.6,6.7,2.4,32.2,12.4,1.7,32.0,...,,,,,,,,,,


In [13]:
chennai_18.isnull().sum()

max_pm25             0
avg_pm25             0
min_pm25             0
max_co               0
avg_co               0
min_co               0
max_o3               0
avg_o3               0
min_o3               0
max_so2              0
avg_so2              0
min_so2              0
max_no2              0
avg_no2              0
min_no2              0
max_humidity       130
avg_humidity       130
min_humidity       130
max_pressure       135
avg_pressure       135
min_pressure       135
max_temperature    135
avg_temperature    135
min_temperature    135
max_wind-speed     130
avg_wind-speed     130
min_wind-speed     130
dtype: int64

In [14]:
chennai_18.drop(columns=['max_humidity', 'avg_humidity', 'min_humidity','max_pressure', 'avg_pressure', 'min_pressure', 'max_temperature', 'avg_temperature', 'min_temperature', 'max_wind-speed', 'avg_wind-speed', 'min_wind-speed'], inplace=True)

In [15]:
def missingvalue_function(city):
    city_noNan_foward = city.interpolate(method='linear',axis=1, limit_direction='forward')
    city_noNan_all = city_noNan_foward.interpolate(method='linear',axis=1,limit_direction='backward')
    #city_noNan_all.astype(int)
    return city_noNan_all

In [16]:
chennai_18 = missingvalue_function(chennai_18)
chennai_18.isnull().sum()

max_pm25    0
avg_pm25    0
min_pm25    0
max_co      0
avg_co      0
min_co      0
max_o3      0
avg_o3      0
min_o3      0
max_so2     0
avg_so2     0
min_so2     0
max_no2     0
avg_no2     0
min_no2     0
dtype: int64

#### Extracting and Cleaning Data from Mumbai

In [17]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
mumbai_18 = cleaning_function(df18, 'Mumbai', parameters)
mumbai_18.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,198.0,104.0,31.0,305.0,166.0,10.0,86.4,13.7,0.8,53.6,...,,,,,,,,,,
2018-01-02,168.0,104.0,42.0,255.0,175.0,100.0,38.4,13.8,0.6,64.7,...,,,,,,,,,,
2018-01-03,271.0,130.0,59.0,364.0,174.0,118.0,57.1,18.1,1.1,68.6,...,,,,,,,,,,
2018-01-04,317.0,122.0,38.0,362.0,180.0,106.0,58.4,22.5,0.2,47.0,...,,,,,,,,,,
2018-01-05,180.0,109.0,40.0,291.0,162.0,82.0,47.7,18.8,1.0,44.8,...,,,,,,,,,,


In [18]:
mumbai_18.isnull().sum()

max_pm10            11
avg_pm10            11
min_pm10            11
max_pm25             5
avg_pm25             5
min_pm25             5
max_co              11
avg_co              11
min_co              11
max_o3              72
avg_o3              72
min_o3              72
max_so2             11
avg_so2             11
min_so2             11
max_no2             72
avg_no2             72
min_no2             72
max_humidity       129
avg_humidity       129
min_humidity       129
max_pressure       129
avg_pressure       129
min_pressure       129
max_temperature    129
avg_temperature    129
min_temperature    129
max_wind-speed     135
avg_wind-speed     135
min_wind-speed     135
dtype: int64

In [19]:
mumbai_18.drop(columns=['max_humidity', 'avg_humidity', 'min_humidity','max_pressure', 'avg_pressure', 'min_pressure', 'max_temperature', 'avg_temperature', 'min_temperature', 'max_wind-speed', 'avg_wind-speed', 'min_wind-speed'], inplace=True)
mumbai_18 = missingvalue_function(mumbai_18)
mumbai_18.isnull().sum()

max_pm10    0
avg_pm10    0
min_pm10    0
max_pm25    0
avg_pm25    0
min_pm25    0
max_co      0
avg_co      0
min_co      0
max_o3      0
avg_o3      0
min_o3      0
max_so2     0
avg_so2     0
min_so2     0
max_no2     0
avg_no2     0
min_no2     0
dtype: int64

#### Extracting and Cleaning Data from Delhi

In [20]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
Delhi_18 = cleaning_function(df18, 'Delhi', parameters)
Delhi_18.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed,max_wd,avg_wd,min_wd
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,889.0,302.0,40.0,999.0,342.0,81.0,98.5,5.1,0.1,147.3,...,,,,,,,,,,
2018-01-02,999.0,318.0,21.0,588.0,327.0,33.0,118.9,4.0,0.1,149.0,...,,,,,,,,,,
2018-01-03,722.0,192.0,76.0,525.0,253.0,162.0,74.2,5.4,0.1,75.5,...,,,,,,,,,,
2018-01-04,859.0,226.0,20.0,999.0,269.0,53.0,149.7,7.6,0.1,37.2,...,,,,,,,,,,
2018-01-05,863.0,200.0,70.0,999.0,274.0,98.0,85.7,6.8,0.1,39.0,...,,,,,,,,,,


In [21]:
Delhi_18.isnull().sum()

max_pm10             0
avg_pm10             0
min_pm10             0
max_pm25             0
avg_pm25             0
min_pm25             0
max_co               0
avg_co               0
min_co               0
max_o3               0
avg_o3               0
min_o3               0
max_so2              0
avg_so2              0
min_so2              0
max_no2              0
avg_no2              0
min_no2              0
max_humidity       121
avg_humidity       121
min_humidity       121
max_pressure       121
avg_pressure       121
min_pressure       121
max_temperature    121
avg_temperature    121
min_temperature    121
max_wind-speed     120
avg_wind-speed     120
min_wind-speed     120
max_wd             133
avg_wd             133
min_wd             133
dtype: int64

In [22]:
Delhi_18.drop(columns=['max_humidity', 'avg_humidity', 'min_humidity','max_pressure', 'avg_pressure', 'min_pressure', 'max_temperature', 'avg_temperature', 'min_temperature', 'max_wind-speed', 'avg_wind-speed', 'min_wind-speed','max_wd','avg_wd','min_wd'], inplace=True)
Delhi_18 = missingvalue_function(Delhi_18)
Delhi_18.isnull().sum()

max_pm10    0
avg_pm10    0
min_pm10    0
max_pm25    0
avg_pm25    0
min_pm25    0
max_co      0
avg_co      0
min_co      0
max_o3      0
avg_o3      0
min_o3      0
max_so2     0
avg_so2     0
min_so2     0
max_no2     0
avg_no2     0
min_no2     0
dtype: int64

#### Extracting and Cleaning Data from Thiruvananthapuram

In [23]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
thiru_18 = cleaning_function(df18, 'Thiruvananthapuram', parameters)
thiru_18.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,80.0,65.0,55.0,164.0,154.0,112.0,14.9,12.0,9.4,43.3,...,,,,,,,,,,
2018-01-02,99.0,67.0,49.0,181.0,158.0,109.0,16.2,12.5,8.5,42.9,...,,,,,,,,,,
2018-01-03,74.0,65.0,50.0,162.0,153.0,97.0,16.3,12.9,10.2,40.7,...,,,,,,,,,,
2018-01-04,86.0,72.0,55.0,168.0,160.0,107.0,21.7,12.2,4.0,35.4,...,,,,,,,,,,
2018-01-05,124.0,82.0,61.0,181.0,163.0,152.0,20.7,11.6,5.8,32.9,...,,,,,,,,,,


In [24]:
thiru_18.isnull().sum()

max_pm10             0
avg_pm10             0
min_pm10             0
max_pm25             0
avg_pm25             0
min_pm25             0
max_co               0
avg_co               0
min_co               0
max_o3               0
avg_o3               0
min_o3               0
max_so2              0
avg_so2              0
min_so2              0
max_no2              0
avg_no2              0
min_no2              0
max_humidity       133
avg_humidity       133
min_humidity       133
max_pressure       133
avg_pressure       133
min_pressure       133
max_temperature    133
avg_temperature    133
min_temperature    133
max_wind-speed     133
avg_wind-speed     133
min_wind-speed     133
dtype: int64

In [25]:
thiru_18.drop(columns=['max_humidity', 'avg_humidity', 'min_humidity','max_pressure', 'avg_pressure', 'min_pressure', 'max_temperature', 'avg_temperature', 'min_temperature', 'max_wind-speed', 'avg_wind-speed', 'min_wind-speed'], inplace=True)


In [26]:
thiru_18 = missingvalue_function(thiru_18)
thiru_18.isnull().sum()

max_pm10    0
avg_pm10    0
min_pm10    0
max_pm25    0
avg_pm25    0
min_pm25    0
max_co      0
avg_co      0
min_co      0
max_o3      0
avg_o3      0
min_o3      0
max_so2     0
avg_so2     0
min_so2     0
max_no2     0
avg_no2     0
min_no2     0
dtype: int64

#### Extracting and Cleaning Data from Bangalore

In [27]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
bangalore_18 = cleaning_function(df18, 'Bengaluru', parameters)
bangalore_18.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,113.0,56.0,38.0,158.0,95.0,34.0,34.4,9.4,1.8,131.3,...,,,,,,,,,,
2018-01-02,101.0,57.0,36.0,162.0,95.0,70.0,23.9,9.6,0.2,102.7,...,,,,,,,,,,
2018-01-03,121.0,52.0,12.0,158.0,82.0,53.0,27.7,11.4,0.4,93.1,...,,,,,,,,,,
2018-01-04,128.0,57.0,32.0,157.0,90.0,50.0,22.1,10.0,0.2,89.5,...,,,,,,,,,,
2018-01-05,137.0,60.0,41.0,162.0,87.0,56.0,26.1,10.5,0.2,105.2,...,,,,,,,,,,


In [28]:
bangalore_18.isnull().sum()

max_pm10            72
avg_pm10            72
min_pm10            72
max_pm25             0
avg_pm25             0
min_pm25             0
max_co               0
avg_co               0
min_co               0
max_o3              79
avg_o3              79
min_o3              79
max_so2              0
avg_so2              0
min_so2              0
max_no2              0
avg_no2              0
min_no2              0
max_humidity       130
avg_humidity       130
min_humidity       130
max_pressure       130
avg_pressure       130
min_pressure       130
max_temperature    130
avg_temperature    130
min_temperature    130
max_wind-speed     130
avg_wind-speed     130
min_wind-speed     130
dtype: int64

In [29]:
bangalore_18.drop(columns=['max_humidity', 'avg_humidity', 'min_humidity','max_pressure', 'avg_pressure', 'min_pressure', 'max_temperature', 'avg_temperature', 'min_temperature', 'max_wind-speed', 'avg_wind-speed', 'min_wind-speed'], inplace=True)

In [30]:
bangalore_18 = missingvalue_function(thiru_18)
bangalore_18.isnull().sum()

max_pm10    0
avg_pm10    0
min_pm10    0
max_pm25    0
avg_pm25    0
min_pm25    0
max_co      0
avg_co      0
min_co      0
max_o3      0
avg_o3      0
min_o3      0
max_so2     0
avg_so2     0
min_so2     0
max_no2     0
avg_no2     0
min_no2     0
dtype: int64

### Data Cleaning 2019 DataFrame

In [32]:
quarters = ['2019a.csv', '2019b.csv', '2019c.csv', '2019d.csv']

df19 = pd.DataFrame()
for quarter in quarters:
  df = pd.read_csv('{}'.format(quarter),parse_dates=['Date'])
  df19 = pd.concat([df19, df])

df19.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,2019-01-16,AE,Abu Dhabi,pm10,24,86.0,99.0,97.0,179.4
1,2019-01-22,AE,Abu Dhabi,pm10,24,51.0,57.0,55.0,23.75
2,2019-01-26,AE,Abu Dhabi,pm10,24,136.0,173.0,160.0,941.96
3,2019-01-07,AE,Abu Dhabi,pm10,24,60.0,91.0,72.0,1006.88
4,2019-01-10,AE,Abu Dhabi,pm10,24,82.0,93.0,87.0,57.97


#### Extracting and Cleaning Data from Chennai (2019)

In [54]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
chennai_19 = cleaning_function(df19, 'Chennai', parameters)
chennai_19.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-12-31,,,,191.0,132.0,61.0,72.2,8.0,2.3,43.0,...,31.7,1020.0,1017.5,1016.0,28.1,25.1,22.0,3.3,1.1,0.1
2019-01-01,,,,834.0,135.0,51.0,28.2,7.2,2.3,46.5,...,27.1,1021.0,1018.0,1016.0,28.1,23.4,19.5,1.6,0.7,0.0
2019-01-02,,,,201.0,117.0,44.0,34.1,7.9,2.3,40.9,...,30.3,1020.0,1018.0,1016.0,29.1,24.0,20.0,1.2,0.6,0.0
2019-01-03,,,,263.0,161.0,72.0,24.5,9.4,2.2,46.6,...,31.7,1020.0,1018.0,1016.0,29.5,23.6,20.1,1.8,0.6,0.0
2019-01-04,,,,311.0,153.0,23.0,28.7,6.6,2.3,40.2,...,33.4,1020.0,1017.0,1016.0,29.6,24.0,21.0,1.7,0.7,0.0


In [55]:
chennai_19.isnull().sum()

max_pm10           320
avg_pm10           320
min_pm10           320
max_pm25             0
avg_pm25             0
min_pm25             0
max_co               0
avg_co               0
min_co               0
max_o3               3
avg_o3               3
min_o3               3
max_so2              0
avg_so2              0
min_so2              0
max_no2              0
avg_no2              0
min_no2              0
max_humidity         4
avg_humidity         4
min_humidity         4
max_pressure         4
avg_pressure         4
min_pressure         4
max_temperature      4
avg_temperature      4
min_temperature      4
max_wind-speed       4
avg_wind-speed       4
min_wind-speed       4
dtype: int64

In [56]:
chennai_19.drop(columns=['max_pm10', 'avg_pm10','min_pm10'], inplace=True)

In [61]:
chennai_19.isnull().sum()

max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64

In [60]:
chennai_19 = missingvalue_function(chennai_19)
chennai_19.isnull().sum()

max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64

#### Extracting and Cleaning Data from Mumbai (2019)

In [39]:
#Apply the function
#Check for missing data
#Fill missing data

parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
mumbai_19 = cleaning_function(df19, 'Mumbai', parameters)
mumbai_19.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-12-31,194.0,110.0,34.0,205.0,165.0,109.0,40.0,16.7,3.3,30.5,...,12.4,1020.0,1017.0,1015.0,32.9,24.6,14.1,4.8,0.6,0.0
2019-01-01,269.0,122.0,38.0,297.0,169.0,124.0,41.2,15.4,5.5,32.8,...,13.0,1019.5,1017.0,1015.0,33.4,24.1,13.7,5.1,0.5,0.0
2019-01-02,323.0,132.0,54.0,261.0,172.0,132.0,42.0,19.5,3.2,46.6,...,12.9,1019.0,1016.0,1014.0,34.5,23.7,14.1,4.6,0.4,0.0
2019-01-03,881.0,155.0,79.0,323.0,183.0,149.0,51.4,26.6,4.1,43.8,...,15.7,1020.0,1017.0,1015.0,34.2,24.1,15.0,4.1,0.5,0.0
2019-01-04,313.0,136.0,45.0,297.0,176.0,143.0,56.0,22.5,3.1,36.3,...,16.2,1018.0,1016.0,1014.0,36.0,24.9,15.5,4.8,0.5,0.0


In [41]:
mumbai_19.isnull().sum()

max_pm10           0
avg_pm10           0
min_pm10           0
max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       4
avg_humidity       4
min_humidity       4
max_pressure       4
avg_pressure       4
min_pressure       4
max_temperature    4
avg_temperature    4
min_temperature    4
max_wind-speed     4
avg_wind-speed     4
min_wind-speed     4
dtype: int64

In [53]:
mumbai_19 = missingvalue_function(mumbai_19)
mumbai_19.isnull().sum()

max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64

#### Extracting and Cleaning Data from Delhi (2019)

In [45]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
delhi_19 = cleaning_function(df19, 'Delhi', parameters)
delhi_19.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_precipitation,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed,max_wd,avg_wd,min_wd
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-12-31,999.0,380.0,71.0,923.0,363.0,151.0,105.3,21.5,0.1,61.9,...,-2390.1,49.9,12.3,0.9,14.8,0.7,0.0,357.2,225.3,15.2
2019-01-01,999.0,354.0,91.0,898.0,341.0,95.0,116.8,17.6,0.1,77.8,...,-2072.1,48.1,13.0,1.0,6.9,0.6,0.0,351.5,211.4,0.0
2019-01-02,999.0,405.0,128.0,999.0,383.0,159.0,138.9,19.9,0.1,100.1,...,-2432.7,33.1,13.8,3.7,8.0,0.6,0.0,344.2,166.5,15.9
2019-01-03,999.0,421.0,111.0,999.0,410.0,70.0,133.8,18.5,0.1,69.0,...,-2995.8,33.0,13.0,4.3,7.7,1.0,0.0,350.0,232.0,9.4
2019-01-04,941.0,220.0,114.0,720.0,289.0,77.0,91.0,15.0,0.2,86.9,...,-2333.2,33.2,12.3,4.2,6.5,0.8,0.0,348.0,206.1,12.8


In [46]:
delhi_19.isnull().sum()

max_pm10               0
avg_pm10               0
min_pm10               0
max_pm25               0
avg_pm25               0
min_pm25               0
max_co                 0
avg_co                 0
min_co                 0
max_o3                 0
avg_o3                 0
min_o3                 0
max_so2                0
avg_so2                0
min_so2                0
max_no2                0
avg_no2                0
min_no2                0
max_humidity           0
avg_humidity           0
min_humidity           0
max_pressure           0
avg_pressure           0
min_pressure           0
max_precipitation    286
avg_precipitation    286
min_precipitation    286
max_temperature        0
avg_temperature        0
min_temperature        0
max_wind-speed         0
avg_wind-speed         0
min_wind-speed         0
max_wd                 4
avg_wd                 4
min_wd                 4
dtype: int64

In [47]:
delhi_19.drop(columns=['max_precipitation', 'avg_precipitation','min_precipitation'], inplace=True)

In [49]:
delhi_19 = missingvalue_function(delhi_19)
delhi_19.isnull().sum()

max_pm10           0
avg_pm10           0
min_pm10           0
max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
max_wd             0
avg_wd             0
min_wd             0
dtype: int64

#### Extracting and Cleaning Data from Thiruvananthapuram (2019)

In [50]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
thiru_19 = cleaning_function(df19, 'Thiruvananthapuram', parameters)
thiru_19.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-12-31,120.0,64.0,54.0,187.0,145.0,105.0,75.6,31.6,25.8,47.5,...,53.2,1015.0,1013.0,1011.0,31.4,30.4,30.4,2.9,0.9,0.3
2019-01-01,125.0,62.0,53.0,197.0,130.0,100.0,85.0,65.7,1.0,55.2,...,51.2,1016.0,1014.0,1011.0,31.3,30.4,30.3,3.3,1.0,0.4
2019-01-02,124.0,62.0,55.0,185.0,151.0,102.0,7.8,2.5,2.4,57.6,...,51.5,1015.0,1014.0,1012.0,31.3,30.5,30.2,3.0,0.7,0.3
2019-01-03,115.0,59.0,51.0,188.0,120.0,91.0,6.6,1.5,0.6,58.4,...,45.7,1016.0,1014.0,1011.5,31.1,30.5,30.4,3.0,0.7,0.3
2019-01-04,105.0,56.0,45.0,176.0,101.0,72.0,6.3,2.1,1.9,56.1,...,41.5,1015.0,1013.0,1011.0,31.0,30.5,29.6,3.1,0.9,0.4


In [51]:
thiru_19.isnull().sum()

max_pm10            7
avg_pm10            7
min_pm10            7
max_pm25           10
avg_pm25           10
min_pm25           10
max_co             10
avg_co             10
min_co             10
max_o3              2
avg_o3              2
min_o3              2
max_so2             2
avg_so2             2
min_so2             2
max_no2             2
avg_no2             2
min_no2             2
max_humidity       10
avg_humidity       10
min_humidity       10
max_pressure       12
avg_pressure       12
min_pressure       12
max_temperature    10
avg_temperature    10
min_temperature    10
max_wind-speed     10
avg_wind-speed     10
min_wind-speed     10
dtype: int64

In [52]:
thiru_19 = missingvalue_function(thiru_19)
thiru_19.isnull().sum()

max_pm10           0
avg_pm10           0
min_pm10           0
max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64

#### Extracting and Cleaning Data from Bangalore (2019)

In [62]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
bangalore_19 = cleaning_function(df19, 'Bengaluru', parameters)
bangalore_19.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_pressure,max_precipitation,avg_precipitation,min_precipitation,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-12-31,167.0,72.0,40.0,834.0,148.0,2.0,39.5,6.7,1.0,94.4,...,913.0,,,,30.3,23.8,15.0,3.6,1.6,0.1
2019-01-01,133.0,73.0,40.0,251.0,153.0,41.0,36.0,7.7,1.2,70.5,...,913.8,,,,30.6,24.1,12.5,38.0,1.2,0.1
2019-01-02,131.0,65.0,30.0,186.0,112.0,3.0,35.9,7.0,0.9,59.3,...,913.5,,,,35.2,24.2,11.0,6.9,1.5,0.2
2019-01-03,111.0,67.0,32.0,178.0,129.0,38.0,33.2,6.7,0.2,60.6,...,914.8,,,,31.1,24.1,9.5,7.4,1.5,0.2
2019-01-04,135.0,70.0,40.0,188.0,123.0,35.0,36.0,7.8,0.1,47.2,...,913.5,,,,37.7,24.3,8.5,6.4,1.2,0.0


In [63]:
bangalore_19.isnull().sum()

max_pm10               0
avg_pm10               0
min_pm10               0
max_pm25               0
avg_pm25               0
min_pm25               0
max_co                 0
avg_co                 0
min_co                 0
max_o3                 0
avg_o3                 0
min_o3                 0
max_so2                0
avg_so2                0
min_so2                0
max_no2                0
avg_no2                0
min_no2                0
max_humidity           4
avg_humidity           4
min_humidity           4
max_pressure           4
avg_pressure           4
min_pressure           4
max_precipitation    369
avg_precipitation    369
min_precipitation    369
max_temperature        4
avg_temperature        4
min_temperature        4
max_wind-speed         4
avg_wind-speed         4
min_wind-speed         4
dtype: int64

In [64]:
bangalore_19.drop(columns=['max_precipitation', 'avg_precipitation','min_precipitation'], inplace=True)

In [66]:
bangalore_19 = missingvalue_function(bangalore_19)
bangalore_19.isnull().sum()

max_pm10           0
avg_pm10           0
min_pm10           0
max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64

### Data Cleaning 2020 DataFrame

In [67]:
df20 = pd.read_csv('2020.csv',parse_dates=['Date'])
df20.head()

Unnamed: 0,Date,Country,City,Specie,count,min,max,median,variance
0,2020-04-20,GR,Thessaloníki,pm25,47,21.0,68.0,53.0,1084.49
1,2020-07-24,GR,Thessaloníki,pm25,30,25.0,68.0,38.0,1246.54
2,2020-09-24,GR,Thessaloníki,pm25,47,13.0,76.0,38.0,3197.67
3,2020-01-30,GR,Thessaloníki,pm25,48,13.0,65.0,25.0,2094.45
4,2020-03-05,GR,Thessaloníki,pm25,44,5.0,61.0,25.0,2441.03


In [68]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
chennai_20 = cleaning_function(df20, 'Chennai', parameters)
chennai_20.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-30,60.0,42.0,25.0,173.0,111.0,21.0,10.9,4.0,2.6,30.7,...,,,,,,,,,,
2019-12-31,69.0,51.0,32.0,167.0,107.0,6.0,15.7,3.9,2.5,26.0,...,,,,,,,,,,
2020-01-01,63.0,33.0,16.0,157.0,68.0,7.0,9.7,3.9,2.1,17.9,...,,,,,,,,,,
2020-01-02,38.0,19.0,6.0,180.0,63.0,4.0,14.0,4.0,2.7,15.9,...,,,,,,,,,,
2020-01-03,53.0,28.0,18.0,167.0,72.0,14.0,86.3,3.9,2.6,23.0,...,61.7,1015.0,1008.8,760.6,31.0,28.7,26.3,7.2,1.6,0.0


#### Extracting and Cleaning Data from Chennai (2020)

In [69]:
chennai_20.isnull().sum()

max_pm10           92
avg_pm10           92
min_pm10           92
max_pm25            0
avg_pm25            0
min_pm25            0
max_co              0
avg_co              0
min_co              0
max_o3              1
avg_o3              1
min_o3              1
max_so2             0
avg_so2             0
min_so2             0
max_no2             0
avg_no2             0
min_no2             0
max_humidity       19
avg_humidity       19
min_humidity       19
max_pressure       19
avg_pressure       19
min_pressure       19
max_temperature    19
avg_temperature    19
min_temperature    19
max_wind-speed     88
avg_wind-speed     88
min_wind-speed     88
dtype: int64

In [70]:
chennai_20 = missingvalue_function(chennai_20)
chennai_20.isnull().sum()

max_pm10           0
avg_pm10           0
min_pm10           0
max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64

#### Extracting and Cleaning Data from Mumbai (2020)

In [71]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
mumbai_20 = cleaning_function(df20, 'Mumbai', parameters)
mumbai_20.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-30,189.0,97.0,32.0,191.0,168.0,57.0,22.7,9.2,3.7,71.3,...,,,,,,,,,,
2019-12-31,183.0,84.0,19.0,206.0,157.0,37.0,22.2,7.6,2.4,83.8,...,,,,,,,,,,
2020-01-01,286.0,111.0,51.0,824.0,179.0,66.0,29.0,11.2,2.5,118.2,...,,,,,,,,,,
2020-01-02,195.0,104.0,37.0,271.0,180.0,66.0,23.6,8.9,1.0,86.1,...,,,,,,,,,,
2020-01-03,827.0,95.0,2.0,800.0,170.0,88.0,23.5,8.1,2.6,130.3,...,38.6,1016.5,1012.8,769.3,33.2,25.0,21.0,71.7,1.5,0.0


In [72]:
mumbai_20.isnull().sum()

max_pm10            0
avg_pm10            0
min_pm10            0
max_pm25            0
avg_pm25            0
min_pm25            0
max_co              0
avg_co              0
min_co              0
max_o3              0
avg_o3              0
min_o3              0
max_so2             0
avg_so2             0
min_so2             0
max_no2             0
avg_no2             0
min_no2             0
max_humidity       18
avg_humidity       18
min_humidity       18
max_pressure       19
avg_pressure       19
min_pressure       19
max_temperature    19
avg_temperature    19
min_temperature    19
max_wind-speed     88
avg_wind-speed     88
min_wind-speed     88
dtype: int64

In [73]:
mumbai_20 = missingvalue_function(mumbai_20)
mumbai_20.isnull().sum()

max_pm10           0
avg_pm10           0
min_pm10           0
max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64

#### Extracting and Cleaning Data from Delhi (2020)

In [76]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
delhi_20 = cleaning_function(df20, 'Delhi', parameters)
delhi_20.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_precipitation,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed,max_wd,avg_wd,min_wd
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-30,825.0,353.0,25.0,678.0,356.0,43.0,61.6,19.1,0.1,77.9,...,,6.3,6.2,5.5,2.2,2.1,1.5,,,
2019-12-31,999.0,164.0,14.0,999.0,248.0,20.0,158.1,12.0,0.1,39.9,...,,12.1,7.2,5.0,2.2,1.7,0.4,,,
2020-01-01,999.0,342.0,20.0,999.0,392.0,102.0,148.2,22.6,0.1,153.1,...,,16.8,10.6,4.2,2.6,0.8,0.5,,,
2020-01-02,999.0,305.0,1.0,920.0,349.0,101.0,134.1,24.3,0.1,137.7,...,,22.3,15.5,7.6,1.6,0.7,0.5,,,
2020-01-03,999.0,187.0,3.0,729.0,255.0,65.0,85.5,20.3,0.1,199.2,...,,34.8,18.3,7.4,9.9,0.9,0.1,336.4,159.8,25.8


In [77]:
delhi_20.isnull().sum()

max_pm10               0
avg_pm10               0
min_pm10               0
max_pm25               0
avg_pm25               0
min_pm25               0
max_co                 0
avg_co                 0
min_co                 0
max_o3                 0
avg_o3                 0
min_o3                 0
max_so2                0
avg_so2                0
min_so2                0
max_no2                0
avg_no2                0
min_no2                0
max_humidity          14
avg_humidity          14
min_humidity          14
max_pressure          14
avg_pressure          14
min_pressure          14
max_precipitation    211
avg_precipitation    211
min_precipitation    211
max_temperature       14
avg_temperature       14
min_temperature       14
max_wind-speed        84
avg_wind-speed        84
min_wind-speed        84
max_wd                18
avg_wd                18
min_wd                18
dtype: int64

In [78]:
delhi_20 = missingvalue_function(delhi_20)
delhi_20.isnull().sum()

max_pm10             0
avg_pm10             0
min_pm10             0
max_pm25             0
avg_pm25             0
min_pm25             0
max_co               0
avg_co               0
min_co               0
max_o3               0
avg_o3               0
min_o3               0
max_so2              0
avg_so2              0
min_so2              0
max_no2              0
avg_no2              0
min_no2              0
max_humidity         0
avg_humidity         0
min_humidity         0
max_pressure         0
avg_pressure         0
min_pressure         0
max_precipitation    0
avg_precipitation    0
min_precipitation    0
max_temperature      0
avg_temperature      0
min_temperature      0
max_wind-speed       0
avg_wind-speed       0
min_wind-speed       0
max_wd               0
avg_wd               0
min_wd               0
dtype: int64

#### Extracting and Cleaning Data from Thiruvananthapuram (2020)

In [79]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
thiru_20 = cleaning_function(df20, 'Thiruvananthapuram', parameters)
thiru_20.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-30,85.0,57.0,40.0,170.0,115.0,13.0,24.2,16.9,7.1,29.2,...,,,,,,,,,,
2019-12-31,70.0,55.0,38.0,167.0,113.0,56.0,21.6,12.7,8.6,30.8,...,,,,,,,,,,
2020-01-01,68.0,55.0,43.0,156.0,100.0,35.0,17.1,12.2,8.0,28.9,...,,,,,,,,,,
2020-01-02,78.0,53.0,33.0,159.0,99.0,36.0,13.4,9.9,2.9,22.8,...,,,,,,,,,,
2020-01-03,75.0,53.0,31.0,159.0,104.0,25.0,13.5,6.3,1.5,27.1,...,62.7,673.3,641.6,628.7,31.3,28.9,27.8,3.4,2.1,0.5


In [81]:
thiru_20.isnull().sum()

max_pm10            2
avg_pm10            2
min_pm10            2
max_pm25            1
avg_pm25            1
min_pm25            1
max_co              0
avg_co              0
min_co              0
max_o3              0
avg_o3              0
min_o3              0
max_so2             0
avg_so2             0
min_so2             0
max_no2             0
avg_no2             0
min_no2             0
max_humidity       19
avg_humidity       19
min_humidity       19
max_pressure       19
avg_pressure       19
min_pressure       19
max_temperature    19
avg_temperature    19
min_temperature    19
max_wind-speed     88
avg_wind-speed     88
min_wind-speed     88
dtype: int64

In [82]:
thiru_20 = missingvalue_function(thiru_20)
thiru_20.isnull().sum()

max_pm10           0
avg_pm10           0
min_pm10           0
max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64

#### Extracting and Cleaning Data from Bangalore (2020)

In [83]:
#Apply the function
#Check for missing data
#Fill missing data
parameters = ['pm10', 'pm25', 'pm1', 'co', 'o3', 'so2', 'no2', 'humidity', 'pressure', 'precipitation',
'temperature', 'wind-speed', 'wd', 'uvi']
bangalore_20 = cleaning_function(df20, 'Bengaluru', parameters)
bangalore_20.head()

The parameter pm1 doesn't exist in the dataframe and will be ignored
The parameter precipitation doesn't exist in the dataframe and will be ignored
The parameter wd doesn't exist in the dataframe and will be ignored
The parameter uvi doesn't exist in the dataframe and will be ignored


Unnamed: 0_level_0,max_pm10,avg_pm10,min_pm10,max_pm25,avg_pm25,min_pm25,max_co,avg_co,min_co,max_o3,...,min_humidity,max_pressure,avg_pressure,min_pressure,max_temperature,avg_temperature,min_temperature,max_wind-speed,avg_wind-speed,min_wind-speed
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-30,93.0,63.0,24.0,538.0,109.0,36.0,28.6,8.6,1.6,47.4,...,,,,,,,,,,
2019-12-31,97.0,64.0,27.0,158.0,120.0,48.0,29.7,8.9,0.4,70.9,...,,,,,,,,,,
2020-01-01,94.0,57.0,30.0,157.0,89.0,30.0,40.2,9.8,1.2,32.1,...,,,,,,,,,,
2020-01-02,127.0,52.0,6.0,190.0,77.0,8.0,60.3,9.3,0.2,48.5,...,,,,,,,,,,
2020-01-03,103.0,56.0,27.0,169.0,80.0,23.0,86.4,8.0,0.7,48.0,...,13.7,1019.5,725.2,713.0,37.3,26.5,17.4,11.2,1.0,0.3


In [84]:
bangalore_20.isnull().sum()

max_pm10            0
avg_pm10            0
min_pm10            0
max_pm25            0
avg_pm25            0
min_pm25            0
max_co              0
avg_co              0
min_co              0
max_o3              0
avg_o3              0
min_o3              0
max_so2             0
avg_so2             0
min_so2             0
max_no2             0
avg_no2             0
min_no2             0
max_humidity       18
avg_humidity       18
min_humidity       18
max_pressure       18
avg_pressure       18
min_pressure       18
max_temperature    18
avg_temperature    18
min_temperature    18
max_wind-speed     88
avg_wind-speed     88
min_wind-speed     88
dtype: int64

In [85]:
bangalore_20 = missingvalue_function(bangalore_20)
bangalore_20.isnull().sum()

max_pm10           0
avg_pm10           0
min_pm10           0
max_pm25           0
avg_pm25           0
min_pm25           0
max_co             0
avg_co             0
min_co             0
max_o3             0
avg_o3             0
min_o3             0
max_so2            0
avg_so2            0
min_so2            0
max_no2            0
avg_no2            0
min_no2            0
max_humidity       0
avg_humidity       0
min_humidity       0
max_pressure       0
avg_pressure       0
min_pressure       0
max_temperature    0
avg_temperature    0
min_temperature    0
max_wind-speed     0
avg_wind-speed     0
min_wind-speed     0
dtype: int64