#### Create dataframes from API data

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [3]:
import os

In [4]:
# read a file to investigate its format
# (one city's raw API export; the relative path assumes the kernel's working
# directory is the folder that contains "air_quality_macrox")
df = pd.read_csv('air_quality_macrox/api_data_IN/api_data_IN_Agartala_raw.csv')
df

Unnamed: 0.1,Unnamed: 0,City,Lat,Lng,Date,CO,NO2,SO2,PM2_5
0,0,Agartala,23.834,91.288,2020-11-24,817.300000,10.467143,4.810000,76.442857
1,1,Agartala,23.834,91.288,2020-11-25,1083.970833,23.741250,3.515417,90.695417
2,2,Agartala,23.834,91.288,2020-11-26,1151.562500,24.502500,3.657500,96.861250
3,3,Agartala,23.834,91.288,2020-11-27,903.448750,23.724167,2.985000,75.847917
4,4,Agartala,23.834,91.288,2020-11-28,981.610417,20.355833,2.198750,102.716667
...,...,...,...,...,...,...,...,...,...
603,603,Agartala,23.834,91.288,2022-07-21,309.308333,3.673333,2.498333,9.945000
604,604,Agartala,23.834,91.288,2022-07-22,316.262917,5.353333,2.021250,5.055000
605,605,Agartala,23.834,91.288,2022-07-23,297.904167,3.725833,1.447917,5.599167
606,606,Agartala,23.834,91.288,2022-07-24,296.513750,4.888333,1.558333,4.281250


In [5]:
# create the date series to be added to the series list for each gas
# rename() without inplace returns a new, renamed Series: this leaves the
# column cached on `df` untouched and does not rely on the return value of an
# in-place call (documented as None)
series_date = df['Date'].rename('date')
series_date

0      2020-11-24
1      2020-11-25
2      2020-11-26
3      2020-11-27
4      2020-11-28
          ...    
603    2022-07-21
604    2022-07-22
605    2022-07-23
606    2022-07-24
607    2022-07-25
Name: date, Length: 608, dtype: object

In [6]:
# define path to Indian api data directories
# assumption: initial current working directory has "air_quality_macrox" in it
path = "air_quality_macrox/api_data_IN"
# make "air_quality_macrox/api_data_IN" the working directory so the per-city
# csv files can be listed and read by bare file name
# NOTE(review): this changes process-wide state; re-running this cell before
# changing back is expected to fail because the relative path no longer resolves
os.chdir(path)

In [7]:
# create a city dictionary with cities as keys and index position in df as values
cities = {}

# create time series for each gas for each city
# initialize list of series for NO2, SO2, PM2.5 and CO
no2_series = []
so2_series = []
pm_series  = []
co_series =  []

# os.listdir() returns entries in arbitrary order and may contain non-data
# entries (e.g. notebook checkpoints), so keep only the csv files and sort them
# to make the column order of the resulting dataframes reproducible
csv_files = sorted(name for name in os.listdir(os.getcwd())
                   if name.lower().endswith('.csv'))

# NOTE(review): the series are later concatenated by row position, which
# assumes every city file holds the same dates in the same row order -- TODO confirm
for i, file in enumerate(csv_files):
    # load csv
    df = pd.read_csv(file)
    # extract the city name from the first row (positional access, so it does
    # not depend on index labels or on the file having more than one row)
    city = df['City'].iloc[0]
    assert isinstance(city, str), f'city, {city}, name not extracted as string'
    # create dictionary entry for the city/index pair
    # (presumably i+1 because position 0 is later taken by the date column)
    cities[city] = i+1
    # add the time series for each of the four gases to their corresponding series list
    # also rename the series to include the city name; rename() without inplace
    # returns the renamed copy that is appended
    no2_series.append(df['NO2'].rename(f'NO2_{city}'))
    so2_series.append(df['SO2'].rename(f'SO2_{city}'))
    pm_series.append(df['PM2_5'].rename(f'PM2_5_{city}'))
    co_series.append(df['CO'].rename(f'CO_{city}'))

In [8]:
# add date column to beginning of each series list
# guarded so that re-running this cell does not insert a second date series
# (which would produce duplicate 'date' columns in the dataframes below)
for series_list in (no2_series, so2_series, pm_series, co_series):
    if not series_list or series_list[0].name != series_date.name:
        series_list.insert(0, series_date)
# create dataframes, concatenating all the individual pandas series
# (axis=1 aligns the series on their row index, one column per series)
df_no2_api = pd.concat(no2_series, axis=1)
df_so2_api = pd.concat(so2_series, axis=1)
df_pm_api = pd.concat(pm_series, axis=1)
df_co_api = pd.concat(co_series, axis=1)

In [9]:
# parse the 'date' column of every per-gas api dataframe into datetime values
for api_frame in (df_no2_api, df_so2_api, df_pm_api, df_co_api):
    api_frame['date'] = pd.to_datetime(api_frame['date'])

In [10]:
# step up two directory levels to return to the original working directory
# (os.pardir is the parent-directory string '..')
os.chdir(os.pardir)
os.chdir(os.pardir)

In [11]:
# write one csv per gas from the api data (utf-8, row index not written)
api_exports = {
    'no2_api.csv': df_no2_api,
    'so2_api.csv': df_so2_api,
    'pm_api.csv': df_pm_api,
    'co_api.csv': df_co_api,
}
for csv_name, api_frame in api_exports.items():
    api_frame.to_csv(csv_name, encoding='utf-8', index=False)

In [12]:
# training response: rows of each api dataframe dated on or before 2021-12-31
df_no2_api_train, df_so2_api_train, df_pm_api_train, df_co_api_train = (
    api_frame[api_frame['date'] <= '2021-12-31']
    for api_frame in (df_no2_api, df_so2_api, df_pm_api, df_co_api)
)

In [13]:
# validation response: rows of each api dataframe dated on or after 2022-1-1
df_no2_api_validate, df_so2_api_validate, df_pm_api_validate, df_co_api_validate = (
    api_frame[api_frame['date'] >= '2022-1-1']
    for api_frame in (df_no2_api, df_so2_api, df_pm_api, df_co_api)
)

In [14]:
# load the csv's from R after missing data imputation by matrix completion
# dft / dfv / dfg are used below as the training, validation and generation
# (backfill) predictor tables respectively
dft = pd.read_csv('training_no_na.csv')
dfv = pd.read_csv('validation_no_na.csv')
dfg = pd.read_csv('generate_api_data_no_na.csv')
# show the last rows of the validation predictors
dfv.tail()

Unnamed: 0.1,Unnamed: 0,Agra_NO2_1,Amritsar_NO2_1,Aurangabad_NO2_1,Bengaluru_NO2_6,Bengaluru_NO2_7,Bengaluru_NO2_8,Bengaluru_NO2_9,Bengaluru_NO2_10,Chennai_NO2_1,...,Pondicherry_y,Prayagraj_y,Pune_y,Shillong_y,Solapur_y,Srinagar_y,Thiruvananthapuram_y,Varanasi_y,Vijaywada_y,Visakhapatnam_y
228,2022-08-17,2.35,38.59,11.69,19.63,36.318328,27.62336,16.46,3.36,25.91,...,0.003421,0.01117,0.06856,-0.005063,0.046373,-0.010343,0.020617,0.053016,0.038123,0.098989
229,2022-08-18,0.36,46.98,11.81,19.48,37.044091,10.42,16.48,1.86,29.44,...,0.001656,0.012993,0.073714,-0.009184,0.057647,-0.00371,0.012996,0.03335,0.05475,0.094852
230,2022-08-19,0.39,47.57,12.912578,19.43,37.210666,10.37,16.41,4.27,32.31,...,0.001497,0.009244,0.076975,-0.009243,0.060112,-0.00557,0.016082,0.029499,0.050409,0.101975
231,2022-08-20,0.95,46.25,12.996354,18.64,36.93259,10.44,16.41,2.46,32.41,...,0.011811,0.02385,0.071612,-0.010488,0.071548,0.002415,0.036514,0.0553,0.040314,0.101298
232,2022-08-21,1.79,42.54,13.035028,17.93,35.834462,9.57,16.44,7.809911,32.05,...,0.012361,0.012177,0.06478,-0.010466,0.06366,0.005812,0.030866,0.040396,0.053978,0.105919


In [15]:
# rename the first column ('Unnamed: 0') to 'date' and parse it as datetime,
# for the training, validation and generation dataframes alike
dft, dfv, dfg = (
    frame.rename(columns={'Unnamed: 0': 'date'})
         .assign(date=lambda renamed: pd.to_datetime(renamed['date']))
    for frame in (dft, dfv, dfg)
)

In [16]:
# The API data stops on July 25, 2022, so keep only the validation rows dated
# on or before that day
on_or_before_api_end = dfv['date'] <= '2022-7-25'
dfv = dfv[on_or_before_api_end]
dfv.tail()

Unnamed: 0,date,Agra_NO2_1,Amritsar_NO2_1,Aurangabad_NO2_1,Bengaluru_NO2_6,Bengaluru_NO2_7,Bengaluru_NO2_8,Bengaluru_NO2_9,Bengaluru_NO2_10,Chennai_NO2_1,...,Pondicherry_y,Prayagraj_y,Pune_y,Shillong_y,Solapur_y,Srinagar_y,Thiruvananthapuram_y,Varanasi_y,Vijaywada_y,Visakhapatnam_y
201,2022-07-21,7.77,65.09,12.38,41.51,35.046474,36.73,17.0,9.23,25.82,...,0.034809,-3.7e-05,0.042129,-0.050574,0.057024,0.003666,-0.053389,0.012153,0.32,0.101424
202,2022-07-22,6.1,60.03,12.29,41.39,35.29379,44.83,16.98,2.98,27.35,...,0.04862,0.002218,0.042865,-0.025603,0.055913,0.001504,0.029816,0.05815,0.074477,0.08764
203,2022-07-23,3.62,54.08,12.67,41.36,35.33037,37.87,16.97,2.71,29.67,...,0.033533,0.129945,0.059933,0.02,0.048035,0.000113,-0.035568,0.05,0.078757,-0.130932
204,2022-07-24,3.71,45.6,13.02,41.6,35.010659,34.49,17.0,2.99,29.77,...,0.018444,0.028949,0.062733,-0.022027,0.038448,-0.004171,0.019551,0.12,0.062648,0.076929
205,2022-07-25,4.86,50.02,12.95,41.4,35.340249,35.67,16.94,2.6,34.61,...,0.206667,0.010393,0.05981,-0.020632,0.046641,-0.005901,0.013318,0.061039,-0.22,0.081795


In [17]:
def pull_out_city_name_before_(city):
    """Return the part of a column label that precedes its first underscore.

    Parameters
    ----------
    city : str
        Column label such as 'Agra_NO2_1'.

    Returns
    -------
    str
        The text before the first '_' ('Agra' for the example above). A label
        without any underscore is returned unchanged; slicing with the -1 that
        str.find() reports in that case would silently drop its last character.
    """
    return city.partition('_')[0]

In [18]:
def pull_out_city_name_after_(city):
    """Return the part of a column label that follows its first underscore.

    For 'NO2_Agartala' this is 'Agartala'. Only the first underscore is used as
    the separator, and a label without an underscore is returned unchanged.
    """
    return city.split('_', 1)[-1]

In [19]:
# Collect the distinct city names used by the training predictors (text before
# the first underscore) and by the api response (text after the first
# underscore); the first column of each dataframe is skipped
city_list_dft = {pull_out_city_name_before_(label) for label in dft.columns[1:]}
city_list_api = {pull_out_city_name_after_(label) for label in df_no2_api.columns[1:]}

In [20]:
# list the city names that occur in only one of the two sets, so the spelling
# differences can be reconciled below
city_list_api ^ city_list_dft

{'Mangalore', 'Mangaluru', 'NewDelhi', 'Pondicherry', 'Puducherry'}

In [21]:
# training columns whose label contains 'Mangaluru'
[label for label in dft.columns if 'Mangaluru' in label]

['Mangaluru_x', 'Mangaluru_y']

In [22]:
# switch the Mangaluru_x / Mangaluru_y labels to the 'Mangalore' spelling used
# by the api data, in all three predictor dataframes
mangalore_labels = {'Mangaluru_x': 'Mangalore_x', 'Mangaluru_y': 'Mangalore_y'}
dft, dfv, dfg = (frame.rename(columns=mangalore_labels) for frame in (dft, dfv, dfg))

In [23]:
# training columns whose label contains 'NewDelhi'
[label for label in dft.columns if 'NewDelhi' in label]

['NewDelhi_x', 'NewDelhi_y']

In [24]:
# switch the NewDelhi_x / NewDelhi_y labels to the 'Delhi' spelling used by the
# api data, in all three predictor dataframes
delhi_labels = {'NewDelhi_x': 'Delhi_x', 'NewDelhi_y': 'Delhi_y'}
dft, dfv, dfg = (frame.rename(columns=delhi_labels) for frame in (dft, dfv, dfg))

In [25]:
# training columns whose label contains 'Pondicherry'
[label for label in dft.columns if 'Pondicherry' in label]

['Pondicherry_x', 'Pondicherry_y']

In [26]:
# switch the Pondicherry_x / Pondicherry_y labels to the 'Puducherry' spelling
# used by the api data, in all three predictor dataframes
puducherry_labels = {'Pondicherry_x': 'Puducherry_x', 'Pondicherry_y': 'Puducherry_y'}
dft, dfv, dfg = (frame.rename(columns=puducherry_labels) for frame in (dft, dfv, dfg))

In [27]:
# training columns whose label contains 'Vijayawada'
[label for label in dft.columns if 'Vijayawada' in label]

[]

In [28]:
# map any 'Vijayawada_*' measurement labels to the 'Vijaywada' spelling
# rename() leaves labels that are not present untouched, so this does nothing
# for dataframes that contain no 'Vijayawada' columns
vijaywada_labels = {
    'Vijayawada_NO2_1': 'Vijaywada_NO2_1',
    'Vijayawada_SO2_1': 'Vijaywada_SO2_1',
    'Vijayawada_PM2.5_1': 'Vijaywada_PM2.5_1',
}
dft, dfv, dfg = (frame.rename(columns=vijaywada_labels) for frame in (dft, dfv, dfg))

In [29]:
# check that the city name discrepancies have been resolved
# (an empty set means both sources now use the same names)
new_city_list_dft = {pull_out_city_name_before_(label) for label in dft.columns[1:]}
city_list_api ^ new_city_list_dft

set()

In [30]:
# (rows, columns) of the validation predictors, to compare with the api response
dfv.shape

(206, 264)

In [31]:
# (rows, columns) of the NO2 api validation response
df_no2_api_validate.shape

(205, 55)

In [32]:
# determine the one row discrepancy between the api response and the
# validation predictors: print, for each gas, the dates present in only one of them
dfv_dates = set(dfv['date'])
for api_validate in (df_no2_api_validate, df_so2_api_validate,
                     df_pm_api_validate, df_co_api_validate):
    print(dfv_dates.symmetric_difference(set(api_validate['date'])))

{Timestamp('2022-07-20 00:00:00')}
{Timestamp('2022-07-20 00:00:00')}
{Timestamp('2022-07-20 00:00:00')}
{Timestamp('2022-07-20 00:00:00')}


In [33]:
# remove july 20 from the validation predictors so their dates match the api
# validation response, then confirm that no date differs any more
not_july_20 = dfv['date'] != '2022-07-20'
dfv = dfv[not_july_20]
print(dfv.shape)
print(set(dfv['date']) ^ set(df_no2_api_validate['date']))

(205, 264)
set()


In [34]:
# example of building the X matrix for the random forest: take every training
# column whose label contains the city name and convert it to a numpy array
dft.filter(like='Lucknow', axis='columns').to_numpy()

array([[ 1.59400000e+02,  2.16800000e+02,  1.59400000e+02, ...,
         1.59400000e+02,  2.16800000e+02,  1.58284733e-01],
       [ 1.64930000e+02,  2.33490000e+02,  1.64930000e+02, ...,
         1.64930000e+02,  2.33490000e+02,  1.01468056e-01],
       [ 1.71480000e+02,  2.36760000e+02,  1.71480000e+02, ...,
         1.71480000e+02,  2.36760000e+02, -2.19999999e-01],
       ...,
       [ 9.73000000e+01,  7.11700000e+01,  9.73000000e+01, ...,
         9.73000000e+01,  7.11700000e+01,  3.75316787e-02],
       [ 7.37900000e+01,  7.93100000e+01,  7.37900000e+01, ...,
         7.37900000e+01,  7.93100000e+01,  8.06406313e-02],
       [ 5.52100000e+01,  5.27700000e+01,  5.52100000e+01, ...,
         5.52100000e+01,  5.27700000e+01, -1.70000002e-01]])

In [35]:
# prototype selecting the training response by a partial match on the column
# label, flattened to one dimension; show the first 20 values
df_no2_api_train.filter(like='Lucknow', axis='columns').to_numpy().ravel()[:20]

array([26.72571429, 30.78583333, 30.20708333, 33.28791667, 29.88666667,
       29.62625   , 46.65666667, 39.83791667, 46.155     , 55.07916667,
       45.16125   , 39.26458333, 45.14125   , 47.96041667, 47.59708333,
       45.50041667, 45.43541667, 21.63625   , 18.60916667, 18.005     ])

In [36]:
# check again that there are no nan's in the training dataframe (total count)
dft.isna().sum().sum()

0

In [37]:
# total number of nan's in the validation dataframe
dfv.isna().sum().sum()

0

In [38]:
# total number of nan's in the generation dataframe
dfg.isna().sum().sum()

0

In [39]:
# compare row counts of the api training response, the training predictors and
# a single city's flattened response
delhi_response = df_no2_api_train.filter(like='Delhi', axis=1).to_numpy().flatten()
for shape in (df_no2_api_train.shape, dft.shape, delhi_response.shape):
    print(shape)

(403, 55)
(403, 264)
(403,)


In [40]:
# prototype random forest model for Lucknow with hyperparameter optimization
# performed using the out-of-bag (OOB) samples, i.e. the training rows that
# were left out of each tree's bootstrap sample
# Training data for city of Lucknow
X_train = dft.filter(like='Lucknow', axis=1).to_numpy()
# training response for Lucknow from api data
y_train = df_no2_api_train.filter(like='Lucknow', axis=1).to_numpy().flatten()
# Set up hyperparameter optimization using oob samples
# max_depth (maximum depth of the tree) values
md_list = [6,7,8,9,10]
# min_samples_leaf (the minimum number of samples to be a leaf node) values
msl_list = [3,5,7,9,11]
# grid search with two hyperparameters--record oob score
# a large oob score is better
# the grid is sized from the candidate lists so they can be edited freely
oob_arr = np.zeros((len(md_list), len(msl_list)))
for i, md in enumerate(md_list):
    for j, msl in enumerate(msl_list):
        regr = RandomForestRegressor(max_depth=md, random_state=37, max_features='sqrt',
                                     min_samples_leaf=msl, oob_score=True).fit(X_train, y_train)
        oob_arr[i,j] = regr.oob_score_
oob_arr


array([[0.48470681, 0.49240623, 0.49523365, 0.49944641, 0.49616   ],
       [0.49492191, 0.49122101, 0.49670123, 0.49388121, 0.49354365],
       [0.4794935 , 0.49208703, 0.49348692, 0.49243097, 0.49420434],
       [0.49195868, 0.50097124, 0.49500037, 0.49306393, 0.4944834 ],
       [0.48988659, 0.50129457, 0.4960898 , 0.49277788, 0.4944834 ]])

In [41]:
# locate the grid cell with the highest oob score and display that score
# (the stray interactive-prompt marker '>>>' was removed from the last line)
i,j = np.unravel_index(oob_arr.argmax(), oob_arr.shape)
oob_arr[i,j]

0.5012945718132098

In [42]:
# show the optimal hyperparameters: max_depth first, then min_samples_leaf
print(md_list[i], msl_list[j], sep='\n')

10
5


In [43]:
# use optimum hyperparameters for model
# the values are taken from the grid-search result (md_list[i], msl_list[j])
# rather than typed in, so they cannot go stale when the data or grid change
rf = RandomForestRegressor(max_depth=md_list[i], random_state=37, max_features='sqrt',
                           min_samples_leaf=msl_list[j], oob_score=True).fit(X_train, y_train)
rf.oob_score_

0.5012945718132098

In [44]:
# express each feature importance as a percentage of the largest importance
importance = rf.feature_importances_ / rf.feature_importances_.max() * 100
importance

array([ 67.00261571, 100.        ,  98.71069343,  96.1093989 ,
        24.02884404,  84.04079828,  73.69425285,  22.87573151])

In [45]:
# labels of the Lucknow predictor columns, in the order used for X_train
dft.filter(like='Lucknow', axis='columns').columns

Index(['Lucknow_NO2_2', 'Lucknow_NO2_3', 'Lucknow_SO2_2', 'Lucknow_SO2_3',
       'Lucknow_x', 'Lucknow_PM2.5_2', 'Lucknow_PM2.5_3', 'Lucknow_y'],
      dtype='object')

In [46]:
# make list of features whose importance is greater than 80% of the maximum
# importance (the threshold the code applies)
# record number of features that go into this list for each city
features = np.array(dft.filter(like='Lucknow', axis=1).columns)
top_features = list(features[importance > 80])
print(top_features)
# the length will help for recording the indices of the stored most important features
print(len(top_features))

['Lucknow_NO2_3', 'Lucknow_SO2_2', 'Lucknow_SO2_3', 'Lucknow_PM2.5_2']
4


In [47]:
# predict the 2022 NO2 Lucknow values
# pull out the predictors from the validation set (2022 values)
X_val = dfv.filter(like='Lucknow', axis=1).to_numpy()
print(X_val.shape)
# predict no2 values for Lucknow from the matrix built above instead of
# filtering the dataframe a second time
rf_pred = rf.predict(X_val)
print(rf_pred.shape)

(205, 8)
(205,)


In [48]:
# ordered list of city names taken from the NO2 api columns (first column skipped)
cities_api_no2 = [pull_out_city_name_after_(label) for label in df_no2_api.columns[1:]]

In [49]:
# stack the training and validation predictors row-wise; the combined dataframe
# is used to train the final model for each city after hyperparameter optimization
dftv = pd.concat([dft, dfv])
print(f'dft (training) shape is: {dft.shape}')
print(f'dfv (validation) shape is: {dfv.shape}')
print(f'dftv (combined training and validation) shape is: {dftv.shape}\n')

# stack the NO2 response the same way and report the shapes
df_no2_api_train_validate = pd.concat([df_no2_api_train, df_no2_api_validate])
print(f'df_no2_api_train (training) shape is: {df_no2_api_train.shape}')
print(f'df_no2_api_validate (validation) shape is: {df_no2_api_validate.shape}')
print(f'df_no2_api_train_validate shape is: {df_no2_api_train_validate.shape}')

# same row-wise combination for the remaining gases
df_so2_api_train_validate, df_pm_api_train_validate, df_co_api_train_validate = (
    pd.concat([train_part, validate_part])
    for train_part, validate_part in (
        (df_so2_api_train, df_so2_api_validate),
        (df_pm_api_train, df_pm_api_validate),
        (df_co_api_train, df_co_api_validate),
    )
)

dft (training) shape is: (403, 264)
dfv (validation) shape is: (205, 264)
dftv (combined training and validation) shape is: (608, 264)

df_no2_api_train (training) shape is: (403, 55)
df_no2_api_validate (validation) shape is: (205, 55)
df_no2_api_train_validate shape is: (608, 55)


In [50]:
# for creating NO2 api backfill
# For every city: grid-search max_depth / min_samples_leaf on the training data
# (scored by MSE on the validation period), keep the validation predictions of
# the best setting, refit with that setting on training + validation data,
# record the most important features and predict the generation (backfill) period.

# initialize list of series of predictions for each city
pred_series_list_validate = []
pred_series_list_generate = []
# list of important features for each city
important_features = []
# index where important feature for each city begins
important_features_indices = []
ind = 0
# hyperparameter candidates (the same grid is searched for every city)
# max_depth (maximum depth of the tree) values
md_list = [6,7,8,9,10]
# min_samples_leaf (the minimum number of samples to be a leaf node) values
msl_list = [3,5,7,9,11]
for city in cities_api_no2:
    # predictor columns of this city: labels of the form '<city>_...'
    # an exact prefix match (instead of a substring match) cannot pick up
    # columns of another city whose name merely contains this one, and using
    # one column list for every dataframe guarantees that the model predicts
    # on features in the same order it was trained on
    city_cols = [label for label in dft.columns if label.startswith(f'{city}_')]
    # exact label of this city's response column in the api dataframes
    response_col = f'NO2_{city}'
    # Training data for city
    X_train = dft[city_cols].to_numpy()
    # training response for city from api data
    y_train = df_no2_api_train[response_col].to_numpy()
    # create the predictors from 2022 for that city
    X_val = dfv[city_cols].to_numpy()
    # true 2022 values; they do not depend on the hyperparameters, so they are
    # extracted once per city rather than inside the grid loops
    api_true = df_no2_api_validate[response_col].to_numpy()
    # grid search with two hyperparameters--record the validation MSE
    # a small MSE is better
    mse_arr = np.zeros((len(md_list), len(msl_list)))
    for i, md in enumerate(md_list):
        for j, msl in enumerate(msl_list):
            rf = RandomForestRegressor(max_depth=md,
                                       random_state=37,
                                       max_features='sqrt',
                                       min_samples_leaf=msl,
                                       oob_score=False).fit(X_train, y_train)
            # predict the 2022 NO2 values for the city
            rf_pred = rf.predict(X_val)
            # calculate the Mean Squared Error (MSE) of predictions
            assert rf_pred.shape[0] == api_true.shape[0]
            mse_arr[i,j] = np.sum((api_true - rf_pred)**2)/len(api_true)

    # find index of minimum MSE
    i,j = np.unravel_index(mse_arr.argmin(), mse_arr.shape)
    # use optimum hyperparameters for model
    rf = RandomForestRegressor(max_depth=md_list[i],
                               random_state=37,
                               max_features='sqrt',
                               min_samples_leaf=msl_list[j],
                               oob_score=False).fit(X_train, y_train)

    # predict the 2022 NO2 values for the city
    rf_pred = rf.predict(X_val)
    # convert these predictions into a series with the name of the city
    # append them to the list of predictions in the form of pandas series
    pred_series_list_validate.append(pd.Series(rf_pred, name=city))

    # use optimum hyperparameters for model using both training and validation
    # data to use the most data possible
    # define new X_train and y_train based on this combination
    # Training data for city
    X_train = dftv[city_cols].to_numpy()
    # training response for city from api data
    y_train = df_no2_api_train_validate[response_col].to_numpy()
    rf = RandomForestRegressor(max_depth=md_list[i],
                               random_state=37,
                               max_features='sqrt',
                               min_samples_leaf=msl_list[j],
                               oob_score=True).fit(X_train, y_train)

    # calculate performance relative to max feature importance (as a % of max importance)
    importance = (rf.feature_importances_/np.max(rf.feature_importances_))*100
    # make an array of feature names for the city being considered from training data
    features = np.array(city_cols)
    # add features that are greater than 80% importance
    imp_f = list(features[importance > 80])
    important_features.extend(imp_f)
    important_features_indices.append(ind)
    ind += len(imp_f)

    # create the predictors for 2018 to Nov 2020 (generation set) for that city
    X_val = dfg[city_cols].to_numpy()
    # predict the NO2 values for that city
    rf_pred = rf.predict(X_val)
    # convert these predictions into a series with the name of the city
    # append them to the list of predictions in the form of pandas series
    pred_series_list_generate.append(pd.Series(rf_pred, name=city))

# create dataframe of validation predictions (one column per city)
df_no2_v_pred = pd.concat(pred_series_list_validate, axis=1)
# create dataframe of generation predictions--backfilling
df_no2_backfill = pd.concat(pred_series_list_generate, axis=1)
# create importance list for gas for each city
important_features_no2 = important_features
# create indices list for referencing the important features of each city
important_features_indices_no2 = important_features_indices

In [51]:
 # validation-period NO2 predictions, one column per city
 # (rows carry a default positional index, not dates)
 df_no2_v_pred

Unnamed: 0,Agartala,Agra,Aizwal,Amritsar,Asansol,Aurangabad,Bareilly,Belgaum,Bengaluru,DurgBhilai,...,Prayagraj,Puducherry,Pune,Shillong,Solapur,Srinagar,Thiruvananthapuram,Varanasi,Vijaywada,Visakhapatnam
0,7.477845,37.388165,7.269498,31.080919,21.596322,11.508682,21.333697,10.165015,22.621055,39.245621,...,22.763669,12.772204,24.737661,13.085429,5.469560,67.682938,4.001449,30.387162,10.264523,5.269713
1,22.923606,37.720115,6.261353,29.305379,20.404703,11.852282,19.853197,10.049783,17.902024,39.382544,...,25.223456,9.841628,18.210827,11.849649,5.402746,70.615502,3.485705,36.392378,9.853081,14.465387
2,21.646586,39.837881,6.115289,29.309257,20.924567,10.612861,21.819318,11.095588,20.550720,38.554227,...,21.328443,8.114878,18.069934,12.298686,5.100553,66.742764,4.082023,36.766906,10.639653,24.778107
3,22.786228,35.436775,6.015683,29.304407,21.570863,10.171253,26.459581,9.826084,21.168757,36.423277,...,22.660283,6.467195,26.666146,10.265007,5.564402,66.495647,3.345384,36.169076,10.238596,26.580494
4,21.862723,38.573137,6.000704,24.813543,26.354822,11.305060,20.180975,9.457303,24.266724,39.043797,...,26.865758,9.037333,28.391350,12.580824,4.565999,70.615502,3.027361,30.943905,10.283621,32.245679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,16.739037,10.653250,4.010027,24.896833,28.221700,7.258040,14.374136,7.186127,19.124545,36.326900,...,21.712151,14.726691,10.764412,9.138153,4.502508,53.469998,1.990334,15.143074,8.728696,31.896794
201,7.599122,12.641100,7.115479,23.367759,31.338167,8.247347,18.156103,10.385725,21.059929,36.613243,...,22.038549,13.139860,10.719582,9.077237,4.569405,56.309013,2.677878,15.188694,9.830108,31.767128
202,7.511398,12.108681,2.575563,21.777329,30.427110,7.105727,18.075412,11.652122,19.401831,35.616018,...,17.563081,14.811866,11.957568,8.318823,4.503714,48.485580,2.313906,17.598227,10.057723,27.272219
203,7.612379,11.967664,2.904118,19.865124,31.280193,6.820635,21.677616,12.165618,17.548429,35.490350,...,22.946740,11.846373,10.290431,8.472819,4.848636,59.062936,2.267332,19.125189,8.443510,35.511929


In [52]:
# NO2 predictions for the generation (backfill) set, one column per city
df_no2_backfill

Unnamed: 0,Agartala,Agra,Aizwal,Amritsar,Asansol,Aurangabad,Bareilly,Belgaum,Bengaluru,DurgBhilai,...,Prayagraj,Puducherry,Pune,Shillong,Solapur,Srinagar,Thiruvananthapuram,Varanasi,Vijaywada,Visakhapatnam
0,17.264839,44.107199,2.512329,25.326167,23.180007,7.658020,18.834340,7.403002,18.926117,36.193017,...,21.951707,7.228283,28.111951,13.410942,8.666084,70.316267,2.535658,47.541362,9.821545,10.008854
1,15.749241,43.793897,4.187595,23.853268,22.031922,7.745440,25.488399,8.335413,17.409789,37.131136,...,23.321855,13.522864,28.986386,11.555356,5.509691,60.620897,2.214794,46.874768,8.960288,15.562393
2,20.355922,45.382822,6.042727,26.495765,25.867142,7.486712,17.245171,7.820934,17.955830,44.229157,...,23.137297,11.922413,18.787697,12.768194,6.166035,51.774152,2.943965,48.313385,9.068878,31.316445
3,13.396503,43.226246,6.668427,25.742818,17.708461,9.152949,17.086019,7.860434,13.387424,39.396856,...,17.666205,9.380679,29.648953,6.114220,4.858067,78.242631,3.375244,47.087255,8.963637,31.816948
4,15.420913,45.155049,6.754166,24.211728,27.838280,9.162415,12.106879,8.271769,19.202030,40.913540,...,21.755916,7.172137,27.293020,12.201839,7.405370,72.027103,2.845700,37.100506,9.566854,41.271365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,10.251491,38.201033,2.204631,19.983681,20.595962,10.203342,16.454077,7.674791,14.690933,34.198942,...,19.726479,8.353820,8.835096,8.690032,5.888856,62.427514,5.429425,29.608116,9.336651,8.178179
1054,11.598510,40.706084,3.019734,14.922060,20.357447,11.180366,13.056426,5.920250,13.350104,31.706950,...,16.513784,7.837860,20.063596,9.893779,4.334179,56.036451,4.657788,33.271156,9.411966,12.494394
1055,10.433632,29.828958,3.864807,20.879898,23.681303,10.806953,16.684520,9.015261,24.018007,30.052665,...,18.791228,6.312115,17.497756,10.079497,6.870104,67.632129,3.627700,40.839071,8.403595,12.136889
1056,24.032638,42.664508,2.425078,18.538636,24.435050,10.046721,13.350351,9.257592,18.764387,38.063970,...,17.444218,6.903079,24.672425,11.547199,6.566277,56.631381,4.413481,36.211959,8.562083,7.145432


In [53]:
# flat list of the NO2 features above the 80% relative-importance threshold,
# concatenated city after city
print(important_features_no2)

['Agartala_x', 'Agra_NO2_1', 'Aizwal_x', 'Amritsar_NO2_1', 'Asansol_x', 'Asansol_y', 'Aurangabad_NO2_1', 'Bareilly_x', 'Bareilly_y', 'Belgaum_x', 'Belgaum_y', 'Bengaluru_NO2_6', 'Bengaluru_NO2_8', 'Bengaluru_SO2_8', 'Bengaluru_x', 'Bengaluru_PM2.5_6', 'Bengaluru_y', 'DurgBhilai_x', 'DurgBhilai_y', 'Bhopal_x', 'Bhopal_y', 'Chandigarh_y', 'Chennai_SO2_2', 'Chennai_x', 'Coimbatore_y', 'Dehradun_x', 'Delhi_NO2_11', 'Delhi_PM2.5_9', 'Delhi_CO_11', 'Faridabad_NO2_1', 'Faridabad_SO2_1', 'Faridabad_PM2.5_1', 'Gandhinagar_x', 'Gangtok_y', 'Ghaziabad_NO2_1', 'Ghaziabad_SO2_1', 'Ghaziabad_PM2.5_1', 'Gorakhpur_y', 'Gwalior_x', 'Hyderabad_NO2_2', 'Hyderabad_PM2.5_2', 'Imphal_x', 'Imphal_y', 'Indore_x', 'Indore_y', 'Jabalpur_x', 'Jaipur_NO2_2', 'Jaipur_SO2_2', 'Jaipur_PM2.5_2', 'Jalandhar_x', 'Jalandhar_y', 'Jodhpur_NO2_1', 'Kanpur_SO2_1', 'Kanpur_SO2_2', 'Kanpur_PM2.5_1', 'Kochi_x', 'Kohima_y', 'Kolkata_x', 'Kota_NO2_1', 'Kozhikode_x', 'Kozhikode_y', 'Lucknow_SO2_2', 'Lucknow_PM2.5_2', 'Ludhiana_NO

In [54]:
# start offset of each city's features inside important_features_no2
# (one entry per city, in the order of cities_api_no2)
print(important_features_indices_no2)

[0, 1, 2, 3, 4, 6, 7, 9, 11, 17, 19, 21, 22, 24, 25, 26, 29, 32, 33, 34, 37, 38, 39, 41, 43, 45, 46, 49, 51, 52, 55, 56, 57, 58, 59, 61, 63, 66, 67, 68, 70, 71, 73, 74, 76, 77, 78, 79, 80, 81, 82, 86, 89, 90]


In [55]:
# show the important NO2 features recorded for one city
# the slice bounds are looked up from the per-city start offsets instead of
# being typed in, so they stay correct when the selected features change
k = cities_api_no2.index('Varanasi')
start = important_features_indices_no2[k]
# the last city has no following offset, so its slice runs to the end of the list
stop = (important_features_indices_no2[k + 1]
        if k + 1 < len(important_features_indices_no2) else None)
important_features_no2[start:stop]

['Varanasi_NO2_1', 'Varanasi_SO2_1', 'Varanasi_PM2.5_1']

In [56]:
# for creating SO2 api backfill
# For every city: grid-search max_depth / min_samples_leaf on the training data
# (scored by MSE on the validation period), keep the validation predictions of
# the best setting, refit with that setting on training + validation data,
# record the most important features and predict the generation (backfill) period.

# initialize list of series of predictions for each city
pred_series_list_validate = []
pred_series_list_generate = []
# list of important features for each city
important_features = []
# index where important feature for each city begins
important_features_indices = []
ind = 0
# hyperparameter candidates (the same grid is searched for every city)
# max_depth (maximum depth of the tree) values
md_list = [6,7,8,9,10]
# min_samples_leaf (the minimum number of samples to be a leaf node) values
msl_list = [3,5,7,9,11]
for city in cities_api_no2:
    # predictor columns of this city: labels of the form '<city>_...'
    # an exact prefix match (instead of a substring match) cannot pick up
    # columns of another city whose name merely contains this one, and using
    # one column list for every dataframe guarantees that the model predicts
    # on features in the same order it was trained on
    city_cols = [label for label in dft.columns if label.startswith(f'{city}_')]
    # exact label of this city's response column in the api dataframes
    response_col = f'SO2_{city}'
    # Training data for city
    X_train = dft[city_cols].to_numpy()
    # training response for city from api data
    y_train = df_so2_api_train[response_col].to_numpy()
    # create the predictors from 2022 for that city
    X_val = dfv[city_cols].to_numpy()
    # true 2022 values; they do not depend on the hyperparameters, so they are
    # extracted once per city rather than inside the grid loops
    api_true = df_so2_api_validate[response_col].to_numpy()
    # grid search with two hyperparameters--record the validation MSE
    # a small MSE is better
    mse_arr = np.zeros((len(md_list), len(msl_list)))
    for i, md in enumerate(md_list):
        for j, msl in enumerate(msl_list):
            rf = RandomForestRegressor(max_depth=md,
                                       random_state=37,
                                       max_features='sqrt',
                                       min_samples_leaf=msl,
                                       oob_score=False).fit(X_train, y_train)
            # predict the 2022 SO2 values for the city
            rf_pred = rf.predict(X_val)
            # calculate the Mean Squared Error (MSE) of predictions
            assert rf_pred.shape[0] == api_true.shape[0]
            mse_arr[i,j] = np.sum((api_true - rf_pred)**2)/len(api_true)

    # find index of minimum MSE
    i,j = np.unravel_index(mse_arr.argmin(), mse_arr.shape)
    # use optimum hyperparameters for model
    rf = RandomForestRegressor(max_depth=md_list[i],
                               random_state=37,
                               max_features='sqrt',
                               min_samples_leaf=msl_list[j],
                               oob_score=False).fit(X_train, y_train)

    # predict the 2022 SO2 values for the city
    rf_pred = rf.predict(X_val)
    # convert these predictions into a series with the name of the city
    # append them to the list of predictions in the form of pandas series
    pred_series_list_validate.append(pd.Series(rf_pred, name=city))

    # use optimum hyperparameters for model using both training and validation
    # data to use the most data possible
    # define new X_train and y_train based on this combination
    # Training data for city
    X_train = dftv[city_cols].to_numpy()
    # training response for city from api data
    y_train = df_so2_api_train_validate[response_col].to_numpy()
    rf = RandomForestRegressor(max_depth=md_list[i],
                               random_state=37,
                               max_features='sqrt',
                               min_samples_leaf=msl_list[j],
                               oob_score=True).fit(X_train, y_train)

    # calculate performance relative to max feature importance (as a % of max importance)
    importance = (rf.feature_importances_/np.max(rf.feature_importances_))*100
    # make an array of feature names for the city being considered from training data
    features = np.array(city_cols)
    # add features that are greater than 80% importance
    imp_f = list(features[importance > 80])
    important_features.extend(imp_f)
    important_features_indices.append(ind)
    ind += len(imp_f)

    # create the predictors for 2018 to Nov 2020 (generation set) for that city
    X_val = dfg[city_cols].to_numpy()
    # predict the SO2 values for that city
    rf_pred = rf.predict(X_val)
    # convert these predictions into a series with the name of the city
    # append them to the list of predictions in the form of pandas series
    pred_series_list_generate.append(pd.Series(rf_pred, name=city))

# create dataframe of validation predictions (one column per city)
df_so2_v_pred = pd.concat(pred_series_list_validate, axis=1)
# create dataframe of generation predictions--backfilling
df_so2_backfill = pd.concat(pred_series_list_generate, axis=1)
# create importance list for gas for each city
important_features_so2 = important_features
# create indices list for referencing the important features of each city
important_features_indices_so2 = important_features_indices

In [57]:
# for creating PM2.5 api backfill
#
# Per city: tune a random forest on the training period by validation-set MSE,
# keep the best combination's validation predictions, then refit with the chosen
# hyperparameters on training + validation data and predict the generation
# (backfill) period.

# hyperparameter grid; it is the same for every city, so it is built once
# max_depth (maximum depth of the tree) values
md_list = [6,7,8,9,10]
# min_samples_leaf (the minimum number of samples to be a leaf node) values
msl_list = [3,5,7,9,11]
# a feature counts as important above this % of the city's largest importance
IMPORTANCE_CUTOFF_PCT = 80

# initialize list of series of predictions for each city
pred_series_list_validate = []
pred_series_list_generate = []
# list of important features for each city
important_features = []
# index where important feature for each city begins
important_features_indices = []
ind = 0
for city in cities_api_no2:
    # NOTE(review): filter(like=city) is a substring match on the column labels;
    # this assumes no city name is contained in another city's labels -- confirm
    # Training data for city
    X_train = dft.filter(like=city, axis=1).to_numpy()
    # training response for city from api data
    y_train = df_pm_api_train.filter(like=city, axis=1).to_numpy().flatten()
    # create the predictors from 2022 for that city
    X_val = dfv.filter(like=city, axis=1).to_numpy()
    # observed 2022 PM2.5 values for the city; they do not depend on the
    # hyperparameters, so they are extracted once instead of inside the grid search
    api_true = df_pm_api_validate.filter(like=city, axis=1).to_numpy().flatten()
    # one prediction is produced per validation row, so the row counts must agree
    assert X_val.shape[0] == api_true.shape[0]

    # grid search with two hyperparameters--record the validation MSE of each
    # combination (a small MSE is better)
    mse_arr = np.zeros((len(md_list), len(msl_list)))
    # validation predictions of every combination, keyed by grid position
    val_preds = {}
    for i, md in enumerate(md_list):
        for j, msl in enumerate(msl_list):
            rf = RandomForestRegressor(max_depth=md,
                                       random_state=37,
                                       max_features='sqrt',
                                       min_samples_leaf=msl,
                                       oob_score=False).fit(X_train, y_train)
            # predict the 2022 PM2.5 values for the city
            rf_pred = rf.predict(X_val)
            # calculate the Mean Squared Error (MSE) of predictions
            mse_arr[i,j] = np.sum((api_true - rf_pred)**2)/len(api_true)
            val_preds[i, j] = rf_pred

    # find index of minimum MSE
    i, j = (int(k) for k in np.unravel_index(mse_arr.argmin(), mse_arr.shape))
    # the forests are seeded (random_state=37), so the winning combination's
    # predictions from the grid search are reused rather than refitting the
    # identical model just to predict again
    # convert these predictions into a series with the name of the city
    # append them to the list of predictions in the form of pandas series
    pred_series_list_validate.append(pd.Series(val_preds[i, j], name=city))

    # use optimum hyperparameters for model using both training and validation
    # data to use the most data possible
    # define new X_train and y_train based on this combination
    city_predictors = dftv.filter(like=city, axis=1)
    # Training data for city
    X_train = city_predictors.to_numpy()
    # training response for city from api data
    y_train = df_pm_api_train_validate.filter(like=city, axis=1).to_numpy().flatten()
    # NOTE(review): oob_score=True is kept as in the original, although nothing in
    # this cell reads rf.oob_score_ -- confirm whether it is still needed
    rf = RandomForestRegressor(max_depth=md_list[i],
                               random_state=37,
                               max_features='sqrt',
                               min_samples_leaf=msl_list[j],
                               oob_score=True).fit(X_train, y_train)

    # calculate performance relative to max feature importance (as a % of max importance)
    importance = (rf.feature_importances_/np.max(rf.feature_importances_))*100
    # make an array of feature names for the city being considered from training data
    features = np.array(city_predictors.columns)
    # add features whose relative importance exceeds the cutoff
    imp_f = list(features[importance > IMPORTANCE_CUTOFF_PCT])
    important_features.extend(imp_f)
    # record where this city's features start inside the flat important_features list
    important_features_indices.append(ind)
    ind += len(imp_f)

    # create the predictors for 2018 to Nov 2020 (generation set) for that city
    X_gen = dfg.filter(like=city, axis=1).to_numpy()
    # predict the PM2.5 values for that city
    rf_pred = rf.predict(X_gen)
    # convert these predictions into a series with the name of the city
    # append them to the list of predictions in the form of pandas series
    pred_series_list_generate.append(pd.Series(rf_pred, name=city))

# create dataframe of validation predictions (one column per city)
df_pm_v_pred = pd.concat(pred_series_list_validate, axis=1)
# create dataframe of generation predictions--backfilling
df_pm_backfill = pd.concat(pred_series_list_generate, axis=1)
# create importance list for gas for each city
important_features_pm = important_features
# create indices list for referencing the important features of each city
important_features_indices_pm = important_features_indices

In [58]:
# for creating CO api backfill
#
# Per city: tune a random forest on the training period by validation-set MSE,
# keep the best combination's validation predictions, then refit with the chosen
# hyperparameters on training + validation data and predict the generation
# (backfill) period.

# hyperparameter grid; it is the same for every city, so it is built once
# max_depth (maximum depth of the tree) values
md_list = [6,7,8,9,10]
# min_samples_leaf (the minimum number of samples to be a leaf node) values
msl_list = [3,5,7,9,11]
# a feature counts as important above this % of the city's largest importance
IMPORTANCE_CUTOFF_PCT = 80

# initialize list of series of predictions for each city
pred_series_list_validate = []
pred_series_list_generate = []
# list of important features for each city
important_features = []
# index where important feature for each city begins
important_features_indices = []
ind = 0
for city in cities_api_no2:
    # NOTE(review): filter(like=city) is a substring match on the column labels;
    # this assumes no city name is contained in another city's labels -- confirm
    # Training data for city
    X_train = dft.filter(like=city, axis=1).to_numpy()
    # training response for city from api data
    y_train = df_co_api_train.filter(like=city, axis=1).to_numpy().flatten()
    # create the predictors from 2022 for that city
    X_val = dfv.filter(like=city, axis=1).to_numpy()
    # observed 2022 CO values for the city; they do not depend on the
    # hyperparameters, so they are extracted once instead of inside the grid search
    api_true = df_co_api_validate.filter(like=city, axis=1).to_numpy().flatten()
    # one prediction is produced per validation row, so the row counts must agree
    assert X_val.shape[0] == api_true.shape[0]

    # grid search with two hyperparameters--record the validation MSE of each
    # combination (a small MSE is better)
    mse_arr = np.zeros((len(md_list), len(msl_list)))
    # validation predictions of every combination, keyed by grid position
    val_preds = {}
    for i, md in enumerate(md_list):
        for j, msl in enumerate(msl_list):
            rf = RandomForestRegressor(max_depth=md,
                                       random_state=37,
                                       max_features='sqrt',
                                       min_samples_leaf=msl,
                                       oob_score=False).fit(X_train, y_train)
            # predict the 2022 CO values for the city
            rf_pred = rf.predict(X_val)
            # calculate the Mean Squared Error (MSE) of predictions
            mse_arr[i,j] = np.sum((api_true - rf_pred)**2)/len(api_true)
            val_preds[i, j] = rf_pred

    # find index of minimum MSE
    i, j = (int(k) for k in np.unravel_index(mse_arr.argmin(), mse_arr.shape))
    # the forests are seeded (random_state=37), so the winning combination's
    # predictions from the grid search are reused rather than refitting the
    # identical model just to predict again
    # convert these predictions into a series with the name of the city
    # append them to the list of predictions in the form of pandas series
    pred_series_list_validate.append(pd.Series(val_preds[i, j], name=city))

    # use optimum hyperparameters for model using both training and validation
    # data to use the most data possible
    # define new X_train and y_train based on this combination
    city_predictors = dftv.filter(like=city, axis=1)
    # Training data for city
    X_train = city_predictors.to_numpy()
    # training response for city from api data
    y_train = df_co_api_train_validate.filter(like=city, axis=1).to_numpy().flatten()
    # NOTE(review): oob_score=True is kept as in the original, although nothing in
    # this cell reads rf.oob_score_ -- confirm whether it is still needed
    rf = RandomForestRegressor(max_depth=md_list[i],
                               random_state=37,
                               max_features='sqrt',
                               min_samples_leaf=msl_list[j],
                               oob_score=True).fit(X_train, y_train)

    # calculate performance relative to max feature importance (as a % of max importance)
    importance = (rf.feature_importances_/np.max(rf.feature_importances_))*100
    # make an array of feature names for the city being considered from training data
    features = np.array(city_predictors.columns)
    # add features whose relative importance exceeds the cutoff
    imp_f = list(features[importance > IMPORTANCE_CUTOFF_PCT])
    important_features.extend(imp_f)
    # record where this city's features start inside the flat important_features list
    important_features_indices.append(ind)
    ind += len(imp_f)

    # create the predictors for 2018 to Nov 2020 (generation set) for that city
    X_gen = dfg.filter(like=city, axis=1).to_numpy()
    # predict the CO values for that city
    rf_pred = rf.predict(X_gen)
    # convert these predictions into a series with the name of the city
    # append them to the list of predictions in the form of pandas series
    pred_series_list_generate.append(pd.Series(rf_pred, name=city))

# create dataframe of validation predictions (one column per city)
df_co_v_pred = pd.concat(pred_series_list_validate, axis=1)
# create dataframe of generation predictions--backfilling
df_co_backfill = pd.concat(pred_series_list_generate, axis=1)
# create importance list for gas for each city
important_features_co = important_features
# create indices list for referencing the important features of each city
important_features_indices_co = important_features_indices

In [59]:
# daily timestamps spanning the backfill window (both end points included),
# wrapped in a 0-indexed series named 'date'
backfill_range = pd.date_range(start='1/1/2018', end='11/23/2020')
backfill_dates = pd.Series(backfill_range, name='date')
backfill_dates

0      2018-01-01
1      2018-01-02
2      2018-01-03
3      2018-01-04
4      2018-01-05
          ...    
1053   2020-11-19
1054   2020-11-20
1055   2020-11-21
1056   2020-11-22
1057   2020-11-23
Name: date, Length: 1058, dtype: datetime64[ns]

In [60]:
# column labels of each gas's api (train + validate) frame, kept as plain lists
# so the backfill frames can later be relabelled to match
NO2_col_names = df_no2_api_train_validate.columns.tolist()
SO2_col_names = df_so2_api_train_validate.columns.tolist()
PM2_5_col_names = df_pm_api_train_validate.columns.tolist()
CO_col_names = df_co_api_train_validate.columns.tolist()

In [61]:
# add date series for backfill dates
# insert() aligns the series on the frame's index, so a row-count mismatch would
# not raise but leave missing or dropped dates; check it explicitly instead
assert len(df_no2_backfill) == len(backfill_dates)
# the guard keeps the cell re-runnable: insert() raises when the column already exists
if 'date' not in df_no2_backfill.columns:
    df_no2_backfill.insert(loc=0, column='date', value=backfill_dates)
df_no2_backfill

Unnamed: 0,date,Agartala,Agra,Aizwal,Amritsar,Asansol,Aurangabad,Bareilly,Belgaum,Bengaluru,...,Prayagraj,Puducherry,Pune,Shillong,Solapur,Srinagar,Thiruvananthapuram,Varanasi,Vijaywada,Visakhapatnam
0,2018-01-01,17.264839,44.107199,2.512329,25.326167,23.180007,7.658020,18.834340,7.403002,18.926117,...,21.951707,7.228283,28.111951,13.410942,8.666084,70.316267,2.535658,47.541362,9.821545,10.008854
1,2018-01-02,15.749241,43.793897,4.187595,23.853268,22.031922,7.745440,25.488399,8.335413,17.409789,...,23.321855,13.522864,28.986386,11.555356,5.509691,60.620897,2.214794,46.874768,8.960288,15.562393
2,2018-01-03,20.355922,45.382822,6.042727,26.495765,25.867142,7.486712,17.245171,7.820934,17.955830,...,23.137297,11.922413,18.787697,12.768194,6.166035,51.774152,2.943965,48.313385,9.068878,31.316445
3,2018-01-04,13.396503,43.226246,6.668427,25.742818,17.708461,9.152949,17.086019,7.860434,13.387424,...,17.666205,9.380679,29.648953,6.114220,4.858067,78.242631,3.375244,47.087255,8.963637,31.816948
4,2018-01-05,15.420913,45.155049,6.754166,24.211728,27.838280,9.162415,12.106879,8.271769,19.202030,...,21.755916,7.172137,27.293020,12.201839,7.405370,72.027103,2.845700,37.100506,9.566854,41.271365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,2020-11-19,10.251491,38.201033,2.204631,19.983681,20.595962,10.203342,16.454077,7.674791,14.690933,...,19.726479,8.353820,8.835096,8.690032,5.888856,62.427514,5.429425,29.608116,9.336651,8.178179
1054,2020-11-20,11.598510,40.706084,3.019734,14.922060,20.357447,11.180366,13.056426,5.920250,13.350104,...,16.513784,7.837860,20.063596,9.893779,4.334179,56.036451,4.657788,33.271156,9.411966,12.494394
1055,2020-11-21,10.433632,29.828958,3.864807,20.879898,23.681303,10.806953,16.684520,9.015261,24.018007,...,18.791228,6.312115,17.497756,10.079497,6.870104,67.632129,3.627700,40.839071,8.403595,12.136889
1056,2020-11-22,24.032638,42.664508,2.425078,18.538636,24.435050,10.046721,13.350351,9.257592,18.764387,...,17.444218,6.903079,24.672425,11.547199,6.566277,56.631381,4.413481,36.211959,8.562083,7.145432


In [62]:
# Make column names consistent with api column names
# The relabel is purely positional, so first confirm that every current label
# ('date' or a bare city name) is contained in the api label it is about to
# receive; a different column order would otherwise silently mislabel cities.
assert len(df_no2_backfill.columns) == len(NO2_col_names)
misaligned = [(old, new) for old, new in zip(df_no2_backfill.columns, NO2_col_names)
              if str(old) not in str(new)]
assert not misaligned, f"backfill/api column order differs: {misaligned}"
df_no2_backfill.columns = NO2_col_names
df_no2_backfill

Unnamed: 0,date,NO2_Agartala,NO2_Agra,NO2_Aizwal,NO2_Amritsar,NO2_Asansol,NO2_Aurangabad,NO2_Bareilly,NO2_Belgaum,NO2_Bengaluru,...,NO2_Prayagraj,NO2_Puducherry,NO2_Pune,NO2_Shillong,NO2_Solapur,NO2_Srinagar,NO2_Thiruvananthapuram,NO2_Varanasi,NO2_Vijaywada,NO2_Visakhapatnam
0,2018-01-01,17.264839,44.107199,2.512329,25.326167,23.180007,7.658020,18.834340,7.403002,18.926117,...,21.951707,7.228283,28.111951,13.410942,8.666084,70.316267,2.535658,47.541362,9.821545,10.008854
1,2018-01-02,15.749241,43.793897,4.187595,23.853268,22.031922,7.745440,25.488399,8.335413,17.409789,...,23.321855,13.522864,28.986386,11.555356,5.509691,60.620897,2.214794,46.874768,8.960288,15.562393
2,2018-01-03,20.355922,45.382822,6.042727,26.495765,25.867142,7.486712,17.245171,7.820934,17.955830,...,23.137297,11.922413,18.787697,12.768194,6.166035,51.774152,2.943965,48.313385,9.068878,31.316445
3,2018-01-04,13.396503,43.226246,6.668427,25.742818,17.708461,9.152949,17.086019,7.860434,13.387424,...,17.666205,9.380679,29.648953,6.114220,4.858067,78.242631,3.375244,47.087255,8.963637,31.816948
4,2018-01-05,15.420913,45.155049,6.754166,24.211728,27.838280,9.162415,12.106879,8.271769,19.202030,...,21.755916,7.172137,27.293020,12.201839,7.405370,72.027103,2.845700,37.100506,9.566854,41.271365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,2020-11-19,10.251491,38.201033,2.204631,19.983681,20.595962,10.203342,16.454077,7.674791,14.690933,...,19.726479,8.353820,8.835096,8.690032,5.888856,62.427514,5.429425,29.608116,9.336651,8.178179
1054,2020-11-20,11.598510,40.706084,3.019734,14.922060,20.357447,11.180366,13.056426,5.920250,13.350104,...,16.513784,7.837860,20.063596,9.893779,4.334179,56.036451,4.657788,33.271156,9.411966,12.494394
1055,2020-11-21,10.433632,29.828958,3.864807,20.879898,23.681303,10.806953,16.684520,9.015261,24.018007,...,18.791228,6.312115,17.497756,10.079497,6.870104,67.632129,3.627700,40.839071,8.403595,12.136889
1056,2020-11-22,24.032638,42.664508,2.425078,18.538636,24.435050,10.046721,13.350351,9.257592,18.764387,...,17.444218,6.903079,24.672425,11.547199,6.566277,56.631381,4.413481,36.211959,8.562083,7.145432


In [63]:
# add date series for backfill dates
# Make column names consistent with api column names
for backfill_frame, api_col_names in ((df_so2_backfill, SO2_col_names),
                                      (df_pm_backfill, PM2_5_col_names),
                                      (df_co_backfill, CO_col_names)):
    # insert() aligns the series on the frame's index, so a row-count mismatch
    # would not raise but leave missing or dropped dates; check it explicitly
    assert len(backfill_frame) == len(backfill_dates)
    # the guard keeps the cell re-runnable: insert() raises when the column exists
    if 'date' not in backfill_frame.columns:
        backfill_frame.insert(loc=0, column='date', value=backfill_dates)
    # the relabel is purely positional, so confirm every current label is contained
    # in the api label it is about to receive before overwriting the labels
    assert len(backfill_frame.columns) == len(api_col_names)
    misaligned = [(old, new) for old, new in zip(backfill_frame.columns, api_col_names)
                  if str(old) not in str(new)]
    assert not misaligned, f"backfill/api column order differs: {misaligned}"
    backfill_frame.columns = api_col_names

In [64]:
# stack the backfilled (predicted) rows on top of the original api rows
# and renumber the combined index from 0
no2_parts = [df_no2_backfill, df_no2_api_train_validate]
df_no2_all = pd.concat(no2_parts, axis=0, ignore_index=True)
df_no2_all

Unnamed: 0,date,NO2_Agartala,NO2_Agra,NO2_Aizwal,NO2_Amritsar,NO2_Asansol,NO2_Aurangabad,NO2_Bareilly,NO2_Belgaum,NO2_Bengaluru,...,NO2_Prayagraj,NO2_Puducherry,NO2_Pune,NO2_Shillong,NO2_Solapur,NO2_Srinagar,NO2_Thiruvananthapuram,NO2_Varanasi,NO2_Vijaywada,NO2_Visakhapatnam
0,2018-01-01,17.264839,44.107199,2.512329,25.326167,23.180007,7.658020,18.834340,7.403002,18.926117,...,21.951707,7.228283,28.111951,13.410942,8.666084,70.316267,2.535658,47.541362,9.821545,10.008854
1,2018-01-02,15.749241,43.793897,4.187595,23.853268,22.031922,7.745440,25.488399,8.335413,17.409789,...,23.321855,13.522864,28.986386,11.555356,5.509691,60.620897,2.214794,46.874768,8.960288,15.562393
2,2018-01-03,20.355922,45.382822,6.042727,26.495765,25.867142,7.486712,17.245171,7.820934,17.955830,...,23.137297,11.922413,18.787697,12.768194,6.166035,51.774152,2.943965,48.313385,9.068878,31.316445
3,2018-01-04,13.396503,43.226246,6.668427,25.742818,17.708461,9.152949,17.086019,7.860434,13.387424,...,17.666205,9.380679,29.648953,6.114220,4.858067,78.242631,3.375244,47.087255,8.963637,31.816948
4,2018-01-05,15.420913,45.155049,6.754166,24.211728,27.838280,9.162415,12.106879,8.271769,19.202030,...,21.755916,7.172137,27.293020,12.201839,7.405370,72.027103,2.845700,37.100506,9.566854,41.271365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1661,2022-07-21,3.673333,23.876667,1.473333,9.126667,24.278333,5.448333,5.311667,3.433333,9.483333,...,33.360000,12.850000,5.271667,4.758333,5.398333,44.181667,1.761667,23.591667,8.840000,48.265000
1662,2022-07-22,5.353333,18.250417,1.650833,12.227500,27.154167,6.316667,4.910000,2.952917,13.311250,...,27.267083,24.187500,6.067500,6.657917,5.472083,34.022917,1.258333,21.520417,9.739583,38.986250
1663,2022-07-23,3.725833,10.918750,0.981250,9.796250,33.144167,6.017083,6.728750,2.320833,8.972083,...,15.347500,20.500833,4.238750,4.978750,3.463333,51.305000,1.280417,13.258333,6.675833,28.604167
1664,2022-07-24,4.888333,7.817500,1.329167,9.607500,30.603333,10.397917,3.702917,2.599167,11.956250,...,15.198333,20.898333,5.974167,9.666250,3.761667,40.118333,2.370833,11.602917,10.950833,33.443750


In [65]:
# same vertical stacking as for NO2: backfilled rows first, original api rows
# after them, with a fresh 0-based index
df_so2_all, df_pm_all, df_co_all = (
    pd.concat([backfilled, observed], ignore_index=True)
    for backfilled, observed in (
        (df_so2_backfill, df_so2_api_train_validate),
        (df_pm_backfill, df_pm_api_train_validate),
        (df_co_backfill, df_co_api_train_validate),
    )
)

In [66]:
# one city's column carries a "Durg" prefix in front of the city name;
# relabel it to plain Bhilai in every gas frame
df_no2_all = df_no2_all.rename({'NO2_DurgBhilai': 'NO2_Bhilai'}, axis='columns')
df_so2_all = df_so2_all.rename({'SO2_DurgBhilai': 'SO2_Bhilai'}, axis='columns')
df_pm_all = df_pm_all.rename({'PM2_5_DurgBhilai': 'PM2_5_Bhilai'}, axis='columns')
df_co_all = df_co_all.rename({'CO_DurgBhilai': 'CO_Bhilai'}, axis='columns')

In [67]:
# write each gas's combined (backfill + api) frame to its own csv,
# without the row index
for csv_name, gas_frame in (('no2_api_backfill.csv', df_no2_all),
                            ('so2_api_backfill.csv', df_so2_all),
                            ('pm_api_backfill.csv', df_pm_all),
                            ('co_api_backfill.csv', df_co_all)):
    gas_frame.to_csv(csv_name, encoding='utf-8', index=False)

In [68]:
# write the validation-set (2022) predictions of each gas to csv so they
# can be compared against the actual validation-set values
for csv_name, pred_frame in (('no2_api_validation_predictions.csv', df_no2_v_pred),
                             ('so2_api_validation_predictions.csv', df_so2_v_pred),
                             ('pm_api_validation_predictions.csv', df_pm_v_pred),
                             ('co_api_validation_predictions.csv', df_co_v_pred)):
    pred_frame.to_csv(csv_name, encoding='utf-8', index=False)

In [69]:
# write the important-feature list and the per-city start indices of every gas
# to csv; each list becomes a one-column frame, saved without the row index
feature_exports = {
    'important_features_no2.csv': important_features_no2,
    'important_features_indices_no2.csv': important_features_indices_no2,
    'important_features_so2.csv': important_features_so2,
    'important_features_indices_so2.csv': important_features_indices_so2,
    'important_features_pm.csv': important_features_pm,
    'important_features_indices_pm.csv': important_features_indices_pm,
    'important_features_co.csv': important_features_co,
    'important_features_indices_co.csv': important_features_indices_co,
}
for csv_name, values in feature_exports.items():
    pd.DataFrame(values).to_csv(csv_name, encoding='utf-8', index=False)

In [70]:
# de-duplicate the important features of each gas into a set
no2_set = {*important_features_no2}
so2_set = {*important_features_so2}
pm_set = {*important_features_pm}
co_set = {*important_features_co}

In [71]:
# features that are important for every one of the four gases
# note "_x" is NO2 and "_y" is SO2, both satellite data
set.intersection(no2_set, so2_set, pm_set, co_set)

{'Agartala_x',
 'Asansol_x',
 'Bareilly_x',
 'Belgaum_x',
 'Belgaum_y',
 'Bhopal_x',
 'Bhopal_y',
 'Chandigarh_y',
 'Coimbatore_y',
 'DurgBhilai_x',
 'Gandhinagar_x',
 'Ghaziabad_NO2_1',
 'Ghaziabad_PM2.5_1',
 'Ghaziabad_SO2_1',
 'Gorakhpur_y',
 'Gwalior_x',
 'Hyderabad_NO2_2',
 'Indore_y',
 'Kanpur_PM2.5_1',
 'Kanpur_SO2_1',
 'Kanpur_SO2_2',
 'Kochi_x',
 'Kolkata_x',
 'Kozhikode_x',
 'Lucknow_PM2.5_2',
 'Lucknow_SO2_2',
 'Ludhiana_PM2.5_1',
 'Mangalore_x',
 'Meerut_x',
 'Moradabad_x',
 'Moradabad_y',
 'Mumbai_x',
 'Mysuru_y',
 'Patna_PM2.5_1',
 'Patna_SO2_1',
 'Prayagraj_x',
 'Puducherry_x',
 'Pune_x',
 'Shillong_x',
 'Srinagar_x',
 'Thiruvananthapuram_PM2.5_1',
 'Thiruvananthapuram_SO2_1',
 'Varanasi_NO2_1',
 'Varanasi_PM2.5_1',
 'Varanasi_SO2_1',
 'Vijaywada_y'}

In [72]:
# features shared by the NO2, SO2, PM2.5, and CO models that come from official
# stations, i.e. whose name carries neither satellite marker ("_x" / "_y")
common_features = no2_set.intersection(so2_set, pm_set, co_set)
[name for name in common_features if not ('_x' in name or '_y' in name)]

['Lucknow_PM2.5_2',
 'Varanasi_SO2_1',
 'Kanpur_SO2_2',
 'Thiruvananthapuram_PM2.5_1',
 'Varanasi_NO2_1',
 'Hyderabad_NO2_2',
 'Ghaziabad_SO2_1',
 'Patna_PM2.5_1',
 'Patna_SO2_1',
 'Ludhiana_PM2.5_1',
 'Lucknow_SO2_2',
 'Thiruvananthapuram_SO2_1',
 'Kanpur_SO2_1',
 'Varanasi_PM2.5_1',
 'Ghaziabad_PM2.5_1',
 'Kanpur_PM2.5_1',
 'Ghaziabad_NO2_1']

In [73]:
# satellite features ("_x" or "_y" in the name) that are important for at least one gas
all_satellite = [
    name for name in no2_set.union(so2_set, pm_set, co_set)
    if any(tag in name for tag in ('_x', '_y'))
]
len(all_satellite)

69

In [74]:
# number of entries in city_list_api (defined earlier in the notebook)
len(city_list_api)

54

In [75]:
# distinct cities behind the important satellite features
# pull_out_city_name_before_ is defined elsewhere in the notebook; judging by its
# name it returns the part of the label before the underscore -- TODO confirm
satellite_city_set = {pull_out_city_name_before_(feature) for feature in all_satellite}
len(satellite_city_set)

41

In [76]:
# for these cities, the satellite data is not an important feature
# for the prediction of any gases
# a plain set difference (api cities minus satellite cities) is what is meant here;
# symmetric_difference would also return any satellite city that is missing from
# city_list_api, which would not be a city "without an important satellite feature"
city_list_api.difference(satellite_city_set)

{'Agra',
 'Aurangabad',
 'Delhi',
 'Hyderabad',
 'Jaipur',
 'Jodhpur',
 'Kanpur',
 'Kota',
 'Lucknow',
 'Nashik',
 'Patna',
 'Varanasi',
 'Visakhapatnam'}