In [121]:
# specifying the path to the directory containing the data
data_path = ''

In [122]:
#linear algebra and data handling
import pandas as pd
import numpy as np

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#Developing models
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest


#Evaluating models
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint,uniform

In [123]:
# Importing data
train = pd.read_csv(data_path+'Train.csv')
test = pd.read_csv(data_path+'Test.csv',index_col=0)
sub = pd.read_csv(data_path+'SampleSubmission.csv')

In [124]:
# Creating an identifier for the specific geometries whose data is captured
# This identifier is created using a combination of the 'lat' and 'lng' entries
train['lat'] = train['lat'].astype('str')
train['lng'] = train['lng'].astype('str')
train['loc_id'] = train[['lat', 'lng']].apply(lambda x: ','.join(x), axis=1)


test['lat'] = test['lat'].astype('str')
test['lng'] = test['lng'].astype('str')
test['loc_id'] = test[['lat', 'lng']].apply(lambda x: ','.join(x), axis=1)

In [125]:
# The feasibility of the methodology used by this model is that entries in
# the zindi test set must have corresponding entries in previous years
# so that the forecast can be made based on that
# This cell checks if that condition is met

train_locs =train.loc_id.unique()
testlocs=test.loc_id.unique()
commonlocs = set(testlocs).intersection(set(train_locs))
print(len(commonlocs)==len(testlocs))

True


In [126]:
# Extracting season feature out of the train data's 'date' column
v = train['date'].str.split('-',expand=True)
train['season'] = v[1].astype('float')
def ext_season(mth):
  if mth in [1,2,3]:
    return 1.0
  elif mth in [4,5,6]:
    return 2.0
  elif mth in [7,8,9]:
    return 3.0
  else:
    return 4.0
train['season'] = train['season'].apply(ext_season)

In [127]:
# Creating a year feature for the train data so each year's data can be
# separated. This allows us to easily use a previous year's data to predict
# that of the next
q = train['date'].str.split('-',expand=True)
train['year'] = q[0].astype('float')

In [128]:
# Breaking 2016 data into data for the different seasons in 2016.
# To get a single observation for each unique geometry (identified by 'loc_id'),
# we group by 'loc_id' for each season in 2016 and aggregating by the mean value.
# To also tackle the issue of different 'aqi' values for the same geometry within
# the same year, we take the mode of 'aqi' value for the specific geometry for the
# specific season in 2016
train_2016_winter = train[(train['year']==2016) & (train['season']==1.0)]
train_2016_winter_mean = train_2016_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
train_2016_winter_mode = train_2016_winter.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2016_winter = pd.merge(train_2016_winter_mode, train_2016_winter_mean.drop('aqi',axis=1), on='loc_id')

train_2016_spring = train[(train['year']==2016) & (train['season']==2.0)]
train_2016_spring_mean = train_2016_spring.groupby('loc_id').mean().reset_index()
train_2016_spring_mode = train_2016_spring.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2016_spring = pd.merge(train_2016_spring_mode, train_2016_spring_mean.drop('aqi',axis=1), on='loc_id')

train_2016_summer = train[(train['year']==2016) & (train['season']==3.0)]
train_2016_summer_mean = train_2016_summer.groupby('loc_id').mean().reset_index()
train_2016_summer_mode = train_2016_summer.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2016_summer = pd.merge(train_2016_summer_mode, train_2016_summer_mean.drop('aqi',axis=1), on='loc_id')

train_2016_autumn = train[(train['year']==2016) & (train['season']==4.0)]
train_2016_autumn_mean = train_2016_autumn.groupby('loc_id').mean().reset_index()
train_2016_autumn_mode = train_2016_autumn.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2016_autumn = pd.merge(train_2016_autumn_mode, train_2016_autumn_mean.drop('aqi',axis=1), on='loc_id')

train_2016 = pd.concat([train_2016_winter,train_2016_spring,train_2016_summer,train_2016_autumn],axis=0).reset_index(drop=True)

  train_2016_winter_mean = train_2016_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
  train_2016_spring_mean = train_2016_spring.groupby('loc_id').mean().reset_index()
  train_2016_summer_mean = train_2016_summer.groupby('loc_id').mean().reset_index()
  train_2016_autumn_mean = train_2016_autumn.groupby('loc_id').mean().reset_index()


In [129]:
# Breaking 2017 data into data for the different seasons in 2017.
# To get a single observation for each unique geometry (identified by 'loc_id'),
# we group by 'loc_id' for each season in 2017 and aggregating by the mean value.
# To also tackle the issue of different 'aqi' values for the same geometry within
# the same year, we take the mode of 'aqi' value for the specific geometry for the
# specific season in 2017
train_2017_winter = train[(train['year']==2017) & (train['season']==1.0)]
train_2017_winter_mean = train_2017_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
train_2017_winter_mode = train_2017_winter.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2017_winter = pd.merge(train_2017_winter_mode, train_2017_winter_mean.drop('aqi',axis=1), on='loc_id')

train_2017_spring = train[(train['year']==2017) & (train['season']==2.0)]
train_2017_spring_mean = train_2017_spring.groupby('loc_id').mean().reset_index()
train_2017_spring_mode = train_2017_spring.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2017_spring = pd.merge(train_2017_spring_mode, train_2017_spring_mean.drop('aqi',axis=1), on='loc_id')

train_2017_summer = train[(train['year']==2017) & (train['season']==3.0)]
train_2017_summer_mean = train_2017_summer.groupby('loc_id').mean().reset_index()
train_2017_summer_mode = train_2017_summer.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2017_summer = pd.merge(train_2017_summer_mode, train_2017_summer_mean.drop('aqi',axis=1), on='loc_id')

train_2017_autumn = train[(train['year']==2017) & (train['season']==4.0)]
train_2017_autumn_mean = train_2017_autumn.groupby('loc_id').mean().reset_index()
train_2017_autumn_mode = train_2017_autumn.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2017_autumn = pd.merge(train_2017_autumn_mode, train_2017_autumn_mean.drop('aqi',axis=1), on='loc_id')

train_2017 = pd.concat([train_2017_winter,train_2017_spring,train_2017_summer,train_2017_autumn],axis=0).reset_index(drop=True)

  train_2017_winter_mean = train_2017_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
  train_2017_spring_mean = train_2017_spring.groupby('loc_id').mean().reset_index()
  train_2017_summer_mean = train_2017_summer.groupby('loc_id').mean().reset_index()
  train_2017_autumn_mean = train_2017_autumn.groupby('loc_id').mean().reset_index()


In [130]:
# Breaking 2018 data into data for the different seasons in 2018.
# To get a single observation for each unique geometry (identified by 'loc_id'),
# we group by 'loc_id' for each season in 2018 and aggregating by the mean value.
# To also tackle the issue of different 'aqi' values for the same geometry within
# the same year, we take the mode of 'aqi' value for the specific geometry for the
# specific season in 2018
train_2018_winter = train[(train['year']==2018) & (train['season']==1.0)]
train_2018_winter_mean = train_2018_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
train_2018_winter_mode = train_2018_winter.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2018_winter = pd.merge(train_2018_winter_mode, train_2018_winter_mean.drop('aqi',axis=1), on='loc_id')

train_2018_spring = train[(train['year']==2018) & (train['season']==2.0)]
train_2018_spring_mean = train_2018_spring.groupby('loc_id').mean().reset_index()
train_2018_spring_mode = train_2018_spring.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2018_spring = pd.merge(train_2018_spring_mode, train_2018_spring_mean.drop('aqi',axis=1), on='loc_id')

train_2018_summer = train[(train['year']==2018) & (train['season']==3.0)]
train_2018_summer_mean = train_2018_summer.groupby('loc_id').mean().reset_index()
train_2018_summer_mode = train_2018_summer.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2018_summer = pd.merge(train_2018_summer_mode, train_2018_summer_mean.drop('aqi',axis=1), on='loc_id')

train_2018_autumn = train[(train['year']==2018) & (train['season']==4.0)]
train_2018_autumn_mean = train_2018_autumn.groupby('loc_id').mean().reset_index()
train_2018_autumn_mode = train_2018_autumn.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2018_autumn = pd.merge(train_2018_autumn_mode, train_2018_autumn_mean.drop('aqi',axis=1), on='loc_id')

train_2018 = pd.concat([train_2018_winter,train_2018_spring,train_2018_summer,train_2018_autumn],axis=0).reset_index(drop=True)

  train_2018_winter_mean = train_2018_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
  train_2018_spring_mean = train_2018_spring.groupby('loc_id').mean().reset_index()
  train_2018_summer_mean = train_2018_summer.groupby('loc_id').mean().reset_index()
  train_2018_autumn_mean = train_2018_autumn.groupby('loc_id').mean().reset_index()


In [131]:
# Breaking 2019 data into data for the different seasons in 2019.
# To get a single observation for each unique geometry (identified by 'loc_id'),
# we group by 'loc_id' for each season in 2019 and aggregating by the mean value.
# To also tackle the issue of different 'aqi' values for the same geometry within
# the same year, we take the mode of 'aqi' value for the specific geometry for the
# specific season in 2019
train_2019_winter = train[(train['year']==2019) & (train['season']==1.0)]
train_2019_winter_mean = train_2019_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
train_2019_winter_mode = train_2019_winter.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2019_winter = pd.merge(train_2019_winter_mode, train_2019_winter_mean.drop('aqi',axis=1), on='loc_id')

train_2019_spring = train[(train['year']==2019) & (train['season']==2.0)]
train_2019_spring_mean = train_2019_spring.groupby('loc_id').mean().reset_index()
train_2019_spring_mode = train_2019_spring.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2019_spring = pd.merge(train_2019_spring_mode, train_2019_spring_mean.drop('aqi',axis=1), on='loc_id')

train_2019_summer = train[(train['year']==2019) & (train['season']==3.0)]
train_2019_summer_mean = train_2019_summer.groupby('loc_id').mean().reset_index()
train_2019_summer_mode = train_2019_summer.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2019_summer = pd.merge(train_2019_summer_mode, train_2019_summer_mean.drop('aqi',axis=1), on='loc_id')

train_2019_autumn = train[(train['year']==2019) & (train['season']==4.0)]
train_2019_autumn_mean = train_2019_autumn.groupby('loc_id').mean().reset_index()
train_2019_autumn_mode = train_2019_autumn.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2019_autumn = pd.merge(train_2019_autumn_mode, train_2019_autumn_mean.drop('aqi',axis=1), on='loc_id')

train_2019 = pd.concat([train_2019_winter,train_2019_spring,train_2019_summer,train_2019_autumn],axis=0).reset_index(drop=True)

  train_2019_winter_mean = train_2019_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
  train_2019_spring_mean = train_2019_spring.groupby('loc_id').mean().reset_index()
  train_2019_summer_mean = train_2019_summer.groupby('loc_id').mean().reset_index()
  train_2019_autumn_mean = train_2019_autumn.groupby('loc_id').mean().reset_index()


In [132]:
# Breaking 2020 data into data for the different seasons in 2020.
# To get a single observation for each unique geometry (identified by 'loc_id'),
# we group by 'loc_id' for each season in 2020 and aggregating by the mean value.
# To also tackle the issue of different 'aqi' values for the same geometry within
# the same year, we take the mode of 'aqi' value for the specific geometry for the
# specific season in 2020
train_2020_winter = train[(train['year']==2020) & (train['season']==1.0)]
train_2020_winter_mean = train_2020_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
train_2020_winter_mode = train_2020_winter.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2020_winter = pd.merge(train_2020_winter_mode, train_2020_winter_mean.drop('aqi',axis=1), on='loc_id')

train_2020_spring = train[(train['year']==2020) & (train['season']==2.0)]
train_2020_spring_mean = train_2020_spring.groupby('loc_id').mean().reset_index()
train_2020_spring_mode = train_2020_spring.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2020_spring = pd.merge(train_2020_spring_mode, train_2020_spring_mean.drop('aqi',axis=1), on='loc_id')

train_2020_summer = train[(train['year']==2020) & (train['season']==3.0)]
train_2020_summer_mean = train_2020_summer.groupby('loc_id').mean().reset_index()
train_2020_summer_mode = train_2020_summer.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2020_summer = pd.merge(train_2020_summer_mode, train_2020_summer_mean.drop('aqi',axis=1), on='loc_id')

train_2020_autumn = train[(train['year']==2020) & (train['season']==4.0)]
train_2020_autumn_mean = train_2020_autumn.groupby('loc_id').mean().reset_index()
train_2020_autumn_mode = train_2020_autumn.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2020_autumn = pd.merge(train_2020_autumn_mode, train_2020_autumn_mean.drop('aqi',axis=1), on='loc_id')

train_2020 = pd.concat([train_2020_winter,train_2020_spring,train_2020_summer,train_2020_autumn],axis=0).reset_index(drop=True)

  train_2020_winter_mean = train_2020_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
  train_2020_spring_mean = train_2020_spring.groupby('loc_id').mean().reset_index()
  train_2020_summer_mean = train_2020_summer.groupby('loc_id').mean().reset_index()
  train_2020_autumn_mean = train_2020_autumn.groupby('loc_id').mean().reset_index()


In [133]:
# Breaking 2021 data into data for the different seasons in 2021.
# To get a single observation for each unique geometry (identified by 'loc_id'),
# we group by 'loc_id' for each season in 2021 and aggregating by the mean value.
# To also tackle the issue of different 'aqi' values for the same geometry within
# the same year, we take the mode of 'aqi' value for the specific geometry for the
# specific season in 2021
train_2021_winter = train[(train['year']==2021) & (train['season']==1.0)]
train_2021_winter_mean = train_2021_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
train_2021_winter_mode = train_2021_winter.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2021_winter = pd.merge(train_2021_winter_mode, train_2021_winter_mean.drop('aqi',axis=1), on='loc_id')

train_2021_spring = train[(train['year']==2021) & (train['season']==2.0)]
train_2021_spring_mean = train_2021_spring.groupby('loc_id').mean().reset_index()
train_2021_spring_mode = train_2021_spring.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2021_spring = pd.merge(train_2021_spring_mode, train_2021_spring_mean.drop('aqi',axis=1), on='loc_id')

train_2021_summer = train[(train['year']==2021) & (train['season']==3.0)]
train_2021_summer_mean = train_2021_summer.groupby('loc_id').mean().reset_index()
train_2021_summer_mode = train_2021_summer.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2021_summer = pd.merge(train_2021_summer_mode, train_2021_summer_mean.drop('aqi',axis=1), on='loc_id')

train_2021_autumn = train[(train['year']==2021) & (train['season']==4.0)]
train_2021_autumn_mean = train_2021_autumn.groupby('loc_id').mean().reset_index()
train_2021_autumn_mode = train_2021_autumn.groupby('loc_id')['aqi'].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
train_2021_autumn = pd.merge(train_2021_autumn_mode, train_2021_autumn_mean.drop('aqi',axis=1), on='loc_id')

train_2021 = pd.concat([train_2021_winter,train_2021_spring,train_2021_summer,train_2021_autumn],axis=0).reset_index(drop=True)

  train_2021_winter_mean = train_2021_winter.groupby('loc_id', as_index=False).mean()#.reset_index()
  train_2021_spring_mean = train_2021_spring.groupby('loc_id').mean().reset_index()
  train_2021_summer_mean = train_2021_summer.groupby('loc_id').mean().reset_index()
  train_2021_autumn_mean = train_2021_autumn.groupby('loc_id').mean().reset_index()


In [134]:
# Extracting the features for the zindi test data based on the specific
# geometry given. This submission makes use of 2016 as the previous year
# data
protest = pd.merge(test,train_2021,on=['loc_id','season'],how='left')


In [135]:
# Creating a function that will take a particular year's data and return
# a dataframe of features (i.e the previous year's data) that can be used
# to predict the present year's 'aqi' values (also included in the returned
# dataframe) as a target.
def proc_train(data,prev_df):
  dt = data[['loc_id','season','aqi']]
  ndt= pd.merge(dt,prev_df,on=['loc_id','season'])
  return ndt

In [136]:
# applying above function to process train data for 2017,2018,2019,2020,2021
pretrain2017 = proc_train(train_2017,train_2016)
pretrain2018 = proc_train(train_2018,train_2017)
pretrain2019 = proc_train(train_2019,train_2018)
pretrain2020 = proc_train(train_2020,train_2019)
pretrain2021 = proc_train(train_2021,train_2020)

In [137]:
# combining processed train data for the various years into a single
# train dataframe
combo_train = pd.concat([pretrain2017,pretrain2018,pretrain2019,pretrain2020,pretrain2021]).reset_index(drop=True)

In [138]:
# renaming the target column
# NB: The target column is named 'aqi' and the aqi value of the previous year
# is name 'aqi_y'
combo_train = combo_train.rename(columns={'aqi_x':'aqi'})

In [139]:
# changing the 'aqi' column in the test data to reflect the name of the previous
# year's aqi value is seen in the train data
protest = protest.rename(columns={'aqi':'aqi_y'})

In [140]:
# how aqi values are distributed
combo_train['aqi'].value_counts()

2    460
4    380
3      3
1      3
6      2
Name: aqi, dtype: int64

In [141]:
# selecting only numeric components of the processed train data
combtrain_sel = combo_train.select_dtypes(include=[np.number])

In [142]:
# casting 'season' as string
combtrain_sel['season'] = combtrain_sel['season'].astype('str')

In [143]:
# Getting the 'season' column of the processed test data to look just like
# that of the train data
protest['season'] = protest['season'].astype('float')
protest['season'] = protest['season'].astype('str')

In [144]:
testseason_enc = pd.get_dummies(protest['season'],prefix='ssn')
fi_test = pd.concat([protest,testseason_enc],axis=1)
fi_test.drop('season',axis=1,inplace=True)

In [145]:
trainseason_enc = pd.get_dummies(combtrain_sel['season'],prefix='ssn')
fi_train = pd.concat([combtrain_sel,trainseason_enc],axis=1)
fi_train.drop('season',axis=1,inplace=True)

In [146]:
# making a copy of the train data
data = fi_train.copy()

In [147]:
#check for missing values
datanul = data.isnull().sum()
g=[i for i in datanul if i>0]

print('columns with missing values:%d'%len(g))

columns with missing values:0


In [148]:
# 1. Separating data into features (X) and the target (y)
X = data.drop(['aqi'], axis =1)
y = data['aqi']

print('X')
print(X.head(1))
print()
print('y')
print(y.head())

X
   aqi_y  temperature  precipitation   humidity  global_radiation  \
0      4     10.45557       1.648352  78.513647        126.493361   

   hydrometric_level        N        NE         E        SE  ...  lc_31  \
0          15.792456  2.36179  2.610759  2.443661  1.901245  ...    0.0   

   lc_32  lc_33  lc_41  lc_51    year  ssn_1.0  ssn_2.0  ssn_3.0  ssn_4.0  
0    0.0    0.0    0.0    0.0  2016.0        1        0        0        0  

[1 rows x 62 columns]

y
0    4
1    4
2    4
3    4
4    4
Name: aqi, dtype: int64


In [149]:
#encoding target labels (y) with values between 0 and n_classes-1
#using the LabelEncoder
label_encoder=LabelEncoder()
label_encoder.fit(y)
y_encoded=label_encoder.transform(y)
labels=label_encoder.classes_
classes=np.unique(y_encoded)

In [150]:
labels

array([1, 2, 3, 4, 6])

In [151]:
classes

array([0, 1, 2, 3, 4])

In [152]:
#splitting train data into training and validation sets
X_train,X_test,y_train,y_test=train_test_split(X,y_encoded,test_size=0.2,stratify=y_encoded,random_state = 42)

In [153]:
# setting a random seed  for reproducibility
np.random.seed(5)

In [154]:
X_train.shape

(678, 62)

In [155]:
X_train.columns

Index(['aqi_y', 'temperature', 'precipitation', 'humidity', 'global_radiation',
       'hydrometric_level', 'N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW', 'pm25',
       'pm10', 'o3', 'so2', 'no2', 'pm25_aqi', 'pm10_aqi', 'no2_aqi', 'o3_aqi',
       'so2_aqi', 'utm_x', 'utm_y', 'dtm_milan', 'aspect', 'dusaf15',
       'geologia', 'hillshade', 'ndvi_2019', 'plan_curvature',
       'profile_curvature', 'water_distance', 'slope', 'spi', 'tri', 'twi',
       'geo_0', 'geo_1', 'geo_2', 'geo_3', 'geo_4', 'geo_5', 'geo_6', 'lc_11',
       'lc_12', 'lc_14', 'lc_21', 'lc_22', 'lc_23', 'lc_31', 'lc_32', 'lc_33',
       'lc_41', 'lc_51', 'year', 'ssn_1.0', 'ssn_2.0', 'ssn_3.0', 'ssn_4.0'],
      dtype='object')

### Developing the Model

In [156]:
# Defining function that will run the fitting of the model
def runmodel(model,tuning_params,scorer=make_scorer(accuracy_score),n_iter=60):
    sc=StandardScaler()
    pipe=Pipeline(steps=[('sc',sc),('feature_selection',SelectKBest()),('classifier',model)])
    r_search=RandomizedSearchCV(pipe,tuning_params,n_jobs=-1,verbose=-1,scoring=scorer,cv=10,n_iter=n_iter,random_state=2)
    r_search.fit(X_train,y_train)
    return r_search

In [157]:
# creating an instance of a RandomForestClassifier to fit to the training data
model=RandomForestClassifier(n_jobs=-1,random_state=20,verbose=1)

# defining a parameter search space for hyperparameter tuning
tuning_params= {
    'classifier__n_estimators': randint(50, 501),  # Values between 50 and 500
    'classifier__max_depth': [None] + list(randint(1, 51).rvs(10)),  # Include 'None' for no limit
    'classifier__min_samples_split': randint(2, 21),  # Values between 2 and 20
    'classifier__min_samples_leaf': randint(1, 11),  # Values between 1 and 10
    'classifier__max_features': uniform(0.1, 0.9),  # Values between 0.1 and 1.0 for feature fraction
    'classifier__bootstrap': [True, False],  # True or False for bootstrapping
    'feature_selection__k': randint(1, X.shape[1]),  # Values between 800 and 1500
}


In [158]:
# fitting the model to the data
rf = runmodel(model,tuning_params,n_iter=30)

  f = msb / msw
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 247 out of 247 | elapsed:    0.7s finished


In [159]:
# The best score during training
rfac = rf.best_score_
rfac

0.9852941176470589

In [160]:
# score on the validation set
rftest = accuracy_score(rf.predict(X_test),y_test)
rftest

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 247 out of 247 | elapsed:    0.1s finished


0.9764705882352941

In [161]:
# Predicting on the zindi test data
values= rf.predict(fi_test[X.columns])

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 247 out of 247 | elapsed:    0.0s finished


In [162]:
fi_test

Unnamed: 0,ID,lat,lng,loc_id,aqi_y,temperature,precipitation,humidity,global_radiation,hydrometric_level,...,lc_31,lc_32,lc_33,lc_41,lc_51,year,ssn_1.0,ssn_2.0,ssn_3.0,ssn_4.0
0,ID_000000,45.171919,9.488997,"45.171919,9.488997",4,10.247397,1.675556,74.430607,142.907916,47.479411,...,0.0,0.0,0.0,0.0,0.0,2021.0,1,0,0,0
1,ID_000001,45.171919,9.488997,"45.171919,9.488997",2,16.991453,1.340659,71.873919,201.781224,44.150857,...,0.0,0.0,0.0,0.0,0.0,2021.0,0,1,0,0
2,ID_000002,45.171919,9.488997,"45.171919,9.488997",2,19.848120,1.178261,71.684712,198.580809,43.152518,...,0.0,0.0,0.0,0.0,0.0,2021.0,0,0,1,0
3,ID_000003,45.171919,9.488997,"45.171919,9.488997",4,10.375539,1.447826,85.141734,101.172679,43.410451,...,0.0,0.0,0.0,0.0,0.0,2021.0,0,0,0,1
4,ID_000004,45.281956,8.988563,"45.281956,8.988563",4,10.057587,2.256055,72.166248,137.564512,48.240221,...,0.0,0.0,0.0,0.0,0.0,2021.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,ID_000155,45.607845,8.952897,"45.607845,8.952897",4,9.524755,1.944379,80.527489,97.752702,37.161603,...,0.0,0.0,0.0,0.0,0.0,2021.0,0,0,0,1
156,ID_000156,45.613692,9.508122,"45.613692,9.508122",4,10.119018,2.520000,68.692053,138.100012,46.767520,...,0.0,0.0,0.0,0.0,0.0,2021.0,1,0,0,0
157,ID_000157,45.613692,9.508122,"45.613692,9.508122",2,16.786448,2.536264,64.637347,198.780000,43.489804,...,0.0,0.0,0.0,0.0,0.0,2021.0,0,1,0,0
158,ID_000158,45.613692,9.508122,"45.613692,9.508122",2,19.496790,3.376087,67.136374,191.550968,42.488725,...,0.0,0.0,0.0,0.0,0.0,2021.0,0,0,1,0


In [163]:
# converting the predicted values to the exact aqi values using the label
# encoder object
conv_values = label_encoder.inverse_transform(values)

In [164]:
# presenting predicted values as a dataframe similar to the sample submission
sub['aqi'] = conv_values
sub

Unnamed: 0,ID,aqi
0,ID_000000,4
1,ID_000001,2
2,ID_000002,2
3,ID_000003,4
4,ID_000004,4
...,...,...
155,ID_000155,4
156,ID_000156,4
157,ID_000157,2
158,ID_000158,2


In [165]:
# exporting the submission dataframe to a csv file
sub.to_csv('Submission.csv',index=False)