In [None]:
import yaml
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold
from sklearn.metrics import mean_squared_error
import pandas as pd
import pickle

class XGBoostModel:
	"""XGBoost regressor whose training and saving are driven by a YAML config.

	The methods read the ``xgboost_model`` section of the config, using its
	``parameters``, ``train_params``, ``hyperparameter_tuning`` and
	``model_saving`` entries.
	"""

	def __init__(self, config_path):
		"""Load the YAML configuration stored at ``config_path``."""
		with open(config_path, 'r') as file:
			self.config = yaml.safe_load(file)

	def train(self, X_train, y_train):
		"""Fit a model on the training data and store it in ``self.model``.

		When ``hyperparameter_tuning.enable`` is truthy, ``self.model`` becomes a
		fitted ``RandomizedSearchCV`` (5-fold) wrapped around an ``XGBRegressor``.
		Otherwise a 5-fold ``xgb.cv`` run is used to pick the number of boosting
		rounds, and ``self.model`` becomes an ``XGBRegressor`` fitted with that
		many estimators.

		Args:
			X_train: Training features.
			y_train: Training targets.
		"""
		model_cfg = self.config['xgboost_model']
		model_params = model_cfg['parameters']
		train_params = model_cfg['train_params']
		tuning = model_cfg['hyperparameter_tuning']

		if tuning['enable']:
			self.model = RandomizedSearchCV(
				estimator=xgb.XGBRegressor(**model_params),
				param_distributions=tuning['param_grid'],
				n_iter=tuning['n_iter'],
				scoring=tuning['scoring'],
				cv=5,
				random_state=tuning['random_state'],
				verbose=2,
			)
			self.model.fit(X_train, y_train, **train_params)
		else:
			# train_params take precedence over model_params on key clashes.
			params = {**model_params, **train_params}
			num_boost_round = params['n_estimators']

			# These settings are handed to xgb.cv as keyword arguments below.
			# They are not booster parameters, so leaving them in the params
			# dict only produces "parameters not used" warnings.
			for cv_only_key in ('n_estimators', 'early_stopping_rounds', 'verbose_eval'):
				params.pop(cv_only_key, None)

			# The DMatrix is needed by xgb.cv only, so it is built in this
			# branch rather than unconditionally.
			dtrain = xgb.DMatrix(X_train, label=y_train)

			xgb_cv = xgb.cv(
				params,
				dtrain,
				num_boost_round=num_boost_round,
				nfold=5,
				early_stopping_rounds=train_params['early_stopping_rounds'],
				verbose_eval=train_params['verbose_eval'],
			)

			# The length of the CV result is used as the number of estimators
			# for the final model.
			self.model = xgb.XGBRegressor(**model_params)
			self.model.set_params(n_estimators=len(xgb_cv))
			self.model.fit(X_train, y_train)

	def save_model(self):
		"""Pickle ``self.model`` to ``<directory>/<filename>`` taken from the config."""
		saving_cfg = self.config['xgboost_model']['model_saving']
		directory = saving_cfg['directory']
		filename = saving_cfg['filename']
		# The context manager guarantees the file is flushed and closed even
		# if pickling raises.
		with open(f"{directory}/{filename}", 'wb') as file:
			pickle.dump(self.model, file)


		
	



In [5]:
import pandas as pd
# Load the test set (with elevation) from the raw data directory.
with open('data/raw/Test_data_with_elevation.csv', 'r') as csv_file:
	test_df = pd.read_csv(csv_file)

# Collect the names of all columns that hold at least one missing value.
has_missing = test_df.isnull().any()
missing_cols = test_df.columns[has_missing].to_list()

print(missing_cols)

['sulphurdioxide_so2_column_number_density', 'sulphurdioxide_so2_column_number_density_amf', 'sulphurdioxide_so2_slant_column_number_density', 'sulphurdioxide_cloud_fraction', 'sulphurdioxide_sensor_azimuth_angle', 'sulphurdioxide_sensor_zenith_angle', 'sulphurdioxide_solar_azimuth_angle', 'sulphurdioxide_solar_zenith_angle', 'sulphurdioxide_so2_column_number_density_15km', 'carbonmonoxide_co_column_number_density', 'carbonmonoxide_h2o_column_number_density', 'carbonmonoxide_cloud_height', 'carbonmonoxide_sensor_altitude', 'carbonmonoxide_sensor_azimuth_angle', 'carbonmonoxide_sensor_zenith_angle', 'carbonmonoxide_solar_azimuth_angle', 'carbonmonoxide_solar_zenith_angle', 'nitrogendioxide_no2_column_number_density', 'nitrogendioxide_tropospheric_no2_column_number_density', 'nitrogendioxide_stratospheric_no2_column_number_density', 'nitrogendioxide_no2_slant_column_number_density', 'nitrogendioxide_tropopause_pressure', 'nitrogendioxide_absorbing_aerosol_index', 'nitrogendioxide_cloud_f