## GeoAI Ground-level NO2 Estimation Challenge by ITU

### Author: Hubert Kłosowski 242424

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Load data

In [None]:
# Read the train and test tables from CSV files in the local `data` directory.
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

# Print column dtypes and non-null counts of the training table.
train.info()

In [None]:
# Print column dtypes and non-null counts of the test table.
test.info()

### Handle NaNs in the target column

In [None]:
# Disabled alternative, kept for reference: fill the target by interpolation,
# at most one consecutive missing value per gap.
# train['GT_NO2'].interpolate(inplace=True, limit=1)
# Active choice: remove every training row that has no GT_NO2 value.
train.dropna(subset=['GT_NO2'], inplace=True)

### Extract date info

In [None]:
def extract_date_info(dataframe):
    """Replace the 'Date' column with numeric calendar features.

    The 'Date' values are parsed as datetimes and expanded into 'DayOfWeek'
    (Monday is 0), 'Month', 'Year' and ISO 'Week' columns. The 'Date' column
    is then removed. The frame is modified in place and also returned.
    """
    stamps = pd.to_datetime(dataframe['Date'])
    calendar_parts = {
        'DayOfWeek': stamps.dt.dayofweek,
        'Month': stamps.dt.month,
        'Year': stamps.dt.year,
        'Week': stamps.dt.isocalendar().week,
    }
    for name, values in calendar_parts.items():
        dataframe[name] = values
    del dataframe['Date']
    return dataframe

# Apply the same date feature extraction to both tables.
train = extract_date_info(train)
test = extract_date_info(test)

### Target values in day of week

In [None]:
# Bar chart of GT_NO2 aggregated per day of week (seaborn's default estimator).
sns.barplot(data=train, x='DayOfWeek', y='GT_NO2')

### Target values in months

In [None]:
# Bar chart of GT_NO2 aggregated per month (seaborn's default estimator).
sns.barplot(data=train, x='Month', y='GT_NO2')

### Target values in years

In [None]:
# Bar chart of GT_NO2 aggregated per year (seaborn's default estimator).
sns.barplot(data=train, x='Year', y='GT_NO2')

### Target values in weeks

In [None]:
# Bar chart of GT_NO2 aggregated per ISO week (seaborn's default estimator).
sns.barplot(data=train, x='Week', y='GT_NO2')

### Map of measurement locations in the train and test datasets

In [None]:
import folium


# Base map centred on the mean latitude/longitude of the training rows.
my_map = folium.Map(
    location=(train['LAT'].mean(), train['LON'].mean()),
    zoom_start=7,
)

### Train Locations

In [None]:
# One row per distinct (LAT, LON) pair, with the mean GT_NO2 recorded there.
unique_train_locations = (
    train.groupby(['LAT', 'LON'])['GT_NO2']
    .mean()
    .reset_index()
)

# Green marker per training location; the layer stays hidden until toggled on.
layer_train_map = folium.FeatureGroup(name='Train Locations', show=False)
for lat, lon, mean_no2 in zip(
    unique_train_locations['LAT'],
    unique_train_locations['LON'],
    unique_train_locations['GT_NO2'],
):
    marker = folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(color='green', icon='home'),
        popup=f'Mean GT_NO2 level: {mean_no2:.2f}',
    )
    marker.add_to(layer_train_map)

layer_train_map.add_to(my_map)

### Test Locations

In [None]:
# Distinct (LAT, LON) pairs that occur in the test table.
unique_test_locations = test[['LAT', 'LON']].drop_duplicates()

# Red marker per test location; the layer stays hidden until toggled on.
layer_test_map = folium.FeatureGroup(name='Test Locations', show=False)
for lat, lon in zip(unique_test_locations['LAT'], unique_test_locations['LON']):
    marker = folium.Marker(
        location=[lat, lon],
        icon=folium.Icon(color='red', icon='home'),
    )
    marker.add_to(layer_test_map)

layer_test_map.add_to(my_map)

In [None]:
# Add the layer toggle control, write the map to an HTML file, and leave the
# map object as the cell's last expression so the notebook displays it.
folium.LayerControl().add_to(my_map)
my_map.save('my_map.html')

my_map

### Prepare data

In [None]:
# Keep the submission identifiers before the ID columns are removed.
test_ids = test['ID_Zindi']

# Renumber the training rows 0..n-1 (rows were removed earlier).
train.reset_index(drop=True, inplace=True)

# Remove the identifier columns from both tables.
for frame in (train, test):
    frame.drop(columns=['ID', 'ID_Zindi'], inplace=True)

### Correlation Matrix

In [None]:
# Annotated greyscale heatmap of the pairwise correlations between all columns
# of the training table.
plt.figure(figsize=(12, 10))
sns.heatmap(train.corr(), annot=True, cmap='Greys')

### Scatter plots of columns from original dataset

In [None]:
def plot_scatter():
    """Scatter-plot GT_NO2 against each of the first ten columns of `train`.

    Panels are laid out five per row. The number of rows is found by ceiling
    division and the axes array is always 2-D, so the function works for any
    column count from 1 to 10, not only for exactly ten. Panels left over in
    a partially filled last row are hidden. Does nothing if `train` has no
    columns.
    """
    per_row = 5
    columns = list(train.columns[:10])
    if not columns:
        return
    n_rows = -(-len(columns) // per_row)  # ceiling division
    # squeeze=False keeps `ax` 2-D even when there is a single row.
    fig, ax = plt.subplots(nrows=n_rows, ncols=per_row, figsize=(25, 15), squeeze=False)
    for i, col in enumerate(columns):
        x_cord, y_cord = divmod(i, per_row)
        panel = ax[x_cord, y_cord]
        sns.scatterplot(data=train, x=col, y='GT_NO2', ax=panel, s=2)
        panel.set_title(f'Correlation between {col} and GT_NO2')
        panel.set_xlabel(col)
        panel.set_ylabel('GT_NO2')
    # Hide the panels that received no plot.
    for spare in ax.flat[len(columns):]:
        spare.set_visible(False)


plot_scatter()

### Distribution of values

In [None]:
# Histogram of the target values.
train['GT_NO2'].plot(kind='hist')

### Identify outliers in target column using zscore

In [None]:
from scipy.stats import zscore


# Z-score of every GT_NO2 value in the training table.
detect_outliers = zscore(train['GT_NO2'])

# Upper-tail table: 21 quantile levels from 0.98 to 1.00, with the z-score and
# the raw GT_NO2 value found at each level.
levels = np.linspace(0.98, 1, 21)
quantiles = pd.DataFrame({
    'quantile': levels,
    'zscore': np.quantile(detect_outliers, levels),
    'GT_NO2': np.quantile(train['GT_NO2'], levels),
})
quantiles

### Deleting outliers in the top 1.4% of target values

In [None]:
def del_pm2_5_outliers():
    """Drop training rows whose target z-score lies above the 0.986 quantile.

    Despite its name, the function works on the module-level
    `detect_outliers` z-scores (computed from GT_NO2) and modifies the
    module-level `train` frame in place: rows in the upper 1.4% tail are
    removed and the index is renumbered from zero.

    Rows are paired with z-scores by position, so the outcome does not depend
    on which index labels `train` currently carries.

    Raises:
        ValueError: if `detect_outliers` and `train` differ in length, for
            example when this is re-run without recomputing the z-scores.
    """
    scores = np.asarray(detect_outliers)
    if len(scores) != len(train):
        raise ValueError(
            f'detect_outliers has {len(scores)} values but train has '
            f'{len(train)} rows; recompute the z-scores first'
        )
    upper = np.quantile(scores, 0.986)
    # Boolean mask on the index selects the rows by position, not by label.
    train.drop(index=train.index[scores > upper], inplace=True)
    train.reset_index(drop=True, inplace=True)


# Remove the upper-tail target outliers from `train`.
del_pm2_5_outliers()

# Print the table summary again to show the remaining row count.
train.info()

### Training process

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Separate the predictors from the GT_NO2 target.
y = train['GT_NO2']
X = train.drop(columns=['GT_NO2'])

# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4,
)

# Rebuild both feature frames with the predictor column order made explicit.
X_train, X_test = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_test)
)

In [None]:
import lightgbm as lgb
import optuna
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, GroupKFold


def define_lightgbm_model(trial):
    """Build an LGBMRegressor whose hyperparameters are drawn from `trial`.

    `trial` must provide `suggest_int` and `suggest_float`. The objective,
    boosting type, tree learner, device, thread count, seed and verbosity are
    fixed; every other setting is requested from `trial` within the ranges
    given below.
    """
    pick_int = trial.suggest_int
    pick_float = trial.suggest_float
    return lgb.LGBMRegressor(
        objective='root_mean_squared_error',
        boosting_type='gbdt',
        max_bin=pick_int('max_bin', 10, 200),
        num_leaves=pick_int('num_leaves', 50, 300),
        max_depth=pick_int('max_depth', 5, 12),
        learning_rate=pick_float('learning_rate', 1e-3, 9e-2, log=True),
        n_estimators=pick_int('n_estimators', 200, 700),
        tree_learner='voting',
        bagging_fraction=pick_float('bagging_fraction', 0.6, 1.0, log=True),
        colsample_bytree=pick_float('colsample_bytree', 0.4, 0.8, log=True),
        min_data_in_leaf=pick_int('min_data_in_leaf', 50, 250),
        reg_lambda=pick_float('reg_lambda', 1e-2, 1, log=True),
        bagging_freq=pick_int('bagging_freq', 1, 5),
        device='cpu',
        n_jobs=-1,
        random_state=4,
        verbosity=-1,
    )

def objective_lightgbm(trial):
    """Return the mean cross-validated RMSE of the model proposed by `trial`.

    Rows are grouped by 'DayOfWeek' and the number of folds equals the number
    of distinct weekday values in `X`.
    """
    weekdays = X['DayOfWeek']
    splitter = GroupKFold(n_splits=weekdays.nunique())
    neg_rmse = cross_val_score(
        define_lightgbm_model(trial),
        X,
        y,
        groups=weekdays,
        cv=splitter,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error',
    )
    # The scorer yields negated RMSE; flip the sign so that lower is better.
    return -neg_rmse.mean()

In [None]:
# Minimise the cross-validated RMSE over 200 trials. The sampler is seeded with
# the same seed used for the model and the data split, so the sequence of
# suggested hyperparameters is repeatable between runs.
study_lightgbm = optuna.create_study(
    direction='minimize',
    study_name='AirQualityWithLightGBM',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_lightgbm.optimize(objective_lightgbm, n_trials=200)

In [None]:
# Rebuild the model from the best trial's stored hyperparameters.
lgb_model = define_lightgbm_model(study_lightgbm.best_trial)
lightgbm_params = ['max_bin', 'num_leaves', 'max_depth', 'learning_rate', 'n_estimators', 'bagging_fraction', 'colsample_bytree', 'min_data_in_leaf']

# Score on the hold-out split with a model that has NOT seen those rows.
# Fitting on all of X first would put X_test in the training data and make
# the reported RMSE optimistic.
# NOTE(review): the hyperparameter search itself cross-validated on all of X,
# so this hold-out score is still not fully independent of the tuning.
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
holdout_rmse = root_mean_squared_error(y_test, lgb_pred)

# Refit on every available row so later cells use a model trained on all data.
lgb_model.fit(X, y)

holdout_rmse

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Learning curve for the tuned model: 10-fold cross-validation, scored by
# negated RMSE, computed in parallel with a fixed seed.
LearningCurveDisplay.from_estimator(lgb_model, X, y, cv=10, n_jobs=-1, random_state=4, scoring='neg_root_mean_squared_error')

### Best params

In [None]:
# Show the hyperparameter values of the best trial found by the study.
study_lightgbm.best_params

In [None]:
def save_to_csv(y_pred, save_as):
    """Write the predictions next to their test identifiers as a CSV file.

    The file is written to `result/<save_as>` under the current working
    directory; the `result` directory is created when missing. The first
    column holds the module-level `test_ids` values (named after that
    Series), the second column 'GT_NO2' holds `y_pred`.

    Identifiers and predictions are paired by position, so index labels on
    either input cannot misalign the rows.

    Args:
        y_pred: one predicted value per test identifier, in the same order.
        save_as: file name of the CSV inside the `result` directory.

    Raises:
        ValueError: if `y_pred` and `test_ids` differ in length.
    """
    os.makedirs('result', exist_ok=True)
    final_df = pd.DataFrame({
        test_ids.name: test_ids.to_numpy(),
        'GT_NO2': np.asarray(y_pred),
    })
    final_df.to_csv(os.path.join('result', save_as), index=False)
    
# Predict the test table with the fitted model and write the submission file.
save_to_csv(lgb_model.predict(test), 'lightgbm.csv')