In [None]:
import pandas as pd
import os

In [None]:
# Load both competition splits from the local data directory.
# GT_NO2 is removed from the training split right after loading
# (presumably the target column, which the test split lacks - TODO confirm).
train = pd.read_csv(os.path.join('data', 'train.csv')).drop(columns=['GT_NO2'])
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [None]:
def extract_date_info(dataframe):
    """Replace the 'Date' column with categorical calendar features.

    The frame is modified in place and also returned. 'Date' is parsed with
    ``format='mixed'`` and then dropped; the added columns (all of dtype
    ``category``) are, in order: DayOfWeek, Month, Year, Week (ISO week
    number) and Season (code produced by ``get_season``).

    Args:
        dataframe: DataFrame containing a 'Date' column.

    Returns:
        The same DataFrame object, without 'Date' and with the new columns.
    """
    dataframe['Date'] = pd.to_datetime(dataframe['Date'], format='mixed')
    dates = dataframe['Date']

    # Column name -> raw values; insertion order fixes the column order.
    calendar_features = {
        'DayOfWeek': dates.dt.dayofweek,
        'Month': dates.dt.month,
        'Year': dates.dt.year,
        'Week': dates.dt.isocalendar().week,
        'Season': get_season(dates),
    }
    for column_name, values in calendar_features.items():
        dataframe[column_name] = values.astype('category')

    dataframe.drop(columns='Date', inplace=True)
    return dataframe

def get_season(date_series):
    """Map every date to a season code: 0 spring, 1 summer, 2 autumn, 3 winter.

    Inclusive boundaries: spring 20 Mar - 20 Jun, summer 21 Jun - 22 Sep,
    autumn 23 Sep - 20 Dec. Any date outside those ranges is coded as winter.

    Args:
        date_series: datetime Series (accessed through ``.dt``).

    Returns:
        Integer Series of season codes sharing the index of ``date_series``.
    """
    # Fold month and day into one MMDD number so that each season is a
    # single contiguous, inclusive range.
    month_day = date_series.dt.month * 100 + date_series.dt.day

    season_codes = pd.Series(3, index=date_series.index)  # winter unless matched below
    season_codes.loc[month_day.between(320, 620)] = 0  # spring
    season_codes.loc[month_day.between(621, 922)] = 1  # summer
    season_codes.loc[month_day.between(923, 1220)] = 2  # autumn

    return season_codes

# Derive the calendar features for both splits. extract_date_info drops
# 'Date', so running this cell again on the processed frames raises KeyError.
train, test = extract_date_info(train), extract_date_info(test)

In [None]:
import requests

def get_elevation(lat, lon, timeout=30):
    """Look up the elevation of one coordinate via the Open-Elevation API.

    Args:
        lat: latitude, interpolated verbatim into the request URL.
        lon: longitude, interpolated verbatim into the request URL.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The ``elevation`` field of the first entry in the response's
        ``results`` list.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    query = f'https://api.open-elevation.com/api/v1/lookup?locations={lat},{lon}'
    response = requests.get(query, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/Key error
    # when the service is rate-limiting or down.
    response.raise_for_status()
    return response.json()['results'][0]['elevation']


# Query the elevation service once per distinct (LAT, LON) pair of each
# split instead of once per row, then broadcast the result back onto the
# full frames with a left merge on the coordinates.
unique_train_locations = (
    train[['LAT', 'LON']]
    .drop_duplicates()
    .assign(Elevation=lambda locations: locations.apply(
        lambda row: get_elevation(row['LAT'], row['LON']), axis=1))
)
unique_test_locations = (
    test[['LAT', 'LON']]
    .drop_duplicates()
    .assign(Elevation=lambda locations: locations.apply(
        lambda row: get_elevation(row['LAT'], row['LON']), axis=1))
)

train = train.merge(unique_train_locations, how='left', on=['LAT', 'LON'])
test = test.merge(unique_test_locations, how='left', on=['LAT', 'LON'])

In [None]:
# Tag each row with its origin (1 = train split, 0 = test split) and stack
# both splits into one frame with a fresh 0..n-1 index.
# NOTE(review): category columns whose category sets differ between the two
# splits come out of concat as object dtype - check the info() output below.
train['is_train'] = 1
test['is_train'] = 0
whole = pd.concat([train, test], ignore_index=True)

whole.info()

In [None]:
# Number of rows per origin label (1 = train split, 0 = test split).
whole['is_train'].value_counts()

In [None]:
# Remove the two identifier columns from the combined frame (in place).
whole.drop(['ID', 'ID_Zindi'], axis='columns', inplace=True)

# After classification
# whole.drop(columns=['LAT', 'LON'], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split


# Features are every column except the origin label; the label is the target.
X = whole.drop(columns='is_train')
y = whole['is_train']

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)
# Re-wrap both feature partitions as DataFrames carrying X's column labels.
X_train, X_test = (
    pd.DataFrame(partition, columns=X.columns) for partition in (X_train, X_test)
)

In [None]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import cross_val_score, GroupKFold


def define_model(trial):
    """Build an unfitted LightGBM binary classifier from one Optuna trial.

    Ten hyperparameters are requested from ``trial``; all remaining settings
    are fixed and identical for every trial.

    Args:
        trial: object exposing ``suggest_int`` and ``suggest_float``.

    Returns:
        ``lgb.LGBMClassifier`` configured with the fixed and suggested values.
    """
    # Settings shared by every trial.
    fixed_params = dict(
        boosting_type='gbdt',
        objective='binary',
        tree_learner='voting',
        device='cpu',
        n_jobs=-1,
        random_state=4,
        verbosity=-1,
        is_unbalance=True,
    )

    # Search space. The suggest calls are issued in the same order as before.
    searched_params = dict(
        max_bin=trial.suggest_int('max_bin', 10, 20),
        num_leaves=trial.suggest_int('num_leaves', 10, 50),
        max_depth=trial.suggest_int('max_depth', 5, 8),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
        n_estimators=trial.suggest_int('n_estimators', 200, 500),
        bagging_fraction=trial.suggest_float('bagging_fraction', 0.6, 1.0, log=True),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.4, 0.8, log=True),
        min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 50, 250),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-2, 1e-1, log=True),
        bagging_freq=trial.suggest_int('bagging_freq', 1, 5),
    )

    return lgb.LGBMClassifier(**fixed_params, **searched_params)

def objective(trial):
    """Optuna objective: mean grouped cross-validation accuracy of one trial.

    The candidate model is cross-validated on the training partition only
    (``X_train`` / ``y_train``). Tuning on the full ``X`` / ``y`` would let
    the rows of ``X_test`` influence the chosen hyperparameters and make the
    later hold-out evaluation optimistic.

    Folds are built with ``GroupKFold`` using 'Season' as the group key and
    one fold per distinct season, so every fold validates on a season that
    was excluded from fitting.

    Args:
        trial: Optuna trial passed through to ``define_model``.

    Returns:
        Mean accuracy over the folds.
    """
    model = define_model(trial)
    seasons = X_train['Season']
    gkf = GroupKFold(n_splits=seasons.nunique())
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        groups=seasons,
        cv=gkf,
        n_jobs=-1,
        scoring='accuracy',
    )
    return scores.mean()

In [None]:
# Maximise the cross-validated accuracy returned by `objective` over 50 trials.
# The sampler is seeded with the same value used for the split and the model
# so that the hyperparameter search is repeatable between runs.
study_lightgbm = optuna.create_study(
    direction='maximize',
    study_name='GeoAIWithLightGBM',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_lightgbm.optimize(objective, n_trials=50)

In [None]:
# Rebuild the classifier from the best trial's stored parameters and fit it
# on the training partition. The variable is named `lightgbm`; the package
# itself is imported as `lgb`, so the two do not clash.
lightgbm = define_model(trial=study_lightgbm.best_trial)
lightgbm.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report


# Per-class precision / recall / F1 on the held-out partition.
print(
    classification_report(
        y_true=y_test,
        y_pred=lightgbm.predict(X_test),
    )
)

In [None]:
# Feature-importance chart for the fitted classifier.
lgb.plot_importance(
    booster=lightgbm,
    figsize=(20, 12),
    dpi=200,
)

In [None]:
# Draw one tree of the fitted model (no tree_index is passed, so the
# library default is used), annotated with the share of data per node.
lgb.plot_tree(
    booster=lightgbm,
    figsize=(20, 12),
    dpi=200,
    show_info=['data_percentage'],
    precision=2,
    orientation='vertical',
)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay


# Confusion matrix of the held-out predictions, greyscale colour map.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=lightgbm.predict(X_test),
    cmap='Greys',
)

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Learning curve on the full data set, using grouped folds with one fold
# per distinct DayOfWeek value.
# NOTE(review): the hyperparameter search groups by 'Season' whereas this
# curve groups by 'DayOfWeek' - confirm the difference is intentional.
LearningCurveDisplay.from_estimator(
    lightgbm,
    X,
    y,
    groups=X['DayOfWeek'],
    cv=GroupKFold(n_splits=X['DayOfWeek'].nunique()),
    scoring='accuracy',
    n_jobs=-1,
    random_state=4,
)

In [None]:
# Persist the per-location elevation tables next to the input data.
# The frame index is written as the first (unnamed) CSV column.
unique_train_locations.to_csv(os.path.join('data', 'train_elevation.csv'), index=True)
unique_test_locations.to_csv(os.path.join('data', 'test_elevation.csv'), index=True)