In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Wczytanie danych

In [None]:
# Read both splits from the local ``data`` directory.
train, test = (
    pd.read_csv(os.path.join('data', 'train.csv')),
    pd.read_csv(os.path.join('data', 'test.csv')),
)

# Summary of column dtypes and non-null counts of the training split.
train.info()

In [None]:
# Preview the first rows of the training split.
train.head()

### Rozbicie daty na składowe

In [None]:
def change_date(dataframe):
    """Parse ``date`` and attach categorical calendar features, in place.

    ``date`` is converted with ``pd.to_datetime``. ``dayofweek``, ``week``
    (taken from ``isocalendar()``) and ``quater`` (the quarter; the spelling is
    the column name used by the rest of the notebook) are derived from it.
    The already existing ``month`` and ``hour`` columns are not recomputed,
    they are only cast to ``category``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``date``, ``month`` and ``hour`` columns. It is mutated.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in.
    """
    parsed = pd.to_datetime(dataframe['date'])
    calendar = parsed.dt
    dataframe['date'] = parsed
    dataframe['dayofweek'] = calendar.dayofweek.astype('category')
    dataframe['week'] = calendar.isocalendar().week.astype('category')
    dataframe['month'] = dataframe['month'].astype('category')
    dataframe['quater'] = calendar.quarter.astype('category')
    dataframe['hour'] = dataframe['hour'].astype('category')
    return dataframe


# Enrich both splits; change_date mutates its argument and returns it.
train = change_date(train)
test = change_date(test)

### Wykres przedstawiający wartość pm2_5 w kolejnych kwartałach roku

In [None]:
# Bar chart of pm2_5 per quarter; the title is set on the Axes seaborn returns.
sns.barplot(data=train, x='quater', y='pm2_5').set_title('Jakość powietrza w kwartałach')

### Wykres przedstawiający jakość powietrza w krajach afrykańskich

In [None]:
# Line chart of pm2_5 over the parsed ``date`` column.
sns.lineplot(data=train, x='date', y='pm2_5').set_title('Jakość powietrza w kolejnych dniach')

### Wykres przedstawiający wartość pm2_5 w zarejestrowanych godzinach

In [None]:
# Bar chart of pm2_5 per recorded hour.
sns.barplot(data=train, x='hour', y='pm2_5').set_title('Jakość powietrza w poszczególnych godzinach')

### Wykres przedstawiający wartość pm2_5 w zależności od dnia tygodnia

In [None]:
# Bar chart of pm2_5 per day of the week.
sns.barplot(data=train, x='dayofweek', y='pm2_5').set_title('Jakość powietrza w każdym dniu tygodnia')

### Wykres przedstawiający wartość pm2_5 w zależności od miesiąca

In [None]:
# Bar chart of pm2_5 per month.
sns.barplot(data=train, x='month', y='pm2_5').set_title('Jakość powietrza w każdym miesiącu')

### Korelacja wybranych kolumn z pm2_5

In [None]:
# Annotated correlation matrix of the selected columns, including the pm2_5 target.
sns.heatmap(
    train[
        ['month', 'dayofweek', 'hour', 'site_latitude', 'site_longitude', 'cloud_surface_albedo', 'quater', 'pm2_5']
    ].corr(),
    annot=True,
    cmap='Greys',
)

## <center>Czyszczenie danych</center>
### 1. Imputacja, usuwanie kolumn oraz inne cuda

In [None]:
from sklearn.impute import KNNImputer


def fill_x(column_name='site_latitude'):
    """Impute the global ``train`` frame group by group with the KNN imputers.

    For every distinct value of ``train[column_name]`` and every prefix in the
    global ``starts_with`` list, the rows having that value are restricted to
    the columns whose name starts with the prefix; ``imputers[i]`` is fitted on
    that slice and the result is written back into ``train``.

    Each imputer is re-fitted for every group, so after the call it holds the
    fit of the last group it processed.

    If fitting or writing back raises ``ValueError`` the rows of that group are
    removed from ``train`` and its index is reset.
    NOTE(review): presumably this happens when the imputer output no longer has
    one column per input column (e.g. a column that is entirely NaN within the
    group) - confirm.

    Parameters
    ----------
    column_name : str
        Column of ``train`` whose distinct values define the groups.
    """
    for group_value in train[column_name].unique():
        for position, prefix in enumerate(starts_with):
            group_columns = [name for name in train.columns if name.startswith(prefix)]
            in_group = train[column_name] == group_value
            subset = train.loc[in_group, group_columns].copy()
            if subset.empty:
                continue
            try:
                train.loc[in_group, group_columns] = imputers[position].fit_transform(subset)
            except ValueError:
                # Give up on this group: remove its rows and renumber the index.
                train.drop(index=subset.index, inplace=True)
                train.reset_index(drop=True, inplace=True)

def fill_test(column_name='site_latitude'):
    """Impute the global ``test`` frame group by group with the KNN imputers.

    Mirrors ``fill_x`` for the test split, but only calls ``transform``: no
    fitting happens here, so every entry of the global ``imputers`` list must
    have been fitted beforehand.

    NOTE(review): ``fill_x`` re-fits each imputer for every group, so the state
    used here is the one left by the last training group processed for that
    prefix, not by the group matching the current value - verify this is
    intended before enabling the call.

    Parameters
    ----------
    column_name : str
        Column of ``test`` whose distinct values define the groups.
    """
    for group_value in test[column_name].unique():
        for position, prefix in enumerate(starts_with):
            group_columns = [name for name in test.columns if name.startswith(prefix)]
            in_group = test[column_name] == group_value
            subset = test.loc[in_group, group_columns].copy()
            if subset.empty:
                continue
            test.loc[in_group, group_columns] = imputers[position].transform(subset)

def drop_high_nans(dataframe):
    """Remove, in place, the columns with a large share of NaN values.

    A column is removed when at least 90 % of its entries are NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to prune; it is mutated.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    nan_share = dataframe.isna().sum() / len(dataframe)
    sparse_columns = nan_share[nan_share >= 0.9].index.tolist()
    dataframe.drop(sparse_columns, axis=1, inplace=True)
    return dataframe

def drop_high_correlated_columns():
    """List numeric columns of the global ``train`` that nearly duplicate an earlier one.

    Despite the name nothing is dropped here: the function only returns the
    names. The absolute correlation matrix of the numeric columns is masked to
    its strict upper triangle, so each pair is looked at once and a column is
    reported when its absolute correlation with any column placed before it
    exceeds 0.99.

    Returns
    -------
    list
        Column names to remove.
    """
    abs_corr = train.corr(numeric_only=True).abs()
    above_diagonal = np.triu(np.ones_like(abs_corr, dtype=np.bool_), k=1)
    flagged = (abs_corr.where(above_diagonal) > 0.99).any()
    return flagged[flagged].index.tolist()

def drop_low_correlated_columns_to_pm2_5():
    """List columns of the global ``train`` that barely correlate with ``pm2_5``.

    Despite the name nothing is dropped here: the function only returns the
    names of the columns whose correlation with ``pm2_5`` lies strictly between
    -0.01 and 0.01. Columns with an undefined (NaN) correlation are not
    reported.

    Returns
    -------
    numpy.ndarray
        Column names to remove.
    """
    target_corr = train.corr()['pm2_5']
    weak = target_corr.abs() < 0.01
    return target_corr[weak].index.to_numpy()

def subract_azimuth_zenith(dataframe):
    """Replace each zenith/azimuth column pair by their difference, in place.

    Columns whose name contains ``'zenith'`` and columns whose name contains
    ``'azimuth'`` are collected in column order and paired by position. For
    each pair a ``<token0>_<token1>_diff`` column (the first two ``_``-separated
    tokens of the zenith column name) is added holding ``zenith - azimuth``,
    and both source columns are removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to transform; it is mutated.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    IndexError
        If there are more zenith than azimuth columns, or a zenith column name
        has no ``_`` separator.
    """
    zenith_columns = [name for name in dataframe.columns if 'zenith' in name]
    azimuth_columns = [name for name in dataframe.columns if 'azimuth' in name]
    for position, zenith in enumerate(zenith_columns):
        azimuth = azimuth_columns[position]
        tokens = zenith.split('_')
        dataframe[f'{tokens[0]}_{tokens[1]}_diff'] = dataframe[zenith] - dataframe[azimuth]
        dataframe.drop(columns=[zenith], inplace=True)
        dataframe.drop(columns=[azimuth], inplace=True)
    return dataframe

def change_to_categorical():
    """Cast every column named in the global ``categorical`` list to ``category``.

    The cast is applied to the global ``train`` and ``test`` frames, column by
    column, by assigning the converted column back into each frame.
    """
    for name in categorical:
        for frame in (train, test):
            frame[name] = frame[name].astype('category')


# Columns cast to ``category`` by change_to_categorical().
categorical = ['hour', 'month', 'dayofweek', 'site_latitude', 'site_longitude']
# Keep the ids before the column is removed from ``test``; they are re-attached later.
test_ids = test['id']
train.drop(columns=['id', 'city', 'country', 'date', 'site_id'], inplace=True)
test.drop(columns=['id', 'city', 'country', 'date', 'site_id'], inplace=True)
# Unique first "_"-separated token of every remaining column name, minus the
# tokens that do not denote a group to impute.
starts_with = train.columns.str.split('_', expand=True).levels[0].to_frame()
starts_with.drop(['month', 'hour', 'pm2', 'site'], inplace=True)
starts_with = starts_with[0].tolist()

# One imputer per column-name prefix.
imputers = [KNNImputer(n_neighbors=15, weights='distance') for _ in range(len(starts_with))]
# Decide which sparse columns to remove on the training split only and mirror
# that decision on the test split. Evaluating the NaN share on each split
# separately can leave them with different columns, which breaks the drop below
# and any transformer fitted on ``train`` and applied to ``test``.
train = drop_high_nans(train)
test = test.drop(columns=[col for col in test.columns if col not in train.columns])
change_to_categorical()
# fill_x(), fill_test()
to_drop = drop_low_correlated_columns_to_pm2_5()
to_drop = np.concatenate((to_drop, drop_high_correlated_columns()), axis=0)
train, test = train.drop(columns=to_drop), test.drop(columns=to_drop)
# train, test = subract_azimuth_zenith(train), subract_azimuth_zenith(test)

### Wykresy pudełkowe wskazujące wartości odstające

In [None]:
from sympy import divisors


def plot_boxplots():
    """Draw one figure of box plots per column-name prefix of the global ``train``.

    For every prefix in the global ``starts_with`` list that matches more than
    one column, a grid of subplots is created with one box plot per matching
    column. The grid shape is taken from the two middle divisors of the number
    of columns (the same divisor twice when the divisor count is odd), so that
    ``rows * cols`` equals the number of columns.
    Assumes ``divisors`` returns the divisors in ascending order - TODO confirm.
    """
    for prefix in starts_with:
        group_columns = [name for name in train.columns if name.startswith(prefix)]
        if len(group_columns) <= 1:
            continue
        factors = divisors(len(group_columns))
        middle = len(factors) // 2
        n_rows = factors[middle - 1] if len(factors) % 2 == 0 else factors[middle]
        n_cols = factors[middle]
        fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(40, 30), squeeze=False)
        fig.suptitle(prefix, fontsize=25)
        for position, name in enumerate(group_columns):
            # Fill the grid row by row.
            row, col = divmod(position, n_cols)
            train[name].plot(kind='box', ax=axes[row, col], fontsize=15)
        plt.show()


# Names of the remaining training columns that contain 'number_density'.
vertical_columns = [col for col in train.columns if 'number_density' in col]

# plot_boxplots()

### 2. Wskazanie kwantyli, od których są outliery

In [None]:
from scipy.stats import zscore

# Z-score of every pm2_5 value in the training split.
detect_outliers = zscore(train['pm2_5'])

# Z-score reached at 20 evenly spaced quantile levels between 0.9 and 1.
quantiles = pd.DataFrame({
    'quantile': np.linspace(0.9, 1, 20),
    'zscore': [np.quantile(detect_outliers, level) for level in np.linspace(0.9, 1, 20)],
})
quantiles

### 3. Usunięcie wartości odstających

In [None]:
def del_pm2_5_outliers():
    """Drop from the global ``train`` the rows whose pm2_5 z-score is an outlier.

    Uses the global ``detect_outliers`` (one z-score per row of ``train``).
    Rows whose z-score is greater than both the 0.01 and the 0.95 quantile of
    the z-scores are removed, and the index of ``train`` is reset afterwards.

    The rows are selected by position through a boolean mask, so the function
    does not depend on ``train`` having a 0..n-1 index.

    Raises
    ------
    ValueError
        If ``detect_outliers`` does not have one entry per row of ``train``
        (e.g. the cell is re-run after rows were already removed).
    """
    scores = np.asarray(detect_outliers)
    if len(scores) != len(train):
        raise ValueError(
            f'detect_outliers has {len(scores)} entries but train has {len(train)} rows; '
            'recompute the z-scores before removing outliers'
        )
    q1, q2 = np.quantile(scores, 0.01), np.quantile(scores, 0.95)
    # NOTE(review): this is exactly what the former chained comparison
    # ``q1 < el > q2`` evaluated to, i.e. only the upper tail is removed and
    # q1 has no effect. If a two-sided trim was intended, use
    # ``(scores < q1) | (scores > q2)`` instead.
    is_outlier = (scores > q1) & (scores > q2)
    train.drop(index=train.index[is_outlier], inplace=True)
    train.reset_index(drop=True, inplace=True)


# Remove the outlier rows from the global ``train`` frame (mutates it in place).
del_pm2_5_outliers()

# Re-inspect row count and dtypes after the removal.
train.info()

In [None]:
# Preview the first rows of the cleaned training split.
train.head()

## <center>Selekcja cech</center>

In [None]:
from sklearn.feature_selection import RFECV, RFE, SelectKBest, mutual_info_regression, f_regression
from sklearn.ensemble import RandomForestRegressor


def plot_feature_importance(sc, num_of_features):
    """Bar-plot the ``num_of_features`` best features of a fitted selector.

    Parameters
    ----------
    sc : fitted feature selector
        ``RFE`` / ``RFECV`` instances are read through ``ranking_``; any other
        selector is expected to expose ``scores_``. ``feature_names_in_`` is
        used for the labels in both cases.
    num_of_features : int
        Number of features to show.

    Notes
    -----
    ``ranking_`` assigns rank 1 to the selected (best) features, so rankings
    are sorted ascending, whereas ``scores_`` are sorted descending. Sorting
    both descending would show the worst-ranked features for RFE/RFECV.
    """
    uses_ranking = isinstance(sc, (RFECV, RFE))
    values = sc.ranking_ if uses_ranking else sc.scores_
    scores = dict(zip(sc.feature_names_in_, values))
    # Lower is better for rankings, higher is better for scores.
    scores = sorted(scores.items(), key=lambda item: item[1], reverse=not uses_ranking)[:num_of_features]
    scores_df = pd.DataFrame(scores, columns=['Feature', 'Score'])

    scores_df.plot(kind='bar', x='Feature', y='Score', figsize=(10, 6), rot=90, title='Oceny wybranych cech')
    plt.xlabel('Cecha')
    plt.ylabel('Ocena')


# selector = RFE(
#     estimator=RandomForestRegressor(
#         n_estimators=700, 
#         max_depth=7, 
#         random_state=4, 
#         n_jobs=-1, 
#         oob_score=True,
#         warm_start=True
#     ),
#     n_features_to_select=k,
# )
# k = 17
# selector = RFECV(
#     estimator=RandomForestRegressor(
#         n_estimators=400, 
#         max_depth=10, 
#         random_state=4, 
#         n_jobs=-1, 
#         oob_score=True, 
#         warm_start=True, 
#         ccp_alpha=1e-4
#     ),
#     min_features_to_select=k, 
#     cv=10, 
#     scoring='neg_root_mean_squared_error',
#     n_jobs=-1
# )
# selector.fit(train, y)
# train, test = selector.transform(train), selector.transform(test)
# 
# plot_feature_importance(selector, k)

## <center>Transformacja danych</center>
### 1. Standaryzacja danych

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Bez kategorycznych
# scale_columns = train.columns.difference(categorical)
# 
# scaler = make_column_transformer((StandardScaler(), scale_columns))
# 
# X_cat, test_cat = train[categorical], test[categorical]
# 
# train = pd.concat([pd.DataFrame(scaler.fit_transform(train[scale_columns]), columns=scaler.feature_names_in_), X_cat], axis=1)
# test = pd.concat([pd.DataFrame(scaler.transform(test[scale_columns]), columns=scaler.feature_names_in_), test_cat], axis=1)
# Wszystkie kolumny
scaler = StandardScaler()

# Round the target to 3 decimals and keep it aside; it is not scaled.
train['pm2_5'] = train['pm2_5'].round(3)
train_pm2_5 = train['pm2_5']
train_features = train.drop(columns='pm2_5')
# Fit on the training features only, then apply the same scaling to the test split.
# The original index is carried over because pd.concat(axis=1) aligns on index
# labels; a fresh 0..n-1 index would silently misalign rows otherwise.
train_scaled = pd.DataFrame(
    scaler.fit_transform(train_features),
    columns=scaler.feature_names_in_,
    index=train_features.index,
)
test_scaled = pd.DataFrame(
    scaler.transform(test),
    columns=scaler.feature_names_in_,
    index=test.index,
)
# Re-attach the target to train and the ids to test.
train = pd.concat([train_scaled, train_pm2_5], axis=1)
test = pd.concat([test_ids, test_scaled], axis=1)

### Zapis przekształconych danych

In [None]:
# Create the output directory if it is missing; exist_ok makes re-runs a no-op
# without listing the working directory first.
os.makedirs('transform', exist_ok=True)
train.to_csv(os.path.join('transform', 'train.csv'), index=False)
test.to_csv(os.path.join('transform', 'test.csv'), index=False)