## <center>Analiza poziomu PM2.5 w afrykańskich miastach</center>
### Zespół:
<ol>
    <li style='font-size: 20px'>Hubert Kłosowski 242424</li>
    <li style='font-size: 20px'>Krzysztof Kolanek 242425</li>
    <li style='font-size: 20px'>Kamil Małecki 242464</li>
</ol>

### Potrzebne importy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Wczytanie danych

In [None]:
# Load the training and test sets. Forward slashes are accepted on Windows,
# Linux and macOS alike; the previous backslash-separated paths only resolved
# on Windows.
data = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')

data.info()

In [None]:
# Preview the first rows of the training frame.
data.head()

### Rozbicie daty na składowe

In [None]:
def change_date(dataframe):
    """Parse the ``date`` column and add integer calendar features in place.

    The frame is modified in place and also returned:
    * ``date`` is converted with ``pd.to_datetime``,
    * ``day`` is created from the parsed date's day of week (``int64``),
    * the existing ``month`` column is cast to ``int64``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``date`` and ``month`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    parsed_dates = pd.to_datetime(dataframe['date'])
    dataframe['date'] = parsed_dates
    dataframe['day'] = parsed_dates.dt.dayofweek.astype(np.int64)
    dataframe['month'] = dataframe['month'].astype(np.int64)
    return dataframe


# Apply the same date handling to the training and the test frame.
data, test = change_date(data), change_date(test)

### Wykres przedstawiający jakość powietrza w krajach afrykańskich

In [None]:
# PM2.5 over time, one line per country; the title is set on the Axes that
# seaborn returns instead of going through the pyplot state machine.
ax = sns.lineplot(data=data, x='date', y='pm2_5', hue='country')
ax.set_title('Jakość powietrza z podziałem na kraje')

### Wykres przedstawiający wartość pm2_5 w zarejestrowanych godzinach

In [None]:
# PM2.5 per recorded hour, bars grouped by country.
ax = sns.barplot(data=data, x='hour', y='pm2_5', hue='country')
ax.set_title('Jakość powietrza w poszczególnych godzinach z podziałem na kraje')

### Wykres przedstawiający wartość pm2_5 w zależności od dnia tygodnia

In [None]:
# PM2.5 per day of week, bars grouped by country.
ax = sns.barplot(data=data, x='day', y='pm2_5', hue='country')
ax.set_title('Jakość powietrza w każdym dniu tygodnia z podziałem na kraje')

### Wykres przedstawiający wartość pm2_5 w zależności od miesiąca

In [None]:
# PM2.5 per month, bars grouped by country.
# The title previously repeated the day-of-week wording from the cell above
# even though the x axis shows months.
sns.barplot(data=data, x='month', y='pm2_5', hue='country')
plt.title('Jakość powietrza w każdym miesiącu z podziałem na kraje')

### Korelacje poszczególnych grup kolumn

In [None]:
def correlation():
    """Show one annotated correlation heatmap per column-name prefix.

    Reads the module-level ``data`` frame and ``starts_with`` prefix list.
    For every prefix, the columns whose name starts with it are combined with
    the ``pm2_5`` column; groups that end up with a single column are skipped.
    """
    for prefix in starts_with:
        group = [name for name in data.columns if name == 'pm2_5' or name.startswith(prefix)]
        if len(group) <= 1:
            continue
        fig, ax = plt.subplots(figsize=(10, 10))
        group_corr = data[group].corr()
        sns.heatmap(group_corr, annot=True, fmt='.2f', cmap='viridis', ax=ax)
        plt.tight_layout()
        plt.show()
        
def drop_high_correlated_columns(dataframe, threshold=0.9):
    """Return a copy of ``dataframe`` without redundant, highly correlated columns.

    The pairwise correlation matrix of the numeric columns is computed and only
    its strict upper triangle is inspected, so for every correlated pair the
    column that appears later in the frame is the one removed.

    The absolute value of the correlation is used: a coefficient of -0.95
    carries the same redundancy as +0.95, and the previous signed comparison
    never removed strongly negatively correlated columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; non-numeric columns are ignored by the correlation and
        are always kept.
    threshold : float, default 0.9
        Columns whose absolute correlation with an earlier column is greater
        than or equal to this value are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    matrix = dataframe.corr(numeric_only=True).abs()
    # k=1 excludes the diagonal, so a column is never compared with itself.
    upper_mask = np.triu(np.ones(matrix.shape, dtype=bool), k=1)
    upper = matrix.where(upper_mask)
    to_drop = [column for column in upper.columns if (upper[column] >= threshold).any()]
    return dataframe.drop(columns=to_drop)


# Keep the submission ids before the identifier columns are dropped below.
final_ids = test['id']

# Column-name prefixes (the text before the first underscore), sorted and
# de-duplicated. The calendar helper columns and the target prefix are excluded.
starts_with = sorted(
    {name.split('_')[0] for name in data.columns} - {'month', 'day', 'hour', 'pm2'}
)

# Decide which columns are redundant on the training data only and remove the
# same columns from the test set. De-correlating the two frames independently
# can leave them with different feature columns, which breaks every later step
# that is fitted on train and applied to test.
data_reduced = drop_high_correlated_columns(data)
redundant = [name for name in data.columns if name not in data_reduced.columns]
test = test.drop(columns=redundant, errors='ignore')  # 'ignore': test has no target column
data = data_reduced

id_columns = ['id', 'city', 'country', 'site_id', 'date']
data = data.drop(columns=id_columns)
test = test.drop(columns=id_columns)

correlation()

## <center>Czyszczenie danych</center>

### 1. Uzupełnienie wartości brakujących

In [None]:
from sklearn.impute import KNNImputer

def fill_based_on(dataframe, date_unit='day'):
    """Impute values group by group, separately for each value of ``date_unit``.

    For every distinct value in ``dataframe[date_unit]`` and every prefix in
    the module-level ``starts_with`` list, the rows with that value and the
    columns whose name starts with the prefix are passed to the matching entry
    of the module-level ``imputers`` list (``fit_transform``), and the result
    is written back into the frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to fill; modified in place.
    date_unit : str, default 'day'
        Name of the column used to split the rows into groups.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for unit_value in dataframe[date_unit].unique():
        row_mask = dataframe[date_unit] == unit_value
        for position, prefix in enumerate(starts_with):
            group = [name for name in dataframe.columns if name.startswith(prefix)]
            subset = dataframe.loc[row_mask, group]
            if subset.empty:
                # No matching columns (or rows) for this prefix; nothing to impute.
                continue
            dataframe.loc[row_mask, group] = imputers[position].fit_transform(subset)
    return dataframe

def prepare_dataframe(dataframe):
    """Drop, in place, the columns made up almost entirely of NaN values.

    A column is removed when the fraction of its NaN entries is at least 0.9.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    row_count = len(dataframe)
    mostly_missing = [
        name for name in dataframe.columns
        if dataframe[name].isna().sum() / row_count >= 0.9
    ]
    dataframe.drop(mostly_missing, axis=1, inplace=True)
    return dataframe


# One independent KNN imputer per column-name prefix.
imputers = [KNNImputer(n_neighbors=15, weights='distance') for _ in starts_with]

# Drop near-empty columns first, then fill the remaining gaps.
data = prepare_dataframe(data)
test = prepare_dataframe(test)
data = fill_based_on(data)
test = fill_based_on(test)

# Per-column (lower, upper) pairs: plot_boxplots draws them as reference lines
# (scaled by the column's standard deviation) and del_outliers compares them
# with z-scores.
ver = {
    'sulphurdioxide_so2_column_number_density': (
        -0.001 / data['sulphurdioxide_so2_column_number_density'].std(),
        4,
    ),
    'carbonmonoxide_co_column_number_density': (2.25, 6),
    'nitrogendioxide_no2_column_number_density': (1, 10),
    'nitrogendioxide_stratospheric_no2_column_number_density': (19.5, 26),
}

### Wykresy pudełkowe wskazujące wartości odstające

In [None]:
from sympy import divisors

def plot_boxplots():
    """Draw a grid of box plots for every column group with more than one column.

    Groups are the columns of the module-level ``data`` frame that share a
    prefix from ``starts_with``. The grid shape is taken from the two middle
    divisors of the group size (the same divisor twice when the size is a
    perfect square). For columns listed in ``ver`` two horizontal lines are
    added: red at ``upper * std`` and green at ``lower * std``.
    """
    for group_name in starts_with:
        group_columns = [name for name in data.columns if name.startswith(group_name)]
        if len(group_columns) <= 1:
            continue

        divs = divisors(len(group_columns))
        middle = len(divs) // 2
        # An even divisor count gives two distinct middle divisors; an odd
        # count means the group size is a perfect square.
        rows = divs[middle - 1] if len(divs) % 2 == 0 else divs[middle]
        cols = divs[middle]

        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(40, 30), squeeze=False)
        fig.suptitle(group_name, fontsize=25)
        for position, column in enumerate(group_columns):
            row_idx, col_idx = divmod(position, cols)
            axis = axes[row_idx][col_idx]
            data[column].plot(kind='box', ax=axis, fontsize=15)
            if column in ver:
                lower, upper = ver[column]
                spread = data[column].std()
                axis.axhline(y=upper * spread, color='red')
                axis.axhline(y=lower * spread, color='green')
        plt.show()


# plot_boxplots()

### 2. Usunięcie wartości odstających

In [None]:
from scipy.stats import zscore

def del_outliers(dataframe, ranges=None):
    """Remove, in place, the rows flagged as outliers by per-column z-scores.

    For every ``column -> (lower, upper)`` entry the z-scores of the column are
    computed and the flagged rows are dropped; the index is reset after each
    column so the next column works on a clean 0..n-1 index.

    The index is also reset *before* the z-scores are matched to rows. The
    previous version used the positions of the z-score vector as row labels,
    which removed the wrong rows (or raised ``KeyError``) whenever the frame
    did not already carry a default integer index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter; modified in place.
    ranges : dict or None, default None
        Mapping of column name to ``(lower, upper)`` z-score bounds. When
        omitted, the module-level ``ver`` mapping is used.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with a fresh integer index.
    """
    if ranges is None:
        ranges = ver
    for column, (lower, upper) in ranges.items():
        # Make row labels equal to row positions before matching z-scores.
        dataframe.reset_index(drop=True, inplace=True)
        scores = np.asarray(zscore(dataframe[column]))
        # NOTE(review): this keeps the meaning of the original chained
        # comparison `lower <= z >= upper`: a row is flagged only when its
        # z-score is >= BOTH bounds, so values below `lower` are never removed.
        # Confirm whether low-side filtering was intended before changing it.
        flagged = (scores >= lower) & (scores >= upper)
        dataframe.drop(index=dataframe.index[flagged], inplace=True)
        dataframe.reset_index(drop=True, inplace=True)
    return dataframe


# Filter the training frame only, then print its summary.
data = del_outliers(data)

data.info()

In [None]:
# Preview the training frame after outlier removal.
data.head()

## <center>Selekcja cech</center>

In [None]:
from sklearn.feature_selection import SelectKBest, RFECV, RFE, mutual_info_regression, f_regression
from sklearn.ensemble import RandomForestRegressor

def plot_feature_importance(sc, num_of_features):
    """Plot the ``num_of_features`` best features of a fitted selector as bars.

    ``RFE`` / ``RFECV`` expose ``ranking_`` where rank 1 marks the selected
    (best) features, so those are sorted ascending. Any other selector is
    expected to expose ``scores_`` where a higher value is better, so those
    are sorted descending. Sorting rankings descending, as was done before,
    plotted the worst-ranked features as if they were the best.

    Parameters
    ----------
    sc : fitted selector
        Must provide ``feature_names_in_`` and either ``ranking_`` (RFE/RFECV)
        or ``scores_``.
    num_of_features : int
        Number of bars to draw.
    """
    uses_ranking = isinstance(sc, (RFECV, RFE))
    values = sc.ranking_ if uses_ranking else sc.scores_
    scores = sorted(
        zip(sc.feature_names_in_, values),
        key=lambda x: x[1],
        reverse=not uses_ranking,  # rank: lower is better; score: higher is better
    )[:num_of_features]
    scores_df = pd.DataFrame(scores, columns=['Feature', 'Score'])

    ax = scores_df.plot(kind='bar', x='Feature', y='Score', figsize=(10, 6), rot=90, title='Oceny wybranych cech')
    ax.set_xlabel('Cecha')
    ax.set_ylabel('Ocena')


# Separate the target from the feature matrix.
y = data['pm2_5']
X = data.drop(columns=['pm2_5'])

k = 25  # lower bound on the number of features the selector may keep

# Base estimator handed to the selector for recursive feature elimination.
forest = RandomForestRegressor(
    n_estimators=700,
    max_depth=7,
    random_state=4,
    n_jobs=-1,
    oob_score=True,
    warm_start=True,
)

# Cross-validated recursive feature elimination scored by (negated) RMSE.
# A plain RFE(estimator=forest, n_features_to_select=k) can be swapped in
# when exactly k features are wanted without cross-validation.
selector = RFECV(
    estimator=forest,
    min_features_to_select=k,
    cv=10,
    scoring='neg_root_mean_squared_error',
)
selector.fit(X, y)

# Keep only the selected features in both sets.
X = selector.transform(X)
test = selector.transform(test)

plot_feature_importance(selector, k)

## <center>Transformacja danych</center>

### Potrzebne importy

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

### 1. Wybór sposobu preprocessingu danych

In [None]:
# Scale the training features and apply the same fitted scaler to the test set.
# NOTE(review): the scaler is fitted on all of X before the train/hold-out
# split made in a later cell, so the hold-out rows influence the scaling
# statistics. Consider fitting on the training split only.
scaler = StandardScaler()

X = scaler.fit_transform(X, y)
test = scaler.transform(test)

### 2. Podział na zbiór testowy i treningowy

In [None]:
# Hold out 30 % of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

## <center>Część obliczeniowa</center>

### Potrzebne importy

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error

### Otrzymanie najlepszych parametrów

In [None]:
def give_the_best(clf):
    """Grid-search ``clf`` and return the best estimator found.

    The search uses the module-level ``params`` grid (which must be defined
    before this function is called), 5-fold cross-validation and negated RMSE
    as the score, and is fitted on the module-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn estimator to tune.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(
        clf,
        params,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        cv=5,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

def save_to_csv(y_pred, save_as):
    """Write a submission file pairing the saved test ids with predictions.

    The file is written to ``result/<save_as>``. The path is built with
    ``pathlib`` so it works on every operating system (the previous
    ``'result\\...'`` string only addressed a directory on Windows), and the
    directory is created when missing.

    Parameters
    ----------
    y_pred : array-like
        One prediction per row of the module-level ``final_ids`` series.
    save_as : str
        File name inside the ``result`` directory.
    """
    from pathlib import Path

    output_dir = Path('result')
    output_dir.mkdir(parents=True, exist_ok=True)

    predictions = pd.DataFrame.from_dict({'pm2_5': y_pred})
    # Align ids and predictions by position, not by index label.
    final_df = pd.concat([final_ids.reset_index(drop=True), predictions], axis=1)
    final_df.to_csv(output_dir / save_as, index=False)

### <center>Regresja przy użyciu MLP</center>

In [None]:
# params = {
#     'hidden_layer_sizes': [(99, 141, 75)],
#     'activation': ['relu'],
#     'solver': ['adam'],
#     'max_iter': [1000],
#     'alpha': np.linspace(0.0001, 0.001, 10),
#     'batch_size': [64, 128, 256],
#     'learning_rate_init': np.linspace(0.001, 0.01, 10),
#     'warm_start': [True],
#     'early_stopping': [True],
#     'validation_fraction': [0.1]
# }
# 
# mlp = give_the_best(MLPRegressor())
# save_to_csv(mlp.predict(test), 'mlp.csv')
# print('Parametry MLP: ', mlp.get_params())
# print('RMSE: ', root_mean_squared_error(y_test, mlp.predict(X_test)))

### <center>PyTorch</center>

### Potrzebne importy

In [None]:
import torch
from torch import nn, optim

### 1. Wybór karty graficznej do nauki modelu

In [None]:
# Pick the best available backend: CUDA first, then Apple MPS, then the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Move every split onto the chosen device as float tensors.
X_train_tensor = torch.tensor(X_train, dtype=torch.float, device=device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float, device=device)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float, device=device)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float, device=device)
test_tensor = torch.tensor(test, dtype=torch.float, device=device)

### 2. Architektura sieci neuronowej

In [None]:
class Net(nn.Module):
    """Fully connected regression network with a single output.

    Architecture: ``in_features -> 101 -> 145 -> 45 -> 1`` with ReLU
    activations and dropout (p=0.2) after the first two hidden layers.

    Parameters
    ----------
    in_features : int or None, default None
        Width of the input layer. When omitted, it is read from the
        module-level ``X_train_tensor`` (its second dimension), which keeps
        ``Net()`` working exactly as before while allowing the class to be
        built without that global.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            in_features = X_train_tensor.shape[1]
        # The attribute name and layer order determine the state_dict keys
        # ('layers.0.weight', ...), so they are kept unchanged.
        self.layers = nn.Sequential(
            nn.Linear(in_features, 101),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(101, 145),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(145, 45),
            nn.ReLU(),
            nn.Linear(45, 1),
        )

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        return self.layers(x)
    
def rmse_loss(y_true, y_pred):
    """Root mean squared error between two tensors with the same element count.

    Both tensors are flattened first. Without this, an ``(N,)`` target and an
    ``(N, 1)`` prediction broadcast to an ``(N, N)`` difference matrix and
    produce a wrong loss without any error.

    Parameters
    ----------
    y_true, y_pred : torch.Tensor
        Targets and predictions; any shapes holding the same number of elements.

    Returns
    -------
    torch.Tensor
        Scalar tensor with the RMSE.

    Raises
    ------
    ValueError
        If the two tensors hold a different number of elements.
    """
    y_true = y_true.reshape(-1)
    y_pred = y_pred.reshape(-1)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'rmse_loss expects the same number of elements, got {y_true.numel()} and {y_pred.numel()}'
        )
    return torch.sqrt(torch.mean((y_true - y_pred) ** 2))


model = Net().to(device=device)
criterion = rmse_loss
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=1e-4,
    amsgrad=True,
    # The device cell can fall back to 'mps' or 'cpu'. The fused Adam kernel
    # is requested only on CUDA because PyTorch releases differ in which
    # devices they support for it, and an unconditional fused=True can raise
    # on the other devices.
    fused=(device == 'cuda'),
)

### 3. Nauka sieci neuronowej na zbiorze treningowym

In [None]:
from torch.utils.data import DataLoader


model.train()

batch_size = 128
num_epochs = 150

# Append the target as the last column so that a single DataLoader shuffles
# features and targets together.
final_train_tensor = torch.concat((X_train_tensor, y_train_tensor.unsqueeze(dim=1)), dim=1)
dataset = DataLoader(final_train_tensor, batch_size=batch_size, shuffle=True)
epoch_losses = [{'epoch': i, 'rmse': 0} for i in range(num_epochs)]
num_batches = len(dataset)

for epoch, record in enumerate(epoch_losses):
    running_loss = 0
    for batch in dataset:
        # The last column holds the target, everything before it the features.
        inputs = batch[:, :-1]
        targets = batch[:, -1]

        optimizer.zero_grad()
        loss = criterion(targets, model(inputs).squeeze())
        running_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean of the per-batch RMSE values for this epoch.
    mean_rmse = running_loss / num_batches
    record['rmse'] = mean_rmse
    print(f'Epoch: [{epoch + 1}/{num_epochs}] RMSE: {mean_rmse:.4f}')

### Jakość sieci neuronowej

In [None]:
# Training curve: mean per-batch RMSE recorded for every epoch.
sns.lineplot(data=pd.DataFrame(epoch_losses), x='epoch', y='rmse')

### 4. Testowanie sieci neuronowej

In [None]:
# Switch to evaluation mode (disables dropout) and score the hold-out split
# without tracking gradients.
model.eval()

with torch.no_grad():
    pred = model(X_test_tensor)
    loss = criterion(y_test_tensor, pred.squeeze())

print(f'RMSE: {loss.item():.4f}')

### Parametry modelu

In [None]:
# List every entry of the state dict with its shape, then persist the weights.
print("Parametry modelu:")
for param_name, param_value in model.state_dict().items():
    print(param_name, "\t", param_value.size())

torch.save(model.state_dict(), 'model.pt')

## <center>Do wysłania</center>

In [None]:
# Predict on the competition test set and write the submission file.
# eval() is set here explicitly so dropout is disabled even when this cell is
# run without the evaluation cell having been executed first.
model.eval()
with torch.no_grad():
    final_pred = model(test_tensor)
    save_to_csv(final_pred.squeeze().numpy(force=True), 'nn.csv')

### Dodatkowe informacje
<ol>
    <li>The 15km SO2 band is ingested only when solar_zenith_angle < 70.</li>
    <li>Because of noise on the data, negative vertical column values are often observed in particular over clean regions or for low SO2 emissions. It is recommended not to filter these values except for outliers, i.e. for vertical columns lower than -0.001 mol/m^2.</li>
</ol>