In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
# Make no GPU visible to TensorFlow (empty device list for the 'GPU' type),
# so everything below runs on CPU.
tf.config.set_visible_devices([], 'GPU')

In [None]:
# Load the full, train and test datasets; the 'date' column is discarded from each.
full_df = pd.read_csv('gold_recovery_full_new.csv').drop(columns=['date'])
train_df = pd.read_csv('gold_recovery_train_new.csv').drop(columns=['date'])
test_df = pd.read_csv('gold_recovery_test_new.csv').drop(columns=['date'])

In [None]:
# Fill in the missing values
def knn_impute_missing_values(dfs, n_neighbors=3):
    """Impute NaNs in the numeric columns of every frame in ``dfs``, in place.

    A separate ``KNNImputer`` is fitted on each frame's own numeric columns,
    and the imputed values are written back into that same frame.

    Args:
        dfs: Iterable of DataFrames to modify in place.
        n_neighbors: Number of neighbours passed to ``KNNImputer``.

    Returns:
        None. The frames are mutated.
    """
    for frame in dfs:
        # Non-numeric columns are left untouched.
        numeric_part = frame.select_dtypes(include=[np.number])
        knn = KNNImputer(n_neighbors=n_neighbors)
        frame[numeric_part.columns] = knn.fit_transform(numeric_part)

# Impute all three frames in place; each frame gets its own imputer fitted on
# its own numeric columns.
knn_impute_missing_values([full_df, train_df, test_df])

# Verify that no missing values remain
def check_missing_values(dfs):
    """Assert that no frame in ``dfs`` contains a missing value.

    Args:
        dfs: Iterable of DataFrames to inspect.

    Raises:
        AssertionError: On the first frame that still holds at least one NaN.
    """
    for frame in dfs:
        nan_total = frame.isna().to_numpy().sum()
        assert nan_total == 0

# Fail fast if any of the three frames still contains a NaN.
check_missing_values([full_df, train_df, test_df])

In [None]:
# Helper functions for cleaning up columns
def drop_missing_columns(df1, df2):
    """Drop from ``df1``, in place, every column that ``df2`` does not have.

    The two target columns 'rougher.output.recovery' and
    'final.output.recovery' are always kept, whether or not ``df2`` has them.

    Args:
        df1: DataFrame to prune (mutated in place).
        df2: DataFrame whose columns define what may stay.

    Returns:
        None.
    """
    allowed = list(df2.columns) + ['rougher.output.recovery', 'final.output.recovery']
    extra = df1.columns[~df1.columns.isin(allowed)]
    df1.drop(columns=extra, inplace=True)

def get_missing_columns(df1, df2):
    """Return the columns of ``df1`` that are absent from ``df2``.

    Args:
        df1: DataFrame whose columns are examined.
        df2: DataFrame providing the reference set of columns.

    Returns:
        A list of column labels, in ``df1`` column order.
    """
    absent_mask = ~df1.columns.isin(df2.columns)
    return df1.columns[absent_mask].tolist()

In [None]:
# Drop from full_df (in place) every column that test_df lacks, except the two
# recovery target columns.
drop_missing_columns(full_df, test_df)

In [None]:
# List the columns of full_df that test_df lacks; only the two target columns
# are expected to show up here.
get_missing_columns(full_df, test_df)

In [None]:
# full_df now holds only the training features plus the two target columns.
# Build two datasets from it: one for the rougher target, one for the final target.
#
# Both targets are removed from BOTH feature matrices. The earlier column clean-up
# treats 'rougher.output.recovery' as a target that test_df does not provide, so
# keeping it as an input of the final model would leak a value that is not
# available at prediction time.
X1 = full_df.drop(['final.output.recovery', 'rougher.output.recovery'], axis=1)
y1 = full_df['rougher.output.recovery']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

X2 = full_df.drop(['final.output.recovery', 'rougher.output.recovery'], axis=1)
y2 = full_df['final.output.recovery']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [None]:
# Implement metrics

# Metric function (same formula as the reference implementation, kept for a fair comparison)
def smape(target, predictions):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``mean(|t - p| / ((|t| + |p|) / 2) * 100)`` over all elements.
    An element where target and prediction are both zero is a 0/0 division;
    it contributes 0 to the mean instead of turning the whole result into NaN.

    Args:
        target: True values (tensor or array-like).
        predictions: Predicted values (tensor or array-like).

    Returns:
        A scalar tensor holding the mean sMAPE.
    """
    predictions = tf.convert_to_tensor(predictions)
    if not predictions.dtype.is_floating:
        # Integer inputs would otherwise trigger integer division semantics/errors.
        predictions = tf.cast(predictions, tf.float32)
    # Align dtypes the same way Keras losses do, so mixed float32/float64 inputs work.
    target = tf.cast(target, predictions.dtype)

    abs_error = tf.abs(target - predictions)
    scale = (tf.abs(target) + tf.abs(predictions)) / 2
    # divide_no_nan returns 0 wherever the denominator is 0.
    per_element = tf.math.divide_no_nan(abs_error, scale) * 100

    return tf.reduce_mean(per_element)

# Final (combined) metric
def total_smape(smape_rougher, smape_final):
    """Combine both sMAPE values into one score: 25% rougher plus 75% final.

    Args:
        smape_rougher: sMAPE of the rougher-stage model.
        smape_final: sMAPE of the final-stage model.

    Returns:
        ``0.25 * smape_rougher + 0.75 * smape_final``.
    """
    rougher_part = 0.25 * smape_rougher
    final_part = 0.75 * smape_final
    return rougher_part + final_part


In [None]:
# Define the trainer class
class TrainClass:
    """Build, train and evaluate a fully connected Keras regressor on one split.

    The network is four tanh ``Dense`` layers (128, 64, 64, 64 units), each
    followed by ``Dropout(0.2)``, and a single linear output unit. It is
    compiled with Adam, MSE loss and the notebook-level ``smape`` metric.
    Features are standardised in the constructor with a ``StandardScaler``
    fitted on the training features only.
    """

    def __init__(self, X_train, y_train, X_test, y_test) -> None:
        """Store the split, build and compile the model, then scale the features.

        Args:
            X_train: Training features; ``X_train.shape[1]`` sets the input width.
            y_train: Training targets.
            X_test: Hold-out features.
            y_test: Hold-out targets.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

        # Fully connected network
        self.model = Sequential([
            Dense(128, activation='tanh', input_shape=(X_train.shape[1],)),
            Dropout(0.2),
            Dense(64, activation='tanh'),
            Dropout(0.2),
            Dense(64, activation='tanh'),
            Dropout(0.2),
            Dense(64, activation='tanh'),
            Dropout(0.2),
            Dense(1)
        ])

        # MSE is the training loss; sMAPE is only tracked as a metric.
        self.model.compile(optimizer=Adam(), loss='mse', metrics=[smape])

        # Replace the stored feature matrices with their standardised versions.
        self.scale()

    def scale(self):
        """Standardise features in place on the instance.

        The scaler is fitted on ``self.X_train`` only and then applied to
        ``self.X_test``, so no hold-out statistics enter the fit.
        """
        scaler = StandardScaler()
        self.X_train = scaler.fit_transform(self.X_train)
        self.X_test = scaler.transform(self.X_test)

    def validate(self, validate_set=True):
        """Evaluate the model and print its loss and sMAPE.

        Args:
            validate_set: If True, evaluate on the hold-out split; otherwise
                evaluate on the training split.

        Returns:
            Tuple ``(loss, metric)`` as returned by ``model.evaluate``.
        """
        if validate_set:
            label, features, targets = "Test", self.X_test, self.y_test
        else:
            # Label the output by the split actually evaluated, so train-set
            # numbers are not reported as "Test".
            label, features, targets = "Train", self.X_train, self.y_train

        loss_value, metric_value = self.model.evaluate(features, targets)
        print(f"{label} loss:", loss_value)
        print(f"{label} metric (smape):", metric_value)
        return loss_value, metric_value

    def train(self, epochs=int(1e3), batch_size=32, verbose=1):
        """Fit the model and keep the Keras history in ``self.history``.

        The hold-out split is passed as validation data, so the history also
        contains the ``val_*`` curves.

        Args:
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
            verbose: Keras verbosity level, forwarded to ``model.fit``.
        """
        self.history = self.model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_test, self.y_test),
            epochs=epochs,
            batch_size=batch_size,
            verbose=verbose,  # previously accepted but silently ignored
        )

In [None]:
# Training-history rendering function
def plot_history(history1, history2, title1, title2, metric):
    """Plot training and validation curves of one metric for two models side by side.

    Args:
        history1: Keras ``History`` object of the first model (left panel).
        history2: Keras ``History`` object of the second model (right panel).
        title1: Title of the left panel.
        title2: Title of the right panel.
        metric: Key in ``history.history``; the validation curve is read
            from ``'val_' + metric``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    val_key = f'val_{metric}'

    # One panel per model: training curve first, then validation curve.
    for axis, run, heading in zip(axes, (history1, history2), (title1, title2)):
        axis.plot(run.history[metric], label='Training Metric')
        axis.plot(run.history[val_key], label='Validation Metric')
        axis.set_title(heading)
        axis.set_xlabel('Epoch')
        axis.set_ylabel('sMAPE')
        axis.legend()

    plt.show()

In [None]:
# Build one trainer per target: split 1 predicts the rougher recovery,
# split 2 predicts the final recovery.
rougher_model = TrainClass(X_train1, y_train1, X_test1, y_test1)
final_model = TrainClass(X_train2, y_train2, X_test2, y_test2)

In [None]:
# Train both models with the default settings of TrainClass.train
# (1000 epochs, batch size 32).
rougher_model.train()
final_model.train()

In [None]:
# Plot the training vs. validation sMAPE curves of both models side by side.
plot_history(rougher_model.history, final_model.history, 'Rougher', 'Final', 'smape')

In [None]:
# Print total results

# Evaluate both models on their hold-out splits (the loss is discarded), then
# combine the two sMAPE values with weights 0.25 (rougher) and 0.75 (final).
_, smape_rougher = rougher_model.validate(validate_set=True)
_, smape_final = final_model.validate(validate_set=True)
totalSmape = total_smape(smape_rougher, smape_final)
print()
print(f'Total sMAPE: {totalSmape}')