In [None]:
#Импорты
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
import datetime
from cfg.config import DATA_PATH

In [None]:
# Read CSV files
def read_files(file_list, separator):
    """Parse several delimited text files into DataFrames.

    Args:
        file_list: Mapping of result key -> path or buffer that is handed to
            ``pd.read_csv``.
        separator: Field delimiter, passed to ``pd.read_csv`` as ``sep``.

    Returns:
        A dict with the same keys as ``file_list``; each value is the parsed
        DataFrame. The first row of every file is used as the header.
    """
    return {
        name: pd.read_csv(source, sep=separator, header=0)
        for name, source in file_list.items()
    }

In [None]:
# Load the parsed data: one DataFrame per file found in DATA_PATH["parsed"].
import os

parsed_path = DATA_PATH["parsed"]

# Map "<file name up to the first dot>" -> full path.
# - os.path.join works whether or not the configured path ends with a separator
#   (plain string concatenation silently produced a wrong path without it).
# - Directories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store) are
#   skipped because they cannot be parsed as CSV.
# - sorted() makes the load order independent of the file system.
parsed_files = {
    file_name.split(".")[0]: os.path.join(parsed_path, file_name)
    for file_name in sorted(os.listdir(parsed_path))
    if not file_name.startswith(".")
    and os.path.isfile(os.path.join(parsed_path, file_name))
}

# Reuse the notebook's CSV helper instead of repeating the read_csv call.
data = read_files(parsed_files, ";")
    

In [None]:
# Join the "ofd_2023" and "weather_2023" frames on "day_of_year".
# how="left" keeps every row of "ofd_2023", even when there is no matching
# weather row (the weather columns are then NaN).
#
# The merge is done exactly once. It used to sit inside a loop over
# data.items(), which added a key to `data` while iterating over it
# (RuntimeError) and combined frames with `|`, which is a bitwise OR rather
# than a concatenation.
data["res_2023"] = data["ofd_2023"].merge(
    data["weather_2023"], on="day_of_year", how="left"
)
display(data["res_2023"])


In [None]:
# Prepare the training data (2023 only)
feature_columns = ["day_of_year", "cash", "temperature", "wind_speed", "precipitation"]
res_2023 = data['res_2023']

X = res_2023[feature_columns]      # model inputs
y = res_2023['income'].values      # regression target as a plain array

# Fixed random_state so the train/validation split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [None]:
# Preview the first and the last five rows of the merged 2023 frame.
res_2023_preview = data['res_2023']
display(res_2023_preview.head(5), res_2023_preview.tail(5))

In [None]:
# Show the training features followed by the training target.
display(train_X, train_y)

In [None]:
# Train the model
# Fixed random_state keeps the fitted forest reproducible between runs.
forest_params = {"n_estimators": 200, "random_state": 1}
model = RandomForestRegressor(**forest_params)
model.fit(train_X, train_y)

In [None]:
# Count how often each distinct target value occurs in the training split.
train_target_counts = pd.Series(train_y).value_counts()
train_target_counts

In [None]:
# Hyperparameter search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Candidate values; GridSearchCV cross-validates every combination.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# random_state=1 matches the baseline model trained above. Without a fixed
# seed every run of the search could report different "best" parameters,
# because the forests themselves are randomised.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=1),
    param_grid=param_grid,
    cv=2,
)
grid_search.fit(train_X, train_y)

# NOTE(review): the evaluation below still uses `model`, not
# grid_search.best_estimator_ - confirm that this is intended.
print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

In [None]:
# Evaluate the model on the validation split
predictions = model.predict(val_X)

validation_mae = mean_absolute_error(val_y, predictions)
validation_r2 = r2_score(val_y, predictions)

print("MAE: ", validation_mae)
print("R2: ", validation_r2)

In [None]:
# Worst prediction: the validation row with the largest absolute error.
errors = np.abs(val_y - predictions)
worst_index = np.argmax(errors)
print(f"Наихудшее предсказание на индексе {worst_index}")

# Print the row before, the worst row itself and the row after it.
# Positions outside the frame are skipped: worst_index - 1 == -1 would
# silently wrap around to the last row, and worst_index + 1 past the end
# would raise IndexError.
for position in (worst_index - 1, worst_index, worst_index + 1):
    if 0 <= position < len(val_X):
        print(val_X.iloc[position])

# Actual value, then predicted value, for the worst row.
print(val_y[worst_index])
print(predictions[worst_index])

In [None]:
#Графики
# visualization
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.interpolate import make_interp_spline

weather = pd.DataFrame({'Date': val_X["day_of_year"], 'temperature': val_X["temperature"]}).sort_values(by='Date')
input = pd.DataFrame({'Date': val_X["day_of_year"], 'Income': val_y}).sort_values(by='Date')
output = pd.DataFrame({'Date': val_X["day_of_year"], 'Income': predictions}).sort_values(by='Date')

fig, ax = plt.subplots(figsize=(100,5))

plt.axis([0,91,0,260000])
plt.bar(np.arange(91), input["Income"], 0.5, color='b')
plt.bar(np.arange(91)+0.5, output["Income"], 0.2, color='r')
plt.xticks(np.arange(91)+1.5*0.5, input["Date"])

plt.show()