На входе есть множество характеристик, расчитанных по изображениям специальной камеры, снимающей небосвод. Наша задача: предсказать значение приходящей коротковолновой радиации по имеющемуся датасету с характеристиками этих изображений. Подробнее: https://github.com/MKrinitskiy/Sirius-AI4Climate-2024/blob/main/DASIO-dataset-description.md

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [85]:
# Входные данные

df = pd.read_csv("/content/drive/MyDrive/dataset_25perc.csv", sep=',')
df.sample(5)

Unnamed: 0,photo_name,photo_datetime,CM3up[W/m2],CG3up[W/m2],CM3down[W/m2],CG3down[W/m2],radiation_datetime,feature0,feature1,feature2,...,feature157,feature158,feature159,feature160,feature161,date-hour,datetime_UTC,lat,lon,sun_altitude
130490,img-2017-01-23T03-22-32devID1.jpg,2017-01-23 03:22:32,102.919878,-5.97049,15.257919,1.421545,2017-01-23 03:22:34,89.614945,332.347637,131.0,...,0.426487,0.4445,0.462944,0.482552,0.499735,2017-01-23-03,2017-01-23 03:22:32.346758,6.273145,95.334223,45.446832
224784,img-2021-08-17T08-49-49devID2.jpg,2021-08-17 08:49:49,183.568881,-14.404992,59.799672,-2.748321,2021-08-17 08:49:54,91.407532,298.084852,130.0,...,0.429046,0.443469,0.460651,0.474244,0.496511,2021-08-17-08,2021-08-17 08:49:48.848570,73.160971,79.900484,27.443516
177489,img-2019-12-04T13-16-26devID1.jpg,2019-12-04 13:16:26,10.329896,-10.803744,2.274472,-1.232006,2019-12-04 13:16:34,83.892426,202.2276,108.0,...,0.400199,0.411631,0.422266,0.433565,0.439082,2019-12-04-13,2019-12-04 13:16:26.096473,53.894098,9.145207,9.38485
251218,img-2021-07-31T08-39-58devID2.jpg,2021-07-31 08:39:58,308.570096,-36.865408,177.314081,1.421545,2021-07-31 08:40:04,112.169224,1216.979489,255.0,...,0.525224,0.569164,0.627518,0.698106,0.869327,2021-07-31-08,2021-07-31 08:39:57.619330,66.992616,8.652082,33.97245
221807,img-2021-08-04T07-23-29devID2.jpg,2021-08-04 07:23:29,106.047278,-29.094293,23.597652,-0.284309,2021-08-04 07:23:29,84.379597,539.470472,163.0,...,0.407577,0.431339,0.467797,0.530808,0.597574,2021-08-04-07,2021-08-04 07:23:29.323510,64.977453,40.058537,37.959154


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260433 entries, 0 to 260432
Columns: 174 entries, photo_name to sun_altitude
dtypes: float64(169), object(5)
memory usage: 345.7+ MB


In [87]:
# Взгляд на основные статистики данных

df.describe()

Unnamed: 0,CM3up[W/m2],CG3up[W/m2],CM3down[W/m2],CG3down[W/m2],feature0,feature1,feature2,feature3,feature4,feature5,...,feature155,feature156,feature157,feature158,feature159,feature160,feature161,lat,lon,sun_altitude
count,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,...,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0,260433.0
mean,267.16201,-47.479956,65.45487,2.62206,94.951865,953.031869,205.86163,19.476226,0.095298,0.785111,...,0.41225,0.43094,0.4534,0.482219,0.522926,0.59122,0.718453,44.573726,15.61978,27.89118
std,268.825122,37.967731,72.174757,15.449046,14.169739,713.662483,61.633465,11.989895,0.925641,1.559171,...,0.054155,0.057816,0.06332,0.073023,0.092471,0.135947,0.214833,35.062012,43.859881,17.79901
min,5.022793,-210.293932,-106.710666,-172.101749,13.957575,69.77057,85.0,0.0,-2.751973,-1.793168,...,0.2627,0.2627,0.2627,0.2627,0.2627,0.288733,0.322899,-49.639077,-63.469069,-0.605456
25%,58.757205,-78.4693,16.300386,-4.264636,85.719352,287.648025,131.0,11.0,-0.833563,-0.464717,...,0.379925,0.393784,0.410468,0.428845,0.444266,0.462479,0.486141,19.756974,-25.966896,13.507289
50%,161.298005,-37.339256,41.035274,-0.758157,90.536358,786.75896,255.0,19.0,-0.051501,-0.004458,...,0.39684,0.412428,0.431838,0.462577,0.507011,0.571519,0.72845,59.499908,10.493023,25.086516
75%,404.382248,-13.078217,87.946268,3.222169,99.438184,1495.766397,255.0,28.0,1.028095,1.949433,...,0.426814,0.448987,0.4773,0.517514,0.581622,0.698805,0.963546,69.496262,57.963884,39.70229
max,1425.904687,21.22841,624.153148,111.54392,207.011743,10271.560471,255.0,94.0,2.907309,10.901657,...,0.984047,0.984047,0.984047,0.984047,0.984047,0.984047,0.986706,83.001788,102.269799,87.713263


In [88]:
# Удаляем имена изображений
df = df.drop('photo_name', axis = 1)

In [89]:
# Дату в datetime формат
df['photo_datetime'] = pd.to_datetime(df['photo_datetime'])

In [90]:
df['days'] = df['photo_datetime'].dt.day
df['year_date'] = df['photo_datetime'].dt.year

In [91]:
# Вычисляем разницу в секундах между значениями в столбце 'photo_datetime' и начальной датой
df['date_seconds'] = (df['photo_datetime'] - df['photo_datetime'].min()).dt.total_seconds()

In [92]:
df = df.sort_values("date_seconds", axis=0)

In [93]:
df.index=range(0,len(df))

In [94]:
# Переводим линейное представление времени в цикличное (т.е. 31 декабря и 1 января следующего года будут близки несмотря на разницу в год)
df['dateyear'] = (df['photo_datetime'].dt.dayofyear - 1) / 365
df['cos'] = np.cos(df['dateyear'] * 2 * np.pi)
df['sin'] = np.sin(df['dateyear'] * 2 * np.pi)

In [95]:
# Вывод важных полей
df[['CM3up[W/m2]', 'CG3up[W/m2]', 'sin', 'cos', 'lat', 'lon', 'sun_altitude','feature0','feature1','feature2', 'days']]

Unnamed: 0,CM3up[W/m2],CG3up[W/m2],sin,cos,lat,lon,sun_altitude,feature0,feature1,feature2,days
0,143.102225,-88.135807,-0.967938,-0.251190,66.584686,-29.280270,14.488320,96.890459,1671.656306,255.0,17
1,143.102225,-88.135807,-0.967938,-0.251190,66.584686,-29.280270,14.488320,108.081154,1784.724755,255.0,17
2,159.023532,-87.377650,-0.967938,-0.251190,66.586937,-29.289083,14.655731,98.760528,1723.527972,255.0,17
3,159.023532,-87.377650,-0.967938,-0.251190,66.586937,-29.289083,14.655731,109.894952,1814.364325,255.0,17
4,160.824156,-86.903802,-0.967938,-0.251190,66.588477,-29.294644,14.739936,111.069528,1877.696783,255.0,17
...,...,...,...,...,...,...,...,...,...,...,...
260428,182.242105,-52.312866,-0.903356,-0.428892,55.545024,12.705800,30.696305,93.887588,205.434115,135.0,6
260429,181.483948,-52.123327,-0.903356,-0.428892,55.544092,12.705888,30.642495,93.766927,205.713884,131.0,6
260430,180.915329,-52.028557,-0.903356,-0.428892,55.542581,12.706041,30.552547,93.492497,209.849320,133.0,6
260431,180.725790,-52.028557,-0.903356,-0.428892,55.542266,12.706072,30.534529,94.056191,205.642272,144.0,6


In [96]:
df = df.loc[:, ~df.columns.str.contains('date')]

In [97]:
df_corr = df.corr()['CM3up[W/m2]'].sort_values()
print(df_corr)

CG3up[W/m2]   -0.524952
lat           -0.465449
feature84     -0.273190
feature111    -0.237629
feature143    -0.160054
                 ...   
CM3up[W/m2]    1.000000
feature114          NaN
feature115          NaN
feature116          NaN
feature117          NaN
Name: CM3up[W/m2], Length: 172, dtype: float64


In [98]:
df.head()

Unnamed: 0,CM3up[W/m2],CG3up[W/m2],CM3down[W/m2],CG3down[W/m2],feature0,feature1,feature2,feature3,feature4,feature5,...,feature158,feature159,feature160,feature161,lat,lon,sun_altitude,days,cos,sin
0,143.102225,-88.135807,36.391559,0.473848,96.890459,1671.656306,255.0,31.0,1.121827,2.437911,...,0.493691,0.543301,0.702826,0.973412,66.584686,-29.28027,14.48832,17,-0.25119,-0.967938
1,143.102225,-88.135807,36.391559,0.473848,108.081154,1784.724755,255.0,43.0,0.813509,1.466658,...,0.507907,0.55859,0.708413,0.973412,66.584686,-29.28027,14.48832,17,-0.25119,-0.967938
2,159.023532,-87.37765,44.162673,0.379079,98.760528,1723.527972,255.0,31.0,1.083457,2.20496,...,0.506745,0.55849,0.713127,0.973412,66.586937,-29.289083,14.655731,17,-0.25119,-0.967938
3,159.023532,-87.37765,44.162673,0.379079,109.894952,1814.364325,255.0,42.0,0.787298,1.310879,...,0.521335,0.576076,0.719611,0.973412,66.586937,-29.289083,14.655731,17,-0.25119,-0.967938
4,160.824156,-86.903802,44.636522,0.568618,111.069528,1877.696783,255.0,44.0,0.787653,1.232283,...,0.525156,0.58541,0.734166,0.973412,66.588477,-29.294644,14.739936,17,-0.25119,-0.967938


## Задача регрессии

In [99]:
X = df.drop(['CM3up[W/m2]','CG3up[W/m2]','CM3down[W/m2]','CG3down[W/m2]'], axis = 1)
y = df['CM3up[W/m2]']

# CM3up[W/m2] --> Приходящая коротковолновая радиация

In [100]:
# Разбиваем данные на выборки так, чтобы одни дни были в тренировочной выборке, другие - в тестовой

from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=2, test_size=0.2, random_state=42)

for train_index, test_index in gss.split(X, y, groups=df['days']):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [101]:
# Проверка пересекающихся в рамках дня данных

X_test["days"].unique()

array([21, 22, 14, 29,  1,  6,  8], dtype=int32)

In [102]:
X_train["days"].unique()

# Данные не пересекаются

array([17, 18, 19, 20, 23, 24, 25, 12, 13, 15, 16, 26, 27, 28, 30,  2,  3,
        4,  5,  7,  9, 11, 31, 10], dtype=int32)

In [103]:
!pip install -r '/content/drive/MyDrive/Colab Notebooks/Sirius project/requirements.txt'



In [104]:
!pip install catboost



In [105]:
!pip install numpy==1.24.0



In [106]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import RandomizedSearchCV

In [107]:
# Задаем наборы параметров для подбора лучших
random_params = {'iterations': [100, 500, 1000],  # Количество итераций
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Скорость обучения
    'l2_leaf_reg': [1, 3, 5, 10],  # Регуляризация L2
    'bagging_temperature': [0.0, 0.2, 0.5, 1.0],  # Температура для бэггинга
    'random_strength': [0.0, 0.1, 0.5],  # Сила случайности
    'max_depth': [6, 8, 10],  # Максимальная глубина деревьев
    'subsample': [0.7, 0.8, 0.9, 1.0],  # Размер подвыборки
    'colsample_bylevel': [0.5, 0.7, 1.0]  # Размер подвыборки по уровням
                 }

In [108]:
# Устраняем проблемы с форматом имен признаков для обучения
X_train.columns = X_train.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_').str.replace('>', '_')
X_test.columns = X_test.columns.str.replace('[', '_').str.replace(']', '_').str.replace('<', '_').str.replace('>', '_')

In [109]:
from catboost import CatBoostRegressor

# Создаем модель
catboost = CatBoostRegressor(task_type='CPU', verbose=0)

model = RandomizedSearchCV(catboost, param_distributions=random_params, n_iter=10,
                                   cv=3,
                                   verbose=1,
                                   n_jobs=-1,
                                   random_state=42)

# Подбираем лучшие параметры случайным образом
model.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [110]:
# Лучшие параметры
model.best_params_

{'subsample': 0.9,
 'random_strength': 0.5,
 'max_depth': 10,
 'learning_rate': 0.1,
 'l2_leaf_reg': 5,
 'iterations': 500,
 'colsample_bylevel': 1.0,
 'bagging_temperature': 0.0}

In [111]:
preds = model.predict(X_test)

In [112]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

In [113]:
mean_absolute_percentage_error(preds,y_test)

0.2268178048879464

In [114]:
r2_score(preds,y_test)

0.9459944714950479

In [115]:
mean_absolute_error(preds,y_test)

31.066587954370554

In [116]:
root_mean_squared_error(preds,y_test)

63.18221664999499

In [117]:
preds_series = pd.Series(preds)
preds_series.index = y_test.index
preds_series.name = 'CG3up_predicted'
preds_series.head()

Unnamed: 0,CG3up_predicted
664,61.447778
665,67.925948
666,59.811968
667,45.66884
668,16.67312


In [118]:
results = pd.merge(y_test, preds_series, left_index=True, right_index=True, how='outer')

In [119]:
# Сравнение реальных и предсказанных значений радиации
results.sample(10)

Unnamed: 0,CM3up[W/m2],CG3up_predicted
251438,103.867575,95.803799
50656,316.909828,358.208216
51123,119.599343,137.78506
245353,64.443386,56.100143
207589,157.601987,137.546548
127089,73.162197,52.332082
8751,149.736103,156.204664
152756,168.78481,133.030507
135121,65.959701,67.787915
189530,115.998095,101.202833


# Сохранение/использование модели и итоговая оценка

In [120]:
from joblib import dump, load
dump(model, 'shortRmodel.joblib')

['shortRmodel.joblib']

In [121]:
CM3up = load('shortRmodel.joblib')

In [122]:
import time

start_time = time.time()
num_samples = len(X_test)
preds = CM3up.predict(X_test)

total_time = time.time() - start_time
average_time = total_time / num_samples

print(f"Общее время выполнения: {total_time:.6f} секунд")
print(f"Среднее время выполнения: {average_time:.6f} секунд")


Общее время выполнения: 0.189584 секунд
Среднее время выполнения: 0.000003 секунд


In [123]:
mae = mean_absolute_error(preds,y_test)
rmse = root_mean_squared_error(preds, y_test)
print('mae:', mae, 'rmse:', rmse)

mae: 31.066587954370554 rmse: 63.18221664999499


In [124]:
std = df.describe().loc['std', 'CM3up[W/m2]']
print('std:', std)

std: 268.8251224422064


**Ошибка в целом адекватна относительно стандартного отклонения во входных данных**