<a href="https://colab.research.google.com/github/Kartel7/DASIO-Model-0.25-Sirius-case-/blob/main/long.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

На входе есть множество характеристик, расчитанных по изображениям специальной камеры, снимающей небосвод. Наша задача: предсказать значение приходящей длинноволновой радиации по имеющемуся датасету с характеристиками этих изображений. Подробнее: https://github.com/MKrinitskiy/Sirius-AI4Climate-2024/blob/main/DASIO-dataset-description.md

In [161]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [162]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [163]:
# Входные данные

df = pd.read_csv("/content/drive/MyDrive/dataset_25perc.csv", sep=',')
df.sample(5)

Unnamed: 0,photo_name,photo_datetime,CM3up[W/m2],CG3up[W/m2],CM3down[W/m2],CG3down[W/m2],radiation_datetime,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature152,feature153,feature154,feature155,feature156,feature157,feature158,feature159,feature160,feature161,date-hour,datetime_UTC,lat,lon,sun_altitude
56598,img-2015-06-22T16-47-55devID1.jpg,2015-06-22 16:47:55,201.101272,-14.878841,27.957057,3.1274,2015-06-22 16:47:54,96.613462,372.516081,129.0,16.0,-0.903729,-0.529233,1.0,8.0,...,0.401495,0.418936,0.4327,0.444,0.455566,0.464206,0.476669,0.486706,0.492688,0.499867,2015-06-22-16,2015-06-22 16:47:55.536289,59.505299,-26.339182,42.14775
227146,img-2021-08-09T03-40-33devID2.jpg,2021-08-09 03:40:33,50.986091,-18.385319,13.457295,-2.274472,2021-08-09 03:40:39,84.341517,309.026389,127.0,13.0,-0.751449,-0.637338,1.0,7.0,...,0.352575,0.363742,0.372648,0.380691,0.391458,0.405051,0.423728,0.440345,0.465869,0.501396,2021-08-09-03,2021-08-09 03:40:33.163472,64.703408,40.52758,16.073947
199022,img-2017-03-13T12-17-28devID1.jpg,2017-03-13 12:17:28,797.676442,-112.965465,200.722194,52.691945,2017-03-13 12:17:30,78.187741,928.725902,255.0,9.0,1.412215,4.825255,12.0,17.0,...,0.296739,0.311263,0.325487,0.33908,0.356229,0.382284,0.41791,0.473078,0.521666,0.757428,2017-03-13-12,2017-03-13 12:17:27.812283,35.824744,14.855322,48.338637
174115,img-2021-09-06T10-22-54devID2.jpg,2021-09-06 10:22:54,335.484686,-53.923951,94.106298,-2.748321,2021-09-06 10:22:54,115.811298,1303.941165,255.0,19.0,0.130039,-0.050075,5.0,13.0,...,0.425689,0.442871,0.460519,0.480469,0.503191,0.540679,0.590462,0.658558,0.746001,0.888967,2021-09-06-10,2021-09-06 10:22:54.060943,56.006069,12.666925,39.479589
199221,img-2017-03-15T08-44-54devID2.jpg,2017-03-15 08:44:54,570.703046,-125.569833,167.647574,6.254799,2017-03-15 08:44:59,86.904623,1679.733354,255.0,16.0,1.511056,2.990504,15.0,22.0,...,0.311395,0.330173,0.352539,0.376067,0.398467,0.426849,0.480321,0.560547,0.704216,0.942606,2017-03-15-08,2017-03-15 08:44:53.797709,37.610801,7.169363,33.3503


In [164]:
# Удаляем имена изображений
df = df.drop('photo_name', axis = 1)

In [165]:
# Дату в datetime формат
df['photo_datetime'] = pd.to_datetime(df['photo_datetime'])

In [166]:
df['days'] = df['photo_datetime'].dt.day
df['year_date'] = df['photo_datetime'].dt.year

In [167]:
# Вычисляем разницу в секундах между значениями в столбце 'photo_datetime' и начальной датой
df['date_seconds'] = (df['photo_datetime'] - df['photo_datetime'].min()).dt.total_seconds()

In [168]:
df = df.sort_values("date_seconds", axis=0)

In [169]:
df.index=range(0,len(df))

In [170]:
# Переводим линейное представление времени в цикличное (т.е. 31 декабря и 1 января следующего года будут близки несмотря на разницу в год)
df['dateyear'] = (df['photo_datetime'].dt.dayofyear - 1) / 365
df['cos'] = np.cos(df['dateyear'] * 2 * np.pi)
df['sin'] = np.sin(df['dateyear'] * 2 * np.pi)

In [171]:
# Вывод важных полей
df[['CM3up[W/m2]', 'CG3up[W/m2]', 'sin', 'cos', 'lat', 'lon', 'sun_altitude']]

Unnamed: 0,CM3up[W/m2],CG3up[W/m2],sin,cos,lat,lon,sun_altitude
0,143.102225,-88.135807,-0.967938,-0.251190,66.584686,-29.280270,14.488320
1,143.102225,-88.135807,-0.967938,-0.251190,66.584686,-29.280270,14.488320
2,159.023532,-87.377650,-0.967938,-0.251190,66.586937,-29.289083,14.655731
3,159.023532,-87.377650,-0.967938,-0.251190,66.586937,-29.289083,14.655731
4,160.824156,-86.903802,-0.967938,-0.251190,66.588477,-29.294644,14.739936
...,...,...,...,...,...,...,...
260428,182.242105,-52.312866,-0.903356,-0.428892,55.545024,12.705800,30.696305
260429,181.483948,-52.123327,-0.903356,-0.428892,55.544092,12.705888,30.642495
260430,180.915329,-52.028557,-0.903356,-0.428892,55.542581,12.706041,30.552547
260431,180.725790,-52.028557,-0.903356,-0.428892,55.542266,12.706072,30.534529


In [172]:
df

Unnamed: 0,photo_datetime,CM3up[W/m2],CG3up[W/m2],CM3down[W/m2],CG3down[W/m2],radiation_datetime,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature158,feature159,feature160,feature161,date-hour,datetime_UTC,lat,lon,sun_altitude,days,year_date,date_seconds,dateyear,cos,sin
0,2014-09-17 10:02:43,143.102225,-88.135807,36.391559,0.473848,2014-09-17 10:02:44,96.890459,1671.656306,255.0,31.0,1.121827,2.437911,7.0,16.0,20.0,...,0.493691,0.543301,0.702826,0.973412,2014-09-17-10,2014-09-17 10:02:43.303133,66.584686,-29.280270,14.488320,17,2014,0.0,0.709589,-0.251190,-0.967938
1,2014-09-17 10:02:44,143.102225,-88.135807,36.391559,0.473848,2014-09-17 10:02:44,108.081154,1784.724755,255.0,43.0,0.813509,1.466658,3.0,14.0,19.0,...,0.507907,0.558590,0.708413,0.973412,2014-09-17-10,2014-09-17 10:02:43.303133,66.584686,-29.280270,14.488320,17,2014,1.0,0.709589,-0.251190,-0.967938
2,2014-09-17 10:04:43,159.023532,-87.377650,44.162673,0.379079,2014-09-17 10:04:44,98.760528,1723.527972,255.0,31.0,1.083457,2.204960,8.0,16.0,21.0,...,0.506745,0.558490,0.713127,0.973412,2014-09-17-10,2014-09-17 10:04:43.319997,66.586937,-29.289083,14.655731,17,2014,120.0,0.709589,-0.251190,-0.967938
3,2014-09-17 10:04:44,159.023532,-87.377650,44.162673,0.379079,2014-09-17 10:04:44,109.894952,1814.364325,255.0,42.0,0.787298,1.310879,5.0,15.0,20.0,...,0.521335,0.576076,0.719611,0.973412,2014-09-17-10,2014-09-17 10:04:43.319997,66.586937,-29.289083,14.655731,17,2014,121.0,0.709589,-0.251190,-0.967938
4,2014-09-17 10:05:45,160.824156,-86.903802,44.636522,0.568618,2014-09-17 10:05:44,111.069528,1877.696783,255.0,44.0,0.787653,1.232283,6.0,15.0,20.0,...,0.525156,0.585410,0.734166,0.973412,2014-09-17-10,2014-09-17 10:05:44.623504,66.588477,-29.294644,14.739936,17,2014,182.0,0.709589,-0.251190,-0.967938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260428,2021-09-06 13:53:28,182.242105,-52.312866,60.463059,-3.601248,2021-09-06 13:53:34,93.887588,205.434115,135.0,11.0,-1.053254,-0.293065,1.0,7.0,14.0,...,0.437321,0.449086,0.464208,0.484048,2021-09-06-13,2021-09-06 13:53:28.099585,55.545024,12.705800,30.696305,6,2021,219988245.0,0.679452,-0.428892,-0.903356
260429,2021-09-06 13:53:58,181.483948,-52.123327,60.083981,-3.506478,2021-09-06 13:54:04,93.766927,205.713884,131.0,18.0,-1.053141,-0.291374,1.0,7.0,14.0,...,0.438618,0.449086,0.462812,0.481821,2021-09-06-13,2021-09-06 13:53:58.111208,55.544092,12.705888,30.642495,6,2021,219988275.0,0.679452,-0.428892,-0.903356
260430,2021-09-06 13:54:48,180.915329,-52.028557,59.894441,-3.601248,2021-09-06 13:54:54,93.492497,209.849320,133.0,18.0,-1.044951,-0.300195,1.0,7.0,13.0,...,0.439382,0.449551,0.461316,0.480127,2021-09-06-13,2021-09-06 13:54:48.125231,55.542581,12.706041,30.552547,6,2021,219988325.0,0.679452,-0.428892,-0.903356
260431,2021-09-06 13:54:58,180.725790,-52.028557,59.894441,-3.601248,2021-09-06 13:55:04,94.056191,205.642272,144.0,31.0,-1.050130,-0.200994,0.0,7.0,12.0,...,0.442174,0.452676,0.464772,0.484613,2021-09-06-13,2021-09-06 13:54:58.128647,55.542266,12.706072,30.534529,6,2021,219988335.0,0.679452,-0.428892,-0.903356


In [173]:
# Поскольку время уже переведено в значения cos и sin, удаляем все признаки, где упоминается date
df = df.loc[:, ~df.columns.str.contains('date')]

In [174]:
df_corr = df.corr()['CG3up[W/m2]'].sort_values()
print(df_corr)

feature4      -0.864836
feature31     -0.836236
feature139    -0.799586
feature132    -0.794878
feature80     -0.794497
                 ...   
CG3up[W/m2]    1.000000
feature114          NaN
feature115          NaN
feature116          NaN
feature117          NaN
Name: CG3up[W/m2], Length: 172, dtype: float64


In [175]:
df.head()

Unnamed: 0,CM3up[W/m2],CG3up[W/m2],CM3down[W/m2],CG3down[W/m2],feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature153,feature154,feature155,feature156,feature157,feature158,feature159,feature160,feature161,lat,lon,sun_altitude,days,cos,sin
0,143.102225,-88.135807,36.391559,0.473848,96.890459,1671.656306,255.0,31.0,1.121827,2.437911,7.0,16.0,20.0,26.0,51.0,...,0.385274,0.397735,0.411894,0.430139,0.457923,0.493691,0.543301,0.702826,0.973412,66.584686,-29.28027,14.48832,17,-0.25119,-0.967938
1,143.102225,-88.135807,36.391559,0.473848,108.081154,1784.724755,255.0,43.0,0.813509,1.466658,3.0,14.0,19.0,24.0,52.0,...,0.39797,0.410332,0.424855,0.441672,0.46936,0.507907,0.55859,0.708413,0.973412,66.584686,-29.28027,14.48832,17,-0.25119,-0.967938
2,159.023532,-87.37765,44.162673,0.379079,98.760528,1723.527972,255.0,31.0,1.083457,2.20496,8.0,16.0,21.0,27.0,53.0,...,0.389096,0.401093,0.415949,0.435069,0.464571,0.506745,0.55849,0.713127,0.973412,66.586937,-29.289083,14.655731,17,-0.25119,-0.967938
3,159.023532,-87.37765,44.162673,0.379079,109.894952,1814.364325,255.0,42.0,0.787298,1.310879,5.0,15.0,20.0,25.0,55.0,...,0.401559,0.413756,0.428647,0.44606,0.475736,0.521335,0.576076,0.719611,0.973412,66.586937,-29.289083,14.655731,17,-0.25119,-0.967938
4,160.824156,-86.903802,44.636522,0.568618,111.069528,1877.696783,255.0,44.0,0.787653,1.232283,6.0,15.0,20.0,25.0,56.0,...,0.4039,0.416449,0.43097,0.448153,0.478827,0.525156,0.58541,0.734166,0.973412,66.588477,-29.294644,14.739936,17,-0.25119,-0.967938


## Задача регрессии

In [176]:
X = df.drop(['CM3up[W/m2]','CG3up[W/m2]','CM3down[W/m2]','CG3down[W/m2]'], axis = 1)
y = df['CG3up[W/m2]']

# CG3up[W/m2] --> Приходящая длинноволновая радиация

In [177]:
# Разбиваем данные на выборки так, чтобы одни дни были в тренировочной выборке, другие - в тестовой

from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=2, test_size=0.1, random_state=38)

for train_index, test_index in gss.split(X, y, groups=df['days']):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [178]:
# Проверка пересекающихся в рамках дня данных

X_test["days"].unique()

array([21, 15, 27,  9], dtype=int32)

In [179]:
X_train["days"].unique()

# Данные не пересекаются

array([17, 18, 19, 20, 22, 23, 24, 25, 12, 13, 14, 16, 26, 28, 29, 30,  1,
        2,  3,  4,  5,  6,  7,  8, 11, 31, 10], dtype=int32)

In [180]:
pip install -r '/content/drive/MyDrive/Colab Notebooks/Sirius project/requirements.txt'

Collecting numpy (from xgboost==2.1.1->-r /content/drive/MyDrive/Colab Notebooks/Sirius project/requirements.txt (line 2))
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[0mInstalling collected packages: numpy
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy


In [181]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
#from sklearn.model_selection import RandomizedSearchCV -- параметры подбирались через RandomizedSearchCV и вручную

In [182]:
from xgboost import XGBRegressor

In [183]:
# Задаем оптимальные параметры, подобранные перебором вручную
etr = XGBRegressor(random_state=35, max_depth=7, learning_rate=0.05, gamma=0.2, n_estimators=1000,colsample_bytree=0.6 )
model = etr

In [184]:
model.fit(X_train, y_train)

In [185]:
preds = model.predict(X_test)

In [186]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

In [187]:
mean_absolute_percentage_error(preds,y_test)

0.31441031450435364

In [188]:
r2_score(preds,y_test)

0.8588887921750898

In [189]:
mean_absolute_error(preds,y_test)

8.705714824725158

In [190]:
root_mean_squared_error(preds,y_test)

12.779780260678477

In [191]:
preds_series = pd.Series(preds)
preds_series.index = y_test.index
preds_series.name = 'CG3up_predicted'
preds_series

Unnamed: 0,CG3up_predicted
664,-19.258133
665,-17.598831
666,-17.302557
667,-15.449218
668,-19.053596
...,...
241589,-6.290244
241590,-10.287186
241591,-6.789811
241592,-9.410515


In [192]:
y_test

Unnamed: 0,CG3up[W/m2]
664,-29.473372
665,-36.012480
666,-33.643238
667,-33.643238
668,-30.231530
...,...
241589,-5.970490
241590,-5.875720
241591,-5.780951
241592,-5.970490


In [193]:
results = pd.merge(y_test, preds_series, left_index=True, right_index=True, how='outer')

In [194]:
# Сравнение реальных и предсказанных значений радиации
results.sample(10)

Unnamed: 0,CG3up[W/m2],CG3up_predicted
31897,-39.708498,-53.991409
75198,-53.450103,-62.929775
86299,-31.558305,-41.296215
79265,-69.27664,-58.640255
32403,-87.851498,-75.621193
58029,-40.371886,-51.440121
224192,-16.016077,-13.412304
58115,-39.23465,-30.521265
96204,-74.109894,-56.382122
210311,-6.06526,-7.42273


# Сохранение/использование модели и итоговая оценка

In [195]:
from joblib import dump, load
dump(etr, 'longRmodel.joblib')

['longRmodel.joblib']

In [196]:
CG3up = load('longRmodel.joblib')

In [197]:
import time

start_time = time.time()
num_samples = len(X_test)
preds = CG3up.predict(X_test)

total_time = time.time() - start_time
average_time = total_time / num_samples

print(f"Общее время выполнения: {total_time:.6f} секунд")
print(f"Среднее время выполнения: {average_time:.6f} секунд")

Общее время выполнения: 1.371862 секунд
Среднее время выполнения: 0.000043 секунд


In [198]:
mae = mean_absolute_error(preds,y_test)
rmse = root_mean_squared_error(preds, y_test)
print('mae:', mae, 'rmse:', rmse)

mae: 8.705714824725158 rmse: 12.779780260678477


In [199]:
std = df.describe().loc['std', 'CG3up[W/m2]']
print('std:', std)

std: 37.96773051574533


**Ошибка в целом адекватна относительно стандартного отклонения во входных данных**