На входе есть множество характеристик, расчитанных по изображениям специальной камеры, снимающей небосвод. Наша задача: предсказать значение приходящей длинноволновой радиации по имеющемуся датасету с характеристиками этих изображений. Подробнее: https://github.com/MKrinitskiy/Sirius-AI4Climate-2024/blob/main/DASIO-dataset-description.md

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
df = pd.read_csv("/content/drive/MyDrive/dataset_25perc.csv", sep=',')
df.sample(5)

Unnamed: 0,photo_name,photo_datetime,CM3up[W/m2],CG3up[W/m2],CM3down[W/m2],CG3down[W/m2],radiation_datetime,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature152,feature153,feature154,feature155,feature156,feature157,feature158,feature159,feature160,feature161,date-hour,datetime_UTC,lat,lon,sun_altitude
21242,img-2018-07-28T08-33-54devID2.jpg,2018-07-28 08:33:54,49.564545,-84.34502,18.385319,-1.042467,2018-07-28 08:33:59,86.631694,669.904954,255.0,12.0,0.355601,1.043909,2.0,10.0,...,0.356924,0.375834,0.397534,0.420499,0.445201,0.472609,0.509001,0.550676,0.613258,0.75829,2018-07-28-08,2018-07-28 08:33:54.258264,66.324793,-28.080846,20.815095
150200,img-2016-10-28T14-01-47devID2.jpg,2016-10-28 14:01:47,251.803054,-7.012957,57.051351,0.568618,2016-10-28 14:01:49,115.310021,844.177482,170.0,12.0,-0.700396,-0.701354,2.0,10.0,...,0.475174,0.490395,0.506347,0.522797,0.540444,0.564473,0.589131,0.610468,0.639315,0.663675,2016-10-28-14,2016-10-28 14:04:16.988320,-36.69927,-42.641982,65.728914
180402,img-2018-08-10T13-59-48devID2.jpg,2018-08-10 13:59:48,52.597175,-12.509598,7.202496,-6.444339,2018-08-10 13:59:54,84.601451,189.999909,116.0,38.0,-0.915799,-0.480628,0.0,3.0,...,0.342173,0.354071,0.366167,0.37853,0.390827,0.400664,0.408274,0.416351,0.427418,0.443469,2018-08-10-13,2018-08-10 13:59:46.912507,78.67409,53.678401,16.669055
83197,img-2017-03-21T15-38-33devID2.jpg,2017-03-21 15:38:33,411.395204,-102.161721,139.785286,13.646835,2017-03-21 15:38:39,88.958109,1468.618306,255.0,12.0,1.262612,3.21208,8.0,16.0,...,0.313856,0.3324,0.355198,0.377035,0.396308,0.416315,0.447323,0.509935,0.618775,0.961283,2017-03-21-15,2017-03-21 15:38:33.059982,47.868943,-6.30703,27.861909
185729,img-2016-10-22T08-59-37devID1.jpg,2016-10-22 08:59:37,99.602939,-66.433549,25.208736,-4.738484,2016-10-22 08:59:39,96.658498,2491.509651,255.0,5.0,1.48305,2.524546,9.0,15.0,...,0.359717,0.37826,0.398068,0.418875,0.442302,0.477397,0.523293,0.629441,0.866871,0.984047,2016-10-22-08,2016-10-22 09:00:06.596716,-22.341183,-31.782758,19.966598


In [33]:
# Удаляем имена изображений
df = df.drop('photo_name', axis = 1)

In [34]:
# Дату в datetime формат
df['photo_datetime'] = pd.to_datetime(df['photo_datetime'])

In [36]:
df['days'] = df['photo_datetime'].dt.day
df['year_date'] = df['photo_datetime'].dt.year

In [37]:
# Вычисляем разницу в секундах между значениями в столбце 'photo_datetime' и начальной датой
df['date_seconds'] = (df['photo_datetime'] - df['photo_datetime'].min()).dt.total_seconds()

In [38]:
df = df.sort_values("date_seconds", axis=0)

In [39]:
df.index=range(0,len(df))

In [40]:
# Переводим линейное представление времени в цикличное (т.е. 31 декабря и 1 января следующего года будут близки несмотря на разницу в год)
df['dateyear'] = (df['photo_datetime'].dt.dayofyear - 1) / 365
df['cos'] = np.cos(df['dateyear'] * 2 * np.pi)
df['sin'] = np.sin(df['dateyear'] * 2 * np.pi)

In [102]:
# Вывод значений для указанных столбцов
df[['CM3up[W/m2]', 'CG3up[W/m2]', 'sin', 'cos', 'lat', 'lon', 'sun_altitude']]


Unnamed: 0,CM3up[W/m2],CG3up[W/m2],sin,cos,lat,lon,sun_altitude
0,143.102225,-88.135807,-0.967938,-0.251190,66.584686,-29.280270,14.488320
1,143.102225,-88.135807,-0.967938,-0.251190,66.584686,-29.280270,14.488320
2,159.023532,-87.377650,-0.967938,-0.251190,66.586937,-29.289083,14.655731
3,159.023532,-87.377650,-0.967938,-0.251190,66.586937,-29.289083,14.655731
4,160.824156,-86.903802,-0.967938,-0.251190,66.588477,-29.294644,14.739936
...,...,...,...,...,...,...,...
260428,182.242105,-52.312866,-0.903356,-0.428892,55.545024,12.705800,30.696305
260429,181.483948,-52.123327,-0.903356,-0.428892,55.544092,12.705888,30.642495
260430,180.915329,-52.028557,-0.903356,-0.428892,55.542581,12.706041,30.552547
260431,180.725790,-52.028557,-0.903356,-0.428892,55.542266,12.706072,30.534529


In [42]:
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)

In [43]:
# Поскольку время уже переведено в значения cos и sin, удаляем все признаки, где упоминается date
df = df.loc[:, ~df.columns.str.contains('date')]

In [44]:
df

Unnamed: 0,CM3up[W/m2],CG3up[W/m2],CM3down[W/m2],CG3down[W/m2],feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature153,feature154,feature155,feature156,feature157,feature158,feature159,feature160,feature161,lat,lon,sun_altitude,days,cos,sin
0,143.102225,-88.135807,36.391559,0.473848,96.890459,1671.656306,255.0,31.0,1.121827,2.437911,7.0,16.0,20.0,26.0,51.0,...,0.385274,0.397735,0.411894,0.430139,0.457923,0.493691,0.543301,0.702826,0.973412,66.584686,-29.280270,14.488320,17,-0.251190,-0.967938
1,143.102225,-88.135807,36.391559,0.473848,108.081154,1784.724755,255.0,43.0,0.813509,1.466658,3.0,14.0,19.0,24.0,52.0,...,0.397970,0.410332,0.424855,0.441672,0.469360,0.507907,0.558590,0.708413,0.973412,66.584686,-29.280270,14.488320,17,-0.251190,-0.967938
2,159.023532,-87.377650,44.162673,0.379079,98.760528,1723.527972,255.0,31.0,1.083457,2.204960,8.0,16.0,21.0,27.0,53.0,...,0.389096,0.401093,0.415949,0.435069,0.464571,0.506745,0.558490,0.713127,0.973412,66.586937,-29.289083,14.655731,17,-0.251190,-0.967938
3,159.023532,-87.377650,44.162673,0.379079,109.894952,1814.364325,255.0,42.0,0.787298,1.310879,5.0,15.0,20.0,25.0,55.0,...,0.401559,0.413756,0.428647,0.446060,0.475736,0.521335,0.576076,0.719611,0.973412,66.586937,-29.289083,14.655731,17,-0.251190,-0.967938
4,160.824156,-86.903802,44.636522,0.568618,111.069528,1877.696783,255.0,44.0,0.787653,1.232283,6.0,15.0,20.0,25.0,56.0,...,0.403900,0.416449,0.430970,0.448153,0.478827,0.525156,0.585410,0.734166,0.973412,66.588477,-29.294644,14.739936,17,-0.251190,-0.967938
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260428,182.242105,-52.312866,60.463059,-3.601248,93.887588,205.434115,135.0,11.0,-1.053254,-0.293065,1.0,7.0,14.0,19.0,44.0,...,0.396278,0.403622,0.410800,0.419342,0.427650,0.437321,0.449086,0.464208,0.484048,55.545024,12.705800,30.696305,6,-0.428892,-0.903356
260429,181.483948,-52.123327,60.083981,-3.506478,93.766927,205.713884,131.0,18.0,-1.053141,-0.291374,1.0,7.0,14.0,19.0,44.0,...,0.395813,0.402492,0.410302,0.419309,0.427750,0.438618,0.449086,0.462812,0.481821,55.544092,12.705888,30.642495,6,-0.428892,-0.903356
260430,180.915329,-52.028557,59.894441,-3.601248,93.492497,209.849320,133.0,18.0,-1.044951,-0.300195,1.0,7.0,13.0,19.0,43.0,...,0.394184,0.401894,0.409738,0.419309,0.429345,0.439382,0.449551,0.461316,0.480127,55.542581,12.706041,30.552547,6,-0.428892,-0.903356
260431,180.725790,-52.028557,59.894441,-3.601248,94.056191,205.642272,144.0,31.0,-1.050130,-0.200994,0.0,7.0,12.0,19.0,49.0,...,0.392455,0.400764,0.410335,0.420971,0.431439,0.442174,0.452676,0.464772,0.484613,55.542266,12.706072,30.534529,6,-0.428892,-0.903356


In [45]:
df_corr = df.corr()['CM3up[W/m2]'].sort_values()
print(df_corr)

CG3up[W/m2]   -0.524952
lat           -0.465449
feature84     -0.273190
feature111    -0.237629
feature143    -0.160054
                 ...   
CM3up[W/m2]    1.000000
feature114          NaN
feature115          NaN
feature116          NaN
feature117          NaN
Name: CM3up[W/m2], Length: 172, dtype: float64


In [106]:
# Избавляемся от излишнего множества признаков
heads = ['days']
for index in range(len(df_corr)):
    if df_corr[index] <= -0.13 or df_corr[index] >= 0.27:
        heads.append(df_corr.index[index])

  if df_corr[index] <= -0.13 or df_corr[index] >= 0.27:


In [47]:
df = df[heads]

In [48]:
df.head()

Unnamed: 0,days,CG3up[W/m2],lat,feature84,feature111,feature143,feature82,feature85,lon,feature113,feature156,sin,feature38,feature122,feature22,...,feature60,CM3down[W/m2],feature8,feature80,feature62,feature7,feature61,feature6,feature35,feature63,feature33,feature36,feature34,sun_altitude,CM3up[W/m2]
0,17,-88.135807,66.584686,0.0,0.0,0.2627,1866.19412,-0.963378,-29.28027,-1.326225,0.430139,-0.967938,75.0,0.348929,107.0,...,0.0,36.391559,20.0,245.0,12.0,16.0,0.0,7.0,16.0,30.0,4.0,20.0,12.0,14.48832,143.102225
1,17,-88.135807,66.584686,0.0,0.0,0.2627,5148.994278,-0.720374,-29.28027,-0.485106,0.441672,-0.967938,74.0,0.200052,119.0,...,0.0,36.391559,19.0,252.0,22.0,14.0,6.0,3.0,19.0,34.0,8.0,23.0,15.0,14.48832,143.102225
2,17,-87.37765,66.586937,0.0,0.0,0.2627,1865.162874,-0.962164,-29.289083,-1.298162,0.435069,-0.967938,76.0,0.324985,110.0,...,0.0,44.162673,21.0,252.0,13.0,16.0,1.0,8.0,17.0,30.0,5.0,21.0,13.0,14.655731,159.023532
3,17,-87.37765,66.586937,0.0,0.0,0.2627,5312.158968,-0.704322,-29.289083,-0.39784,0.44606,-0.967938,76.0,0.184925,121.0,...,0.0,44.162673,20.0,255.0,22.0,15.0,8.0,5.0,20.0,34.0,9.0,23.0,16.0,14.655731,159.023532
4,17,-86.903802,66.588477,0.0,0.0,0.2627,5622.794077,-0.679866,-29.294644,-0.375024,0.448153,-0.967938,76.0,0.17754,122.0,...,0.0,44.636522,20.0,255.0,22.0,15.0,8.0,6.0,20.0,34.0,9.0,24.0,16.0,14.739936,160.824156


## Задача регрессии

In [51]:
X = df.drop(['CM3up[W/m2]','CG3up[W/m2]','CM3down[W/m2]','CG3down[W/m2]'], axis = 1)
y = df['CG3up[W/m2]']

#CG3up[W/m2] --> Приходящая длинноволновая радиация

In [52]:
# Разбиваем данные на выборки

from sklearn.model_selection import GroupShuffleSplit
gss = GroupShuffleSplit(n_splits=2, test_size=0.1, random_state=38)

for train_index, test_index in gss.split(X, y, groups=df['days']):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [53]:
X_test["days"].unique()

array([21, 15, 27,  9], dtype=int32)

In [54]:
X_train = X_train.loc[:, ~X_train.columns.str.contains('date')]
X_test = X_test.loc[:, ~X_test.columns.str.contains('date')]
y_train = y_train
y_test = y_test

In [103]:
pip install -r '/content/drive/MyDrive/Colab Notebooks/Sirius project/requirements.txt'

Collecting numpy (from xgboost==2.1.1->-r /content/drive/MyDrive/Colab Notebooks/Sirius project/requirements.txt (line 2))
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[0mInstalling collected packages: numpy
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy


In [57]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
#from sklearn.model_selection import RandomizedSearchCV -- параметры подбирались через RandomizedSearchCV и вручную

In [58]:
from xgboost import XGBRegressor

In [59]:
etr = XGBRegressor(random_state=35, max_depth=7, learning_rate=0.05, gamma=0.2, n_estimators=1000,colsample_bytree=0.6 )
model = etr

In [60]:
model.fit(X_train, y_train)

In [61]:
preds = model.predict(X_test)

In [62]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

In [108]:
mean_absolute_percentage_error(preds,y_test)

0.31498909500189276

In [64]:
r2_score(preds,y_test)

0.8559470127394013

In [65]:
mean_absolute_error(preds,y_test)

8.831688064095223

In [66]:
root_mean_squared_error(preds,y_test)

12.878978226612217

In [107]:
preds_series = pd.Series(preds)
preds_series.index = y_test.index
preds_series.name = 'CG3up_predicted'
preds_series

Unnamed: 0,CG3up_predicted
664,-13.531843
665,-15.509576
666,-16.877083
667,-15.106882
668,-16.716005
...,...
241589,-6.812010
241590,-10.130283
241591,-7.381950
241592,-10.847927


In [73]:
y_test

Unnamed: 0,CG3up[W/m2]
664,-29.473372
665,-36.012480
666,-33.643238
667,-33.643238
668,-30.231530
...,...
241589,-5.970490
241590,-5.875720
241591,-5.780951
241592,-5.970490


In [87]:
results = pd.merge(y_test, preds_series, left_index=True, right_index=True, how='outer')

In [88]:
# Сравнение реальных и предсказанных значений радиации
results.sample(10)

Unnamed: 0,CG3up[W/m2],CG3up_predicted
165596,-114.48178,-95.46199
241214,-7.771114,-7.058854
58063,-43.404516,-35.774422
86541,-99.697709,-106.213837
210831,-1.421545,-5.980799
4275,-50.417473,-62.146751
226276,-14.404992,-9.126552
54636,-40.940504,-31.800159
211470,-7.297266,-8.6321
95754,-42.741128,-40.202896


# Сохранение модели и оценка

In [109]:
from joblib import dump, load
dump(etr, 'longRmodel.joblib')

['longRmodel.joblib']

In [110]:
CG3up = load('longRmodel.joblib')

In [111]:
preds = CG3up.predict(X_test)

In [112]:
mae = mean_absolute_error(preds,y_test)
rmse = root_mean_squared_error(preds, y_test)
print('mae:', mae, 'rmse:', rmse)

mae: 8.831688064095223 rmse: 12.878978226612217


In [105]:
std = df.describe().loc['std', 'CG3up[W/m2]']
print('std:', std)

std: 37.96773051574533


**Ошибка в целом адекватна относительно стандартного отклонения во входных данных**