In [None]:
%pip install catboost

In [1]:
%matplotlib inline

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import warnings
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import svm
from sklearn.linear_model import Ridge
from sklearn.dummy import DummyRegressor
# https://scikit-learn.org/stable/supervised_learning.html
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import Pool, CatBoostRegressor

In [6]:
# Load the ICOADS 1950s-1960s observations from a semicolon-separated CSV.
# Alternative local copy of the data (kept for reference):
#   './data/1950-1960/ICOADS_1950th1960th_train.csv'
df = pd.read_csv(
    '/home/jupyter/datasphere/s3/pu2024ws/data/ICOADS_1950th1960th.csv',
    sep=';',
    index_col=None,
)

In [7]:
# Display the (rows, columns) size of the loaded training frame.
df.shape

(3523182, 16)

In [8]:
# List the column names to decide which ones are features and which is the target.
df.columns

Index(['YYYY', 'MM', 'DD', 'HHfloat', 'LAT', 'LON', 'TrueWindDirection',
       'WindSpeed', 'PresentWeatherCode', 'SLPhPa', 'AirTemperature', 'SST',
       'TCA', 'LCA', 'LCT', 'rh'],
      dtype='object')

In [9]:
# The first six columns (YYYY, MM, DD, HHfloat, LAT, LON) are date/time/position
# fields and the last column ('rh') is the target; everything in between is
# used as a model feature.
n_leading_non_feature_columns = 6
train_features = df.columns[n_leading_non_feature_columns:-1]
train_features

Index(['TrueWindDirection', 'WindSpeed', 'PresentWeatherCode', 'SLPhPa',
       'AirTemperature', 'SST', 'TCA', 'LCA', 'LCT'],
      dtype='object')

In [10]:
# Split the frame into the feature matrix and the regression target.
target_column = 'rh'
X = df[train_features]
y = df[target_column]

## baseline: linear model

In [12]:
# Hold out 30% of the rows for validation. A fixed random_state makes the
# shuffled split reproducible, so the metrics computed from it can be
# regenerated exactly after a kernel restart.
Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.3, random_state=42)

In [47]:
# Baseline: ridge regression with the default regularization strength.
m = Ridge(alpha=1.0)

In [48]:
# Fit the baseline on the training part of the split.
m.fit(X=Xtrain, y=ytrain)

Ridge()

In [15]:
# Predictions on the data the model was fitted on (for the training error).
y_pred_train = m.predict(X=Xtrain)

In [17]:
# Training RMSE. Keyword arguments make the y_true / y_pred roles explicit;
# MSE is symmetric in its two arguments, so the value is unaffected.
mse_train = mean_squared_error(y_true=ytrain, y_pred=y_pred_train)
rmse_train = np.sqrt(mse_train)
rmse_train

10.097706403065759

In [18]:
# Predictions on the held-out validation rows.
y_pred_val = m.predict(X=Xval)

In [19]:
# Validation RMSE. Keyword arguments make the y_true / y_pred roles explicit;
# MSE is symmetric in its two arguments, so the value is unaffected.
mse_val = mean_squared_error(y_true=yval, y_pred=y_pred_val)
rmse_val = np.sqrt(mse_val)
rmse_val

10.103147167680442

### Примитивные модели (dummy models)

#### Константная модель (constant model)

In [22]:
# The constant model always answers with the mean of the training target.
constant = ytrain.mean()
constant

80.16401562988939

In [24]:
# One copy of the constant prediction per validation row.
n_val_rows = len(Xval)
yval_constant = np.ones(n_val_rows) * constant

In [25]:
# Validation RMSE of the constant model. Keyword arguments make the
# y_true / y_pred roles explicit; MSE is symmetric, so the value is unaffected.
mse_constant_val = mean_squared_error(y_true=yval, y_pred=yval_constant)
rmse_constant_val = np.sqrt(mse_constant_val)
rmse_constant_val

11.30381934386592

#### Модель с произвольным ответом (random model)

In [26]:
# Range of the training target; the random model samples inside it.
ymin = ytrain.min()
ymax = ytrain.max()

In [31]:
# Random model: one uniform draw from [ymin, ymax) per validation row.
# A seeded generator is used so that this baseline (and its RMSE) is
# reproducible across kernel restarts instead of changing on every run.
rng = np.random.default_rng(42)
yval_random = rng.random(Xval.shape[0])*(ymax - ymin) + ymin

In [32]:
# Validation RMSE of the random model. Keyword arguments make the
# y_true / y_pred roles explicit; MSE is symmetric, so the value is unaffected.
mse_random_val = mean_squared_error(y_true=yval, y_pred=yval_random)
rmse_random_val = np.sqrt(mse_random_val)
rmse_random_val

35.481755486555585

### Более корректный способ оценки качества модели

In [39]:
# Cross-validated estimate of the mean-predictor baseline (7 folds).
# The estimator gets its own name instead of reusing `m`: cross_val_score fits
# clones and leaves the object passed in unfitted, so rebinding `m` here would
# replace the fitted Ridge model and make the later `m.predict(Xtest)` fail
# when the notebook is run top to bottom.
dummy_model = DummyRegressor(strategy='mean')
# make_scorer(mean_squared_error) yields one plain (positive) MSE per fold.
scores = cross_val_score(dummy_model, X, y, cv=7, scoring=make_scorer(mean_squared_error))

In [41]:
# Convert the per-fold MSE values into per-fold RMSE values.
scores_rmse = np.sqrt(scores)

In [44]:
# Mean and spread of the RMSE across the cross-validation folds.
scores_rmse.mean(), scores_rmse.std()

(11.297883599409849, 0.1554322405045522)

### Применение модели

In [46]:
# Load the test features (semicolon-separated CSV; no target column).
# NOTE(review): this path is relative to the working directory, while the
# training data is read from an absolute path - confirm both resolve in the
# same environment.
df_test = pd.read_csv(
    './data/ICOADS_1970th_TEST-FEATURES.csv',
    sep=';',
    index_col=None,
)

In [50]:
# Check that the test file exposes the same feature columns as the training data.
df_test.columns

Index(['YYYY', 'MM', 'DD', 'HHfloat', 'LAT', 'LON', 'TrueWindDirection',
       'WindSpeed', 'PresentWeatherCode', 'SLPhPa', 'AirTemperature', 'SST',
       'TCA', 'LCA', 'LCT'],
      dtype='object')

In [51]:
# Select the same feature columns, in the same order, as used for training.
Xtest = df_test.loc[:, train_features]

In [52]:
# Predict the target for the test rows with the model bound to `m`;
# `m` must already be fitted at this point.
ytest_pred = m.predict(X=Xtest)

In [56]:
# Sanity check before saving: a flat vector of the expected length.
# NOTE(review): 1773180 is presumably the number of rows in the test file - confirm.
expected_shape = (1773180,)
assert ytest_pred.shape == expected_shape, 'something is wrong with the results'

In [58]:
# Write the predictions to a .npy file in the working directory.
output_path = 'demo-team-output.v1.npy'
np.save(output_path, ytest_pred)

Файл с вычисленными результатами (здесь он называется `demo-team-output.v1.npy`, но имя может быть любым по вашему желанию) нужно переслать модератору хакатона, который оценит качество модели и внесёт результаты в таблицу рекордов.