In [None]:
pip install pyarrow

^C
Note: you may need to restart the kernel to use updated packages.


Defaulting to user installation because normal site-packages is not writeable
Collecting pyarrow
  Downloading pyarrow-19.0.1-cp39-cp39-win_amd64.whl (25.5 MB)
     --------------------------------------- 25.5/25.5 MB 76.6 kB/s eta 0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-19.0.1


In [9]:

import warnings
warnings.simplefilter('ignore')
import gc
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
import polars as pl
from tqdm import tqdm
from sklearn.model_selection import KFold
import catboost as ctb

def process(filepath):
    df = pl.read_csv(filepath)
    df = df[['TIME', 'V']].with_columns(
        pl.col('TIME').str.to_datetime("%Y-%m-%d %H:%M:%S").alias('datetime')
    ).filter(
        pl.col('V') >= 0
    )
    df = df.with_columns(
        pl.col('datetime').dt.year().alias('year'),
        pl.col('datetime').dt.month().alias('month'),
        pl.col('datetime').dt.day().alias('day'),
        pl.col('datetime').dt.hour().alias('hour'),
    )
    return df

def process_data(filepath):
    data = pl.read_csv(filepath)
    data = data[['SENID', 'TIME', 'V']].with_columns(
        pl.col('TIME').str.to_datetime("%Y-%m-%d %H:%M:%S").alias('datetime')
    )
    data = data.with_columns(
        pl.col('datetime').dt.year().alias('year'),
        pl.col('datetime').dt.month().alias('month'),
        pl.col('datetime').dt.day().alias('day'),
        pl.col('datetime').dt.hour().alias('hour'),
        pl.col('datetime').dt.minute().alias('minute'),
    )
    data = data.sort('datetime').filter(
        pl.col('minute') == 0
    ).drop(['minute'])
    return data

def make_features(data):
    features = data.group_by('datetime').agg(
        pl.col('V').sum().alias('v_sum'),
        pl.col('V').mean().alias('v_mean'),
        pl.col('V').std().alias('v_std'),
        pl.col('V').max().alias('v_max'),
        pl.col('V').min().alias('v_min'),
        pl.col('V').skew().alias('v_skew')
    )
    features = features.with_columns(
        pl.col('v_sum').diff().alias('v_sum_diff'),
        pl.col('v_mean').diff().alias('v_mean_diff')
    )
    w = 8 
    for _ in range(90):
        features = features.with_columns(
            pl.col('v_mean').rolling_mean(w).alias(f'v_mean_rolling_mean_{w}'),
        )
        w += 8
    return features
    
train_df = process('./A榜/train/A-入库流量（2014-2019）.csv')
train_data = process_data('./A榜/train/A-雨量水位（2014-2019）.csv')
features = make_features(train_data)
train_df = train_df.join(features, on='datetime', how='left')

feature_names = [c for c in train_df.columns if c not in ['TIME', 'V', 'datetime', 'year']]
df_train = train_df.to_pandas()

df_train = train_df.filter(
    pl.col('year') != 2019
)

df_valid = train_df.filter(
    pl.col('year') == 2019
)

x_train = df_train[feature_names].to_numpy()
y_train = df_train['V'].to_numpy()
x_valid = df_valid[feature_names].to_numpy()
y_valid = df_valid['V'].to_numpy()



ModuleNotFoundError: No module named 'pyarrow'

In [None]:
ctb_model = ctb.CatBoostRegressor(
    task_type="CPU",  
    learning_rate=0.1,    
    grow_policy='Lossguide',
    n_estimators=1000,
)
ctb_model.fit(
    x_train, y_train,
    eval_set=(x_valid, y_valid),
    verbose=False
)
ctb_val_pred = ctb_model.predict(x_valid)

score = 1 / (root_mean_squared_error(df_valid['V'], ctb_val_pred) + 1)
print('score:', score)

# 全量数据再训练一次

x_train = train_df[feature_names].to_numpy()
y_train = train_df['V'].to_numpy()

ctb_model = ctb.CatBoostRegressor(
    task_type="CPU",  
    learning_rate=0.1,    
    grow_policy='Lossguide',
    n_estimators=1000,
)
ctb_model.fit(
    x_train, y_train,
    verbose=False
)

test = pl.datetime_range(start=pl.datetime(2020, 1, 1, 0, 0, 0), 
                         end=pl.datetime(2021, 12, 31, 23, 59, 59), 
                         interval='1h',
                         eager=True).alias('datetime').to_frame().with_columns(
                            pl.col('datetime').dt.month().alias('month'),
                            pl.col('datetime').dt.day().alias('day'),
                            pl.col('datetime').dt.hour().alias('hour'),
                         )
test = test.with_columns(
    pl.col('datetime').dt.strftime('%Y-%m-%d %H:%M:%S').alias('TIME')
)

test_data = process_data('./data/test/A-雨量水位（2020-2021）.csv')
features = make_features(test_data)
test = test.join(features, on='datetime', how='left')

pred_test = ctb_model.predict(test[feature_names].to_pandas())
test = test.with_columns(
    pl.lit(pred_test).alias('V')    
)

sub = test.with_columns(
    pl.col('datetime').dt.strftime('%Y-%m-%d %H:%M:%S').alias('TIME')
).select(
    pl.col('TIME'),
    pl.col('V')
)
sub.write_csv(f'ctb_baseline_{score}.csv')