In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [3]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor

def fit():
    data = pd.read_csv('/content/drive/MyDrive/AgroHack/data/train.csv')
    pedigree = pd.read_csv('/content/drive/MyDrive/AgroHack/data/pedigree.csv')

    # преведем к datetime
    data['calving_date'] = pd.to_datetime(data['calving_date'])
    data['birth_date'] = pd.to_datetime(data['birth_date'])

    # стоит попробовать на данных после 2016
    data = data[data['calving_date'].dt.year >= 2020]

    # новые фичи
    data['age'] = (data['calving_date'] - data['birth_date']).dt.days
    data['calving_month'] = data['calving_date'].dt.month

    targets = ['milk_yield_' + str(i) for i in range(3, 11)]

    def get_average_previous(targets, lactation):
        return data[data['lactation'] < lactation].groupby('animal_id')[targets].mean().to_dict()

    for target in targets:
        data['mean_previous_' + target] = [-1 for i in range(len(data))]
        for lactation in [2, 3, 4]:
            dicts = get_average_previous(target, lactation)
            data['mean_previous_' + target] = (data['mean_previous_' + target]).where(data['lactation'] < lactation, data['animal_id'].apply(lambda x: dicts[x] if x in dicts.keys() else -1))

    amount_of_children = pedigree.groupby('mother_id')['animal_id'].count().to_dict()

    data['amount_children'] = data['animal_id'].apply(lambda x: amount_of_children[x] if x in amount_of_children.keys() else 0)



    #  кол-во дней с предыдщей лактации
    # data = data.sort_values(by=['animal_id', 'lactation'])
    # data['date_diff'] = data.groupby('animal_id')['calving_date'].diff().dt.days
    # data['date_diff'] = data['date_diff'].fillna(-1)

    # Потом надо убрать, теряю много данных
    data.dropna(subset=targets, inplace=True)

    # дропаю даты, а может и не надо
    # data.drop(['calving_date', 'birth_date'], axis=1, inplace=True)

    # определим категориальные фичи
    cat_features = ['lactation', 'farm', 'farmgroup', 'animal_id']
    data[cat_features] = data[cat_features].astype('category')

    # параметры для моделей

    n_estimators = [5, 3, 4, 5, 4, 3, 4, 3]

    params_cat = [{
        "n_estimators": n_estimators[i],
        #'learning_rate':0.02,
        'loss_function': 'MAE',
        # max depth
        "depth": 6,
        # features
        'colsample_bylevel': 0.8,
        # samples
        'subsample': 0.8,
        # 'early_stopping_rounds': 30,
        'cat_features': cat_features,
        # 'verbose':100,
        'random_state': 777,
        'thread_count': -1,
        # НА ИНФЕРЕНСЕ УДАЛИТЬ
        # 'task_type':'GPU',
    } for i in range(len(targets))]

    params_cat_1 = [{
        "n_estimators": [4000 for i in range(8)],
        #'learning_rate':0.02,
        'loss_function': 'RMSE',
        # max depth
        "depth": 6,
        # features
        'colsample_bylevel': 0.8,
        # samples
        'subsample': 0.8,
        # 'early_stopping_rounds': 30,
        'cat_features': cat_features,
        # 'verbose':100,
        'random_state': 1414,
        'thread_count': -1,
        # НА ИНФЕРЕНСЕ УДАЛИТЬ
        # 'task_type':'GPU',
    } for i in range(len(targets))]

    models = []
    targets = ['milk_yield_' + str(i) for i in range(3, 11)]

    # на каждый таргет отдельную модель
    for index, target in enumerate(targets):
        temp = []

        cat = CatBoostRegressor(**params_cat[index])
        cat.fit(data.drop(targets, axis=1), data[target])
        temp.append(cat)

        cat = CatBoostRegressor(**params_cat_1[index])
        cat.fit(data.drop(targets, axis=1), data[target])

        temp.append(cat)

        models.append(temp)

    return models


def predict(models, test_dataset_path):
    test = pd.read_csv('/content/drive/MyDrive/AgroHack/data/X_test_public.csv')
    data = pd.read_csv('/content/drive/MyDrive/AgroHack/data/train.csv')
    pedigree = pd.read_csv('/content/drive/MyDrive/AgroHack/data/pedigree.csv')

    # преведем к datetime
    test['calving_date'] = pd.to_datetime(test['calving_date'])
    test['birth_date'] = pd.to_datetime(test['birth_date'])

    # новые фичи
    test['age'] = (test['calving_date'] - test['birth_date']).dt.days
    test['calving_month'] = test['calving_date'].dt.month

    targets = ['milk_yield_' + str(i) for i in range(3, 11)]

    def get_average_previous(targets, lactation):
        return data[data['lactation'] < lactation].groupby('animal_id')[targets].mean().to_dict()

    for target in targets:
        test['mean_previous_' + target] = [-1 for i in range(len(test))]
        for lactation in [2, 3, 4]:
            dicts = get_average_previous(target, lactation)
            test['mean_previous_' + target] = (test['mean_previous_' + target]).where(test['lactation'] < lactation, test['animal_id'].apply(lambda x: dicts[x] if x in dicts.keys() else -1))

    amount_of_children = pedigree.groupby('mother_id')['animal_id'].count().to_dict()

    test['amount_children'] = test['animal_id'].apply(lambda x: amount_of_children[x] if x in amount_of_children.keys() else 0)

    # кол-во дней с предыдщей лактации
    # test = test.sort_values(by=['animal_id', 'lactation'])
    # test['date_diff'] = test.groupby('animal_id')['calving_date'].diff().dt.days
    # test['date_diff'] = test['date_diff'].fillna(-1)

    # определим категориальные фичи
    cat_features = ['lactation', 'farm', 'farmgroup', 'animal_id']
    test[cat_features] = test[cat_features].astype('category')

    # дропаю даты, а может и не надо
    # test.drop(['calving_date', 'birth_date'], axis=1, inplace=True)


    if 'Unnamed: 0' in test.columns:
        test.drop('Unnamed: 0', axis=1, inplace=True)

    for i in range(len(models)):
        pred_0 = models[i][0].predict(test)
        pred_1 = models[i][1].predict(test)
        test['milk_yield_' + str(i + 3)] = (pred_0 + pred_1) / 2

    test = test[['animal_id', 'lactation', 'milk_yield_3', 'milk_yield_4', 'milk_yield_5', 'milk_yield_6', 'milk_yield_7', 'milk_yield_8', 'milk_yield_9', 'milk_yield_10']]

    return test

if __name__ == '__main__':
    _model = fit()

    _submission = predict(_model, os.path.join('data', 'X_test_public.csv'))
    _submission.to_csv(os.path.join('data', 'submission.csv'), sep=',', index=False)


0:	learn: 8.7997710	total: 31.4ms	remaining: 125ms
1:	learn: 8.6125178	total: 50.5ms	remaining: 75.8ms
2:	learn: 8.4437550	total: 86.4ms	remaining: 57.6ms
3:	learn: 8.2732845	total: 114ms	remaining: 28.6ms
4:	learn: 8.1136572	total: 149ms	remaining: 0us
0:	learn: 8.8023600	total: 41.9ms	remaining: 2.26s
1:	learn: 8.6302008	total: 81.9ms	remaining: 2.17s
2:	learn: 8.4737678	total: 117ms	remaining: 2.02s
3:	learn: 8.3046024	total: 147ms	remaining: 1.87s
4:	learn: 8.1525640	total: 170ms	remaining: 1.7s
5:	learn: 7.9964291	total: 194ms	remaining: 1.59s
6:	learn: 7.8920797	total: 216ms	remaining: 1.48s
7:	learn: 7.7448090	total: 246ms	remaining: 1.44s
8:	learn: 7.6029087	total: 280ms	remaining: 1.43s
9:	learn: 7.4692694	total: 309ms	remaining: 1.39s
10:	learn: 7.3361548	total: 334ms	remaining: 1.34s
11:	learn: 7.2204590	total: 363ms	remaining: 1.3s
12:	learn: 7.0964337	total: 399ms	remaining: 1.29s
13:	learn: 6.9744830	total: 418ms	remaining: 1.23s
14:	learn: 6.8694017	total: 445ms	remainin

OSError: ignored