In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm.notebook import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier, CatBoostRegressor

In [62]:
# Load the three competition tables from the Kaggle input directory.
path = '/kaggle/input/rutube-dataset/' # Path to your data directory: /path/to/data/
data = pd.read_csv(path + 'train_events.csv')      # training events
video = pd.read_csv(path + 'video_info_v2.csv')    # video info table
targets = pd.read_csv(path + 'train_targets.csv')  # training targets

In [63]:
# Load an additional video table stored as parquet.
# NOTE(review): `video2` is not referenced again in the visible part of this notebook.
path_video = '/kaggle/input/video-info-sentence/video_info_sentence_trans.parquet'
video2 = pd.read_parquet(path_video)

In [64]:
# Load the video embedding table; note that `path_video` is re-assigned here.
path_video = '/kaggle/input/video-sentence-max/video_embed_cols.csv'
video_m = pd.read_csv(path_video)

In [65]:
# Drop the first column by position.
# NOTE(review): presumably an index column written out with the CSV - confirm.
video_m = video_m.drop(video_m.columns[0], axis=1)

In [66]:
columns = video_m.columns.tolist()

# Rename the first 256 columns (by position) to embedding_0 ... embedding_255;
# every later column keeps its original name.
new_columns = [f'embedding_{i}' if i < 256 else col for i, col in enumerate(columns)]

# Apply the new names to the DataFrame
video_m.columns = new_columns

In [67]:
def reduce_embeddings_with_pca(df, n_components=50):
    """
    Reduce the dimensionality of the embedding columns of a DataFrame with PCA.

    Every column whose name starts with ``'embedding_'`` is treated as one
    embedding dimension. These columns are replaced by the PCA components,
    named ``reduced_embedding_0``, ``reduced_embedding_1``, ...

    :param df: DataFrame containing the embedding columns.
    :param n_components: Value passed to ``PCA(n_components=...)``.
    :return: New DataFrame with the non-embedding columns of ``df`` followed
        by the reduced embedding columns; ``df`` itself is not modified.
    """
    # Select the embedding columns
    embedding_columns = [col for col in df.columns if col.startswith('embedding_')]

    # Extract the embeddings as a plain array
    embeddings = df[embedding_columns].values

    # Fit PCA and project the embeddings
    pca = PCA(n_components=n_components)
    reduced_embeddings = pca.fit_transform(embeddings)

    # Reuse the index of ``df``: with a fresh RangeIndex the concat below would
    # align on labels and misplace (or NaN-pad) rows for any frame whose index
    # is not 0..n-1, e.g. after filtering.
    # The column names follow the actual number of components produced.
    reduced_df = pd.DataFrame(
        reduced_embeddings,
        columns=[f'reduced_embedding_{i}' for i in range(reduced_embeddings.shape[1])],
        index=df.index,
    )

    # Original non-embedding columns + reduced embeddings, side by side
    df_with_reduced_embeddings = pd.concat([df.drop(columns=embedding_columns), reduced_df], axis=1)

    return df_with_reduced_embeddings

In [68]:
# UTC offset (in hours) for every region name spelling handled by this notebook.
# Regions missing from this mapping are treated as UTC+3 by calculate_real_timestamp.
region_time_offsets = {
    'Adygeya Republic': 3,
    'Altai': 7,
    'Altay Kray': 7,
    'Amur Oblast': 9,
    'Arkhangelsk Oblast': 3,
    'Arkhangelskaya': 3,
    'Astrakhan': 4,
    'Astrakhan Oblast': 4,
    'Bashkortostan Republic': 5,
    'Belgorod Oblast': 3,
    'Bryansk Oblast': 3,
    'Buryatiya Republic': 8,
    'Chechnya': 3,
    'Chelyabinsk': 5,
    'Chukotka': 12,
    'Chuvashia': 3,
    'Crimea': 3,
    'Dagestan': 3,
    'Ingushetiya Republic': 3,
    'Irkutsk Oblast': 8,
    'Ivanovo': 3,
    'Ivanovo Oblast': 3,
    'Jaroslavl': 3,
    'Jewish Autonomous Oblast': 10,
    'Kabardino-Balkariya Republic': 3,
    'Kaliningrad': 2,
    'Kaliningrad Oblast': 2,
    'Kalmykiya Republic': 3,  # Kalmykia observes Moscow time (UTC+3), not UTC+4
    'Kaluga': 3,
    'Kaluga Oblast': 3,
    'Kamchatka': 12,
    'Karachayevo-Cherkesiya Republic': 3,
    'Karelia': 3,
    'Kemerovo Oblast': 7,
    'Khabarovsk': 10,
    'Khakasiya Republic': 7,
    'Khanty-Mansia': 5,
    'Kirov': 3,
    'Kirov Oblast': 3,
    'Komi': 3,
    'Kostroma Oblast': 3,
    'Krasnodar Krai': 3,
    'Krasnodarskiy': 3,
    'Krasnoyarsk Krai': 7,
    'Krasnoyarskiy': 7,
    'Kurgan Oblast': 5,
    'Kursk': 3,
    'Kursk Oblast': 3,
    'Kuzbass': 7,
    "Leningradskaya Oblast'": 3,
    'Lipetsk Oblast': 3,
    'Magadan Oblast': 11,
    'Mariy-El Republic': 3,
    'Mordoviya Republic': 3,
    'Moscow': 3,
    'Moscow Oblast': 3,
    'Murmansk': 3,
    'Nenets': 3,
    'Nizhny Novgorod Oblast': 3,
    'North Ossetia': 3,
    'North Ossetia–Alania': 3,
    'Novgorod Oblast': 3,
    'Novosibirsk Oblast': 7,
    'Omsk': 6,
    'Omsk Oblast': 6,
    'Orel Oblast': 3,
    'Orenburg Oblast': 5,
    'Oryol oblast': 3,
    'Penza': 3,
    'Penza Oblast': 3,
    'Perm': 5,
    'Perm Krai': 5,
    'Primorskiy (Maritime) Kray': 10,
    'Primorye': 10,
    'Pskov Oblast': 3,
    'Rostov': 3,
    'Ryazan Oblast': 3,
    'Sakha': 9,
    'Sakhalin Oblast': 11,
    'Samara Oblast': 4,
    'Saratov Oblast': 4,
    'Saratovskaya Oblast': 4,
    'Sebastopol City': 3,
    'Smolensk': 3,
    'Smolensk Oblast': 3,
    'Smolenskaya Oblast’': 3,
    'St.-Petersburg': 3,
    'Stavropol Krai': 3,
    'Stavropol Kray': 3,
    'Stavropol’ Kray': 3,
    'Sverdlovsk': 5,
    'Sverdlovsk Oblast': 5,
    'Tambov': 3,
    'Tambov Oblast': 3,
    'Tatarstan Republic': 3,
    'Tomsk Oblast': 7,
    'Transbaikal Territory': 9,
    'Tula': 3,
    'Tula Oblast': 3,
    'Tver Oblast': 3,
    'Tver’ Oblast': 3,
    'Tyumen Oblast': 5,
    'Tyumen’ Oblast': 5,
    'Tyva Republic': 7,
    'Udmurtiya Republic': 4,
    'Ulyanovsk': 4,
    'Vladimir': 3,
    'Vladimir Oblast': 3,
    # NOTE(review): Volgograd Oblast switched back to UTC+3 at the end of 2020;
    # confirm against the event dates whether 4 is still the right offset.
    'Volgograd Oblast': 4,
    'Vologda': 3,
    'Vologda Oblast': 3,
    'Voronezh Oblast': 3,
    'Voronezj': 3,
    'Yamalo-Nenets': 5,
    'Yaroslavl Oblast': 3,
    'Zabaykalskiy (Transbaikal) Kray': 9
}

def calculate_real_timestamp(df):
    """
    Add a ``real_timestamp`` column holding the event time shifted to the
    viewer's regional time.

    The shift is ``region_time_offsets[region] - 3`` hours, i.e. the difference
    between the region's UTC offset and UTC+3. Regions missing from
    ``region_time_offsets`` (including NaN) are treated as UTC+3, so their
    timestamps are left unshifted.

    :param df: DataFrame with ``region`` and ``event_timestamp`` columns.
    :return: New DataFrame in which ``event_timestamp`` is converted with
        ``pd.to_datetime`` and ``real_timestamp`` is added. The input frame is
        not modified (previously the helper/extra columns were written into
        the caller's frame as a side effect).
    """
    # Per-row UTC offset; default to UTC+3 (Moscow) for unknown regions
    region_offset = df['region'].map(region_time_offsets).fillna(3)

    # Convert event_timestamp to datetime (no-op if already converted)
    event_timestamp = pd.to_datetime(df['event_timestamp'])

    # Shift by the difference between the regional offset and UTC+3
    real_timestamp = event_timestamp + pd.to_timedelta(region_offset - 3, unit='h')

    # assign() returns a new frame, so the caller's object stays untouched
    return df.assign(event_timestamp=event_timestamp, real_timestamp=real_timestamp)

In [69]:
class CategoryReducer:
    """
    Collapse rare category values into the single label ``'another_value'``.

    ``config['top_config']`` maps a column name to the number of most frequent
    values to keep for that column.
    """

    def __init__(self, config):
        self.config = config
        # column name -> list of values kept as-is (filled by fit)
        self.cat_values = {}

    def fit(self, data):
        """
        Record the most frequent values of every configured column.

        :param data: DataFrame containing every column named in
            ``config['top_config']``.
        """
        top_config = self.config['top_config']
        for column, take in top_config.items():
            # value_counts() ignores NaN, so NaN is never a kept value
            top_values = data[column].value_counts().nlargest(take).index.to_list()
            self.cat_values[column] = top_values

    def transform(self, data):
        """
        Replace every value that is not among the fitted top values (including
        missing values) with ``'another_value'``.

        :param data: DataFrame containing the fitted columns.
        :return: Modified copy of ``data``; the input is left untouched.
        """
        data_copy = data.copy()
        for column, top_values in self.cat_values.items():
            # Vectorized membership test instead of a per-row Python lambda
            # doing a linear scan of the list (slow for large event tables).
            keep = data_copy[column].isin(top_values)
            data_copy[column] = data_copy[column].where(keep, 'another_value')
        return data_copy

In [70]:
class TargetEncoder:
    """
    Target-encode categorical features with the mean age and the share of
    ``'male'`` viewers observed for each feature value.

    ``config['target_enc']`` lists the feature columns to encode.
    """

    def __init__(self, config):
        feature_list = config['target_enc']
        self.feature_list = feature_list
        # feature -> {'mean_age': Series, 'prob_male': Series}, indexed by feature value
        self.feature_encoding = {}

    def fit(self, feature_df, target_df):
        """
        Compute the per-value statistics for every configured feature.

        :param feature_df: DataFrame with ``viewer_uid`` and the feature columns.
        :param target_df: DataFrame with ``viewer_uid``, ``age`` and ``sex``.
        """
        # Attach the targets to every row of feature_df
        merged_df = feature_df.merge(target_df, on='viewer_uid', how='left')

        # 1.0 / 0.0 for known sex, NaN where the left merge found no target.
        # Without the mask, rows lacking a target counted as "not male" and
        # pulled prob_male down, while mean_age (which skips NaN) was unaffected.
        is_male = merged_df['sex'].eq('male').astype(float).where(merged_df['sex'].notna())

        for feature in self.feature_list:
            encoding = {}
            # Mean age per feature value (NaN ages are skipped)
            encoding['mean_age'] = merged_df.groupby(feature)['age'].mean()
            # Share of 'male' per feature value (rows without a target are skipped)
            encoding['prob_male'] = is_male.groupby(merged_df[feature]).mean()

            self.feature_encoding[feature] = encoding

    def transform(self, feature_df):
        """
        Add ``<feature>_mean_age`` and ``<feature>_prob_male`` columns.

        Values not seen during ``fit`` are mapped to NaN.

        :param feature_df: DataFrame containing the fitted feature columns.
        :return: ``(transformed_df, features_result)`` - a copy of the input
            with the new columns, and the list of the new column names.
        """
        # Work on a copy so the caller's frame is not modified
        transformed_df = feature_df.copy()

        features_result = []

        for feature, encoding in self.feature_encoding.items():
            transformed_df[f'{feature}_mean_age'] = transformed_df[feature].map(encoding['mean_age'])
            transformed_df[f'{feature}_prob_male'] = transformed_df[feature].map(encoding['prob_male'])
            features_result += [f'{feature}_mean_age', f'{feature}_prob_male']

        return transformed_df, features_result


In [71]:
def transform_each_cat(data2, feature_name, feature_values, aggregation_method, agg_feature):
    """
    Build per-viewer share features for every value of a categorical feature.

    One column is produced for each value in ``feature_values`` plus the
    catch-all ``'another_value'``:

    * ``'count'`` - column ``f"{feature_name}_{value}"``: share of the viewer's
      events whose ``feature_name`` equals ``value``.
    * ``'sum'`` - column ``f"{feature_name}_{value}_sum"``: share of the
      viewer's total ``agg_feature`` that falls on events with that value.
      Viewers with no such event get NaN.

    :param data2: Event-level DataFrame with ``viewer_uid``, ``feature_name``
        and (for ``'sum'``) ``agg_feature`` columns. It is not modified.
    :param feature_name: Name of the categorical column.
    :param feature_values: Values to build columns for.
    :param aggregation_method: ``'count'`` or ``'sum'``.
    :param agg_feature: Numeric column used by the ``'sum'`` method.
    :return: DataFrame with ``viewer_uid`` and one column per value, sorted by
        ``viewer_uid``; an empty DataFrame if ``feature_values`` is empty.
    :raises ValueError: If ``aggregation_method`` is not supported.
    """
    # Nothing to process
    if len(feature_values) == 0:
        return pd.DataFrame()

    if aggregation_method not in ('count', 'sum'):
        raise ValueError(f"Unknown aggregation method: {aggregation_method}")

    # Per-viewer denominator, computed once (it does not depend on the value)
    if aggregation_method == 'count':
        denominator = data2.groupby('viewer_uid').size()
    else:
        denominator = data2.groupby('viewer_uid')[agg_feature].sum()

    viewer_ids = data2['viewer_uid'].to_numpy()
    shares = {}

    for value in tqdm(list(feature_values) + ['another_value'], desc=f"Processing {feature_name} values"):
        matches = data2[feature_name] == value

        if aggregation_method == 'count':
            col_name = f"{feature_name}_{value}"
            # Number of matching events per viewer (0 for viewers without any)
            numerator = matches.groupby(viewer_ids).sum()
        else:
            col_name = f"{feature_name}_{value}_sum"
            # Only viewers having at least one matching event appear here
            numerator = data2.loc[matches].groupby('viewer_uid')[agg_feature].sum()

        shares[col_name] = numerator / denominator.reindex(numerator.index)

    # A single column-wise concat replaces the chain of outer merges. It also
    # keeps the column of a value that has no rows at all; the previous
    # `result.empty` check silently dropped such a leading column, which could
    # leave train and test with different feature sets.
    result = pd.concat(shares, axis=1).sort_index().rename_axis('viewer_uid')

    return result.reset_index()

In [72]:
# Feature-engineering configuration consumed by CategoryReducer, TargetEncoder
# and Encoder:
#   'top_config' - column -> number of most frequent values kept as-is by
#                  CategoryReducer; everything else becomes 'another_value'.
#   'cat_enc'    - [column, aggregation method ('count' or 'sum'), numeric column
#                  used by 'sum'] triples passed to transform_each_cat.
#   'target_enc' - columns encoded by TargetEncoder (mean age / share of 'male').
config = {
    'top_config':
        {
        "region" : 50,
        "ua_device_type" : 2,
        "ua_client_type" : 2,
        "ua_os" : 6,
        "ua_client_name" : 10,
        "category" : 50,
        "author_id" : 1000,
        'rutube_video_id': 1000,
        'view_hour' : 24
        },
    'cat_enc':
    [
        ["region", 'count', 'total_watchtime'],
        ["ua_device_type", 'count', 'total_watchtime'],
        ["ua_client_type", 'count', 'total_watchtime'],
        ["ua_os", 'count', 'total_watchtime'],
        ["ua_client_name", 'count', 'total_watchtime'],
        ["category", 'sum', 'total_watchtime'],
        ["category", 'count', 'total_watchtime'],
        ["view_hour", 'count', 'total_watchtime'],
        ["view_hour", 'sum', 'total_watchtime']
    ],
    'target_enc':
    [
        'rutube_video_id',
        'author_id',
        "region",
        "ua_device_type",
        "ua_client_type",
        "ua_os",
        "ua_client_name",
        "category"
    ]
}

class Encoder:
    """
    Feature pipeline turning event-level rows into one feature row per viewer.

    Intended call order (as used in this notebook):
    ``pred_transform`` -> ``fit1`` -> ``transform_1_to_1`` -> ``fit2`` ->
    ``_transform2`` -> ``transform_many_to_1``; the ``fit*`` steps are run on
    training data only.
    """

    def __init__(self, video, config):
        self.config = config
        self.video = video                       # video-level table joined onto the events
        self.cat_values = dict()                 # column -> kept top values (set by fit1)
        self.target_encoder = TargetEncoder(config)
        self.category_reducer = CategoryReducer(config)
        self.target_features = None              # names of target-encoded columns (set by _transform2)

    def pred_transform(self, data):
        """
        Join the video table onto the events and derive time / watch-share columns.

        :param data: Event-level DataFrame with ``rutube_video_id``.
        :return: Merged DataFrame with ``real_timestamp``, ``share_view``,
            ``view_hour`` and ``seconds_from_epoch`` added.
        """
        # NOTE(review): merge() defaults to an inner join, so events whose video
        # is missing from self.video are dropped here - confirm this is intended.
        merged = data.merge(self.video, on=("rutube_video_id"))

        merged = calculate_real_timestamp(merged)
        merged['share_view'] = 1000 * merged['total_watchtime'] / merged['duration']
        # Hour of the view (0..23)
        merged['view_hour'] = merged['real_timestamp'].dt.hour

        # Unix time of the view, in seconds
        merged['seconds_from_epoch'] = merged['real_timestamp'].apply(lambda x: x.timestamp())

        return merged

    def fit1(self, merged):
        """Fit the category reducer (top values per column) on ``merged``."""
        self.merged = merged
        self.category_reducer.fit(merged)
        self.cat_values = self.category_reducer.cat_values

    def transform_1_to_1(self, merged):
        """Row-preserving step: collapse rare category values."""
        merged = self.category_reducer.transform(merged)

        return merged

    def fit2(self, data2, targets):
        """Fit the target encoder on event rows and viewer targets."""
        self.target_encoder.fit(data2, targets)

    def _transform2(self, data2):
        """Row-preserving step: add the target-encoded columns."""
        result, target_features = self.target_encoder.transform(data2)
        self.target_features = target_features
        return result

    def transform_many_to_1(self, data2):
        """
        Aggregate event rows into one row per ``viewer_uid``.

        Produces the categorical share features described by
        ``config['cat_enc']`` and, for every numeric column except
        ``viewer_uid`` and ``author_id``, the columns ``<col>_mean``,
        ``<col>_max``, ``<col>_min``, ``<col>_std`` and ``<col>_sum``.

        :param data2: Event-level DataFrame after the row-preserving steps.
        :return: Viewer-level DataFrame; empty if nothing was produced.
        """
        def update_result(result, new_result):
            # Frames without the key (e.g. the empty frame returned by
            # transform_each_cat for an empty value list) cannot be merged.
            if 'viewer_uid' not in new_result.columns:
                return result
            if result is None:
                return new_result
            return result.merge(new_result, on='viewer_uid', how='outer')

        result = None

        # Categorical share features
        for feature_name, agg_method, agg_feature in self.config['cat_enc']:
            cat_values_feature = self.cat_values[feature_name]
            # A copy is passed because transform_each_cat may add columns to its input
            new_result = transform_each_cat(data2.copy(), feature_name, cat_values_feature, agg_method, agg_feature)
            result = update_result(result, new_result)

        # Numeric features: ids are excluded from aggregation
        numeric_features = [
            col for col in data2.select_dtypes(include=['number']).columns
            if col not in ('viewer_uid', 'author_id')
        ]

        if numeric_features:
            # One groupby over all numeric columns instead of one groupby plus
            # one outer merge per column; resulting columns and order are the same.
            grouped_df = data2.groupby('viewer_uid')[numeric_features].agg(
                ['mean', 'max', 'min', 'std', 'sum'])

            # Flatten the (column, statistic) MultiIndex to "<column>_<statistic>"
            grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]
            result = update_result(result, grouped_df.reset_index())

        return result if result is not None else pd.DataFrame()

In [73]:
# Load the test events; they are used below as the "validation" set.
data_test = pd.read_csv('/kaggle/input/rutube-test-dataset/test_events.csv')

# TRAIN_IDS holds every viewer present in `data`, so the isin() filter on `data`
# below keeps all of its rows.
TRAIN_IDS = data['viewer_uid'].unique()
# NOTE(review): VAL_IDS is not used anywhere in the visible part of the notebook.
VAL_IDS = data_test['viewer_uid'].unique()

train_events = data[data['viewer_uid'].isin(TRAIN_IDS)]
# Keep only targets of viewers that have training events
train_targets = targets[targets['viewer_uid'].isin(TRAIN_IDS)]

val_events = data_test

In [74]:
# Replace the embedding_* columns of video_m with 16 PCA components.
video_emb = reduce_embeddings_with_pca(video_m, 16)

In [75]:
# Join the video info table onto the reduced embeddings (default inner join on rutube_video_id).
video_emb = video_emb.merge(video, on='rutube_video_id')

In [76]:
# Feature pipeline configured with the video table and the feature config.
enc = Encoder(video_emb, config)

In [77]:
# Training feature pipeline. `train_events` is re-bound at every step and ends
# up as one row per viewer.
train_events = enc.pred_transform(train_events)       # join video info, add time features
enc.fit1(train_events)                                # learn top category values
train_events = enc.transform_1_to_1(train_events)     # collapse rare categories
# NOTE(review): the target encoder is fitted and applied on the same rows,
# which may leak the targets into the training features - confirm this is acceptable.
enc.fit2(train_events, targets)
train_events = enc._transform2(train_events)          # add target-encoded columns
train_events = enc.transform_many_to_1(train_events)  # aggregate to viewer level

Processing region values:   0%|          | 0/51 [00:00<?, ?it/s]

Processing ua_device_type values:   0%|          | 0/3 [00:00<?, ?it/s]

Processing ua_client_type values:   0%|          | 0/3 [00:00<?, ?it/s]

Processing ua_os values:   0%|          | 0/7 [00:00<?, ?it/s]

Processing ua_client_name values:   0%|          | 0/11 [00:00<?, ?it/s]

Processing category values:   0%|          | 0/41 [00:00<?, ?it/s]

Processing category values:   0%|          | 0/41 [00:00<?, ?it/s]

Processing view_hour values:   0%|          | 0/25 [00:00<?, ?it/s]

Processing view_hour values:   0%|          | 0/25 [00:00<?, ?it/s]

In [78]:
# Apply the already fitted pipeline to the test events (no fit* calls here).
val_events = enc.pred_transform(val_events)
val_events = enc.transform_1_to_1(val_events)
val_events = enc._transform2(val_events)
val_events = enc.transform_many_to_1(val_events)

Processing region values:   0%|          | 0/51 [00:00<?, ?it/s]

Processing ua_device_type values:   0%|          | 0/3 [00:00<?, ?it/s]

Processing ua_client_type values:   0%|          | 0/3 [00:00<?, ?it/s]

Processing ua_os values:   0%|          | 0/7 [00:00<?, ?it/s]

Processing ua_client_name values:   0%|          | 0/11 [00:00<?, ?it/s]

Processing category values:   0%|          | 0/41 [00:00<?, ?it/s]

Processing category values:   0%|          | 0/41 [00:00<?, ?it/s]

Processing view_hour values:   0%|          | 0/25 [00:00<?, ?it/s]

Processing view_hour values:   0%|          | 0/25 [00:00<?, ?it/s]

In [79]:
def get_data(events, target=None):
    """
    Split a viewer-level feature table into model inputs and targets.

    :param events: DataFrame with ``viewer_uid`` and the feature columns.
    :param target: Optional DataFrame with ``viewer_uid``, ``age``,
        ``age_class`` and ``sex``.
    :return: ``(X, y)``. With ``target``, only viewers present in both frames
        are kept; ``X`` holds the features and ``y`` the three target columns.
        Without ``target``, ``X`` is ``events`` minus ``viewer_uid`` and ``y``
        is ``None``.
    """
    target_columns = ['age', 'age_class', 'sex']

    # Unlabelled data: just strip the identifier
    if target is None:
        return events.drop(columns=['viewer_uid']), None

    # Inner join keeps only viewers that have both features and targets
    labelled = events.merge(target[['viewer_uid'] + target_columns], on='viewer_uid')
    features = labelled.drop(columns=['viewer_uid'] + target_columns)
    labels = labelled[target_columns]
    return features, labels

In [80]:
from sklearn.ensemble import RandomForestClassifier

class GigaClass:
    """
    Pair of random forests predicting a viewer's sex and age.

    Both models are ``RandomForestClassifier`` instances; the age model is
    fitted on the ``age`` column and its predictions are binned into age
    classes afterwards. Missing feature values are replaced with
    ``MISSING_VALUE`` before fitting and predicting.
    """

    # Age class boundaries (see the data description file of the competition)
    AGE_CLASS_BINS = [9, 20, 30, 40, 60]
    AGE_CLASS_LABELS = [0, 1, 2, 3]
    # Placeholder used for missing feature values
    MISSING_VALUE = -1.0

    def __init__(self, random_state=42):
        self.random_state = random_state

    def load_data(self, X_train, y_train, X_val, y_val):
        """Store the training and validation sets (``y_val`` may be ``None``)."""
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val

    def train(self, **params):
        """
        Fit the sex and age models on the stored training data.

        :param params: Keyword arguments forwarded to ``RandomForestClassifier``.
        """
        self.model_sex = RandomForestClassifier(**params, random_state=self.random_state)
        self.model_age = RandomForestClassifier(**params, random_state=self.random_state)

        # Fill missing values once and reuse the frame for both models
        X_filled = self.X_train.fillna(self.MISSING_VALUE)

        self.model_sex.fit(X_filled, self.y_train['sex'])
        print('----------------------------------')
        self.model_age.fit(X_filled, self.y_train['age'])

    def _to_age_class(self, age_predicts):
        """Bin predicted ages into age classes; ages outside the bins become NaN."""
        return pd.Series(
            pd.cut(age_predicts, bins=self.AGE_CLASS_BINS, labels=self.AGE_CLASS_LABELS),
            name='age_class',
        )

    def _plot_importances(self, model, top_n=40):
        """Bar plot of the ``top_n`` largest feature importances of ``model``."""
        # scikit-learn forests expose importances as the `feature_importances_`
        # attribute; `get_feature_importance()` is the CatBoost API and raised
        # AttributeError here. Feature names come from the stored training
        # frame rather than from a notebook-level global.
        feature_importance_df = pd.DataFrame({
            'Feature': self.X_train.columns,
            'Importance': model.feature_importances_
        }).sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(10, 8))
        sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(top_n))
        # Title now states the number of features actually shown
        plt.title(f'Top {top_n} Most Important Features')
        plt.xlabel('Importance')
        plt.ylabel('Features')
        plt.tight_layout()
        plt.show()

    def plot_sex_importances(self):
        """Plot the most important features of the sex model."""
        self._plot_importances(self.model_sex)

    def plot_age_importances(self):
        """Plot the most important features of the age model."""
        self._plot_importances(self.model_age)

    def test(self, X_test=None, y_test=None):
        """
        Score both models.

        :param X_test: Features; defaults to the stored validation features.
        :param y_test: Targets with ``sex`` and ``age_class``; defaults to the
            stored validation targets.
        :return: ``(acc, f1, total)`` - sex accuracy, weighted F1 of the age
            class and ``0.3 * acc + 0.7 * f1``.
        """
        if X_test is None:
            X_test = self.X_val
        if y_test is None:
            y_test = self.y_val

        # Same missing-value handling as in train() and predict()
        X_filled = X_test.fillna(self.MISSING_VALUE)
        sex_predicts = self.model_sex.predict(X_filled)
        age_predicts = self.model_age.predict(X_filled)

        age_class = self._to_age_class(age_predicts)

        acc = accuracy_score(sex_predicts, y_test['sex'])
        f1 = f1_score(y_test['age_class'], age_class, average='weighted')

        total = 0.3 * acc + 0.7 * f1

        print(f'Accuracy: {acc} \n F1: {f1} \n Total: {total}')

        return acc, f1, total

    def predict(self, X_test=None, viewer_uids=None):
        """
        Build a submission frame with the predicted sex and age class.

        :param X_test: Features; defaults to the stored validation features.
        :param viewer_uids: Viewer ids in the same row order as ``X_test``.
            When omitted, the ``viewer_uid`` column of the notebook-level
            ``val_events`` frame is used (the previous, implicit behaviour).
        :return: DataFrame with columns ``viewer_uid``, ``sex``, ``age_class``.
        :raises ValueError: If the number of ids differs from the number of
            predictions.
        """
        if X_test is None:
            X_test = self.X_val
        if viewer_uids is None:
            # Backward-compatible fallback to the notebook global
            viewer_uids = val_events['viewer_uid']

        X_filled = X_test.fillna(self.MISSING_VALUE)
        sex_predicts = self.model_sex.predict(X_filled)
        age_predicts = self.model_age.predict(X_filled)

        # Ids are taken positionally so an unusual index cannot misalign rows
        submission = pd.DataFrame({
            'viewer_uid': pd.Series(viewer_uids).to_numpy(),
            'sex': sex_predicts,
            'age_class': self._to_age_class(age_predicts),
        })

        return submission

In [81]:
# Aliases for the viewer-level feature tables (no copy is made).
train_events_prepared = train_events
val_events_prepared = val_events

In [82]:
# Build the model inputs: labelled training data and unlabelled test data
# (y_test is None because no targets are passed for the test events).
X_train, y_train = get_data(train_events_prepared, train_targets)
X_test, y_test = get_data(val_events_prepared, None)

model = GigaClass()

model.load_data(X_train, y_train, X_test, y_test)
# verbose=2 makes scikit-learn print one log line per tree.
model.train(n_estimators=300, max_depth=20, verbose=2, n_jobs=-1) # 800
# Predict on the stored test features
submission = model.predict()

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 300
building tree 2 of 300
building tree 3 of 300
building tree 4 of 300
building tree 5 of 300
building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.3s


building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 300
building tree 47 of 300
building tree 48 of 300
building tree 49 of 300
building tree 50 of 300
building tree 51 of 300
building tree 52 of 300
building tree 53 of 300
building tree 54 of 300
building tree 55 of 300
building tree 56 of 300
building tree 57 of 300
building tree 58 of 300
building tree 59 of 300
building tree 60 of 300
building tree 61 of 300
building tree 62 of 300
building tree 63 of 300
building tree 64 of 300
building tree 65 of 300
building tree 66 of 300
building tree 67 of 300
building tree 68 of 300
building tree 69 of 300
building tree 70 of 300
building tree 71 of 300
building tree 72 of 300
building tree 73 of 300
building tree 74 of 300
building tree 75 of 300
building tree 76 of 300
building tree 77 of 300
building tree 78 of 300
building tree 79

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.9min


building tree 159 of 300
building tree 160 of 300
building tree 161 of 300
building tree 162 of 300
building tree 163 of 300
building tree 164 of 300
building tree 165 of 300
building tree 166 of 300
building tree 167 of 300
building tree 168 of 300
building tree 169 of 300
building tree 170 of 300
building tree 171 of 300
building tree 172 of 300
building tree 173 of 300
building tree 174 of 300
building tree 175 of 300
building tree 176 of 300
building tree 177 of 300
building tree 178 of 300
building tree 179 of 300
building tree 180 of 300
building tree 181 of 300
building tree 182 of 300
building tree 183 of 300
building tree 184 of 300
building tree 185 of 300
building tree 186 of 300
building tree 187 of 300
building tree 188 of 300
building tree 189 of 300
building tree 190 of 300
building tree 191 of 300
building tree 192 of 300
building tree 193 of 300
building tree 194 of 300
building tree 195 of 300
building tree 196 of 300
building tree 197 of 300
building tree 198 of 300


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.6min finished


----------------------------------


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 300
building tree 2 of 300
building tree 3 of 300
building tree 4 of 300
building tree 5 of 300
building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   35.5s


building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 300
building tree 47 of 300
building tree 48 of 300
building tree 49 of 300
building tree 50 of 300
building tree 51 of 300
building tree 52 of 300
building tree 53 of 300
building tree 54 of 300
building tree 55 of 300
building tree 56 of 300
building tree 57 of 300
building tree 58 of 300
building tree 59 of 300
building tree 60 of 300
building tree 61 of 300
building tree 62 of 300
building tree 63 of 300
building tree 64 of 300
building tree 65 of 300
building tree 66 of 300
building tree 67 of 300
building tree 68 of 300
building tree 69 of 300
building tree 70 of 300
building tree 71 of 300
building tree 72 of 300
building tree 73 of 300
building tree 74 of 300
building tree 75 of 300
building tree 76 of 300
building tree 77 of 300
building tree 78 of 300
building tree 79

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.5min


building tree 160 of 300
building tree 161 of 300
building tree 162 of 300
building tree 163 of 300
building tree 164 of 300
building tree 165 of 300
building tree 166 of 300
building tree 167 of 300
building tree 168 of 300
building tree 169 of 300
building tree 170 of 300
building tree 171 of 300
building tree 172 of 300
building tree 173 of 300
building tree 174 of 300
building tree 175 of 300
building tree 176 of 300
building tree 177 of 300
building tree 178 of 300
building tree 179 of 300
building tree 180 of 300
building tree 181 of 300
building tree 182 of 300
building tree 183 of 300
building tree 184 of 300
building tree 185 of 300
building tree 186 of 300
building tree 187 of 300
building tree 188 of 300
building tree 189 of 300
building tree 190 of 300
building tree 191 of 300
building tree 192 of 300
building tree 193 of 300
building tree 194 of 300
building tree 195 of 300
building tree 196 of 300
building tree 197 of 300
building tree 198 of 300
building tree 199 of 300


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.8min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    2.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    3.8s finished


In [95]:
# Write the submission file without the DataFrame index.
submission.to_csv(
    "/kaggle/working/sumb.csv",
    index=False 
)