# Baseline

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
!pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn"

## Загрузка данных

In [None]:
# Read the train and test tables from their parquet files (train first, then test).
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in ("train_data.pqt", "test_data.pqt")
)

In [None]:
# Preview the first three rows of the training frame.
train_df.head(3)

In [None]:
# Preview the first three rows of the test frame.
test_df.head(3)

In [None]:
# Show the shapes of the train and test frames as a pair.
tuple(frame.shape for frame in (train_df, test_df))

In [None]:
# Names of the columns that are cast to the pandas "category" dtype.
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

Обозначение категориальных признаков

In [None]:
# Cast the categorical columns of both frames to the "category" dtype.
# Both right-hand sides are evaluated first, then assigned train-first.
train_df[cat_cols], test_df[cat_cols] = (
    train_df[cat_cols].astype("category"),
    test_df[cat_cols].astype("category"),
)

Создаем выборки для валидации и обучения

In [None]:
# Features: the training frame without the id, date and target columns.
X = train_df.drop(columns=["id", "date", "end_cluster"])
# Target: the end_cluster column.
y = train_df["end_cluster"]

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Display the training feature frame produced by the split.
x_train

## Обучение модели

В качестве базовой модели возьмем LGBM и обучим его на всех признаках.

In [None]:
# Build a LightGBM classifier with a fixed random_state and fit it on the
# training split; the fitted estimator is the cell's displayed value.
model = LGBMClassifier(
    n_jobs=-1,
    random_state=42,
    verbosity=-1,
)
model.fit(x_train, y_train)

Зададим функцию для расчета взвешенной метрики ROC AUC.

In [None]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Return the weighted mean of per-class one-vs-rest ROC AUC scores.

    Parameters
    ----------
    y_true : array-like
        True class labels; forwarded unchanged to ``roc_auc_score``.
    y_pred : array-like
        Predicted class scores; forwarded unchanged to ``roc_auc_score``.
        NOTE(review): presumably one column per entry of ``labels``, in the
        same order -- confirm against the scikit-learn documentation.
    labels : sequence
        Class labels; forwarded as ``labels=`` and used to look up weights.
    weights_dict : mapping
        Unnormalised weight for every label in ``labels``.

    Returns
    -------
    float
        ``sum(w_i * auc_i) / sum(w_i)`` over the classes in ``labels``.

    Raises
    ------
    KeyError
        If any label has no entry in ``weights_dict``.
    """
    # Fail with a readable message instead of a bare KeyError on the first miss.
    missing = [label for label in labels if label not in weights_dict]
    if missing:
        raise KeyError(f"No weight defined for labels: {missing}")

    class_weights = np.array(
        [weights_dict[label] for label in labels], dtype=float
    )
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    # np.average normalises the weights itself, so no manual division is needed.
    return float(np.average(np.asarray(classes_roc_auc, dtype=float),
                            weights=class_weights))

In [None]:
# Load the cluster weight table and index it by the "cluster" column.
cluster_weights = pd.read_excel("cluster_weights.xlsx")
cluster_weights = cluster_weights.set_index("cluster")
# Mapping: cluster -> unnormalised weight.
weights_dict = cluster_weights.loc[:, "unnorm_weight"].to_dict()

Проверка работы модели

In [None]:
# Predict class probabilities for the validation split and show the result's shape.
y_pred_proba = model.predict_proba(x_val)
y_pred_proba.shape

In [None]:
# Weighted ROC AUC of the model on the validation split.
weighted_roc_auc(
    y_true=y_val,
    y_pred=y_pred_proba,
    labels=model.classes_,
    weights_dict=weights_dict,
)

## Прогноз на тестовой выборке

In [None]:
# Reshape the test frame to one row per id and one column per date, holding
# start_cluster values, and preview the first three rows.
test_df.pivot(index="id", columns="date", values="start_cluster").head(3)

Для того чтобы сделать прогноз на тестовой выборке, нужно заполнить стартовый кластер.<br>
В качестве базового подхода заполним все стартовые кластеры самым популярным кластером.

In [None]:
# Baseline imputation: overwrite every test start_cluster with the most
# frequent training value and store the column as a category.
test_df["start_cluster"] = pd.Series(
    train_df["start_cluster"].mode().iloc[0], index=test_df.index
).astype("category")

In [None]:
# Load the sample submission; its frame is later filled with predictions and saved.
sample_submission_df = pd.read_csv("sample_submission.csv")

In [None]:
# Shape of the sample submission frame.
sample_submission_df.shape

In [None]:
# Preview the first rows of the sample submission frame.
sample_submission_df.head()

In [None]:
# NOTE(review): repeats the shape check from two cells earlier -- candidate for removal.
sample_submission_df.shape

Для тестовой выборки будем использовать только последний месяц

In [None]:
# Keep only the "month_6" rows of the test frame and drop the id and date
# columns, which were also excluded from the training features.
last_m_test_df = (
    test_df
    .loc[test_df["date"] == "month_6"]
    .drop(columns=["id", "date"])
)

In [None]:
# Class probabilities for the last-month test rows, one column per model class.
test_pred_proba = model.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(data=test_pred_proba, columns=model.classes_)

# Put the probability columns in sorted class-label order.
sorted_classes = sorted(test_pred_proba_df.columns.tolist())
test_pred_proba_df = test_pred_proba_df.loc[:, sorted_classes]

In [None]:
# Shape of the test prediction frame.
test_pred_proba_df.shape

In [None]:
# Preview the first two rows of the test prediction frame.
test_pred_proba_df.head(2)

In [None]:
# The column assignment below aligns rows on the index (both frames carry a
# default integer index), so a row-count mismatch would silently write NaN
# probabilities into the submission. Fail loudly instead.
assert len(test_pred_proba_df) == len(sample_submission_df), (
    f"prediction rows ({len(test_pred_proba_df)}) != "
    f"submission rows ({len(sample_submission_df)})"
)
# NOTE(review): rows are matched by position, not by id -- this assumes the
# month_6 rows of test_df are in the same order as sample_submission.csv; confirm.
sample_submission_df[sorted_classes] = test_pred_proba_df
# Save the filled submission without the index column.
sample_submission_df.to_csv("baseline_submission.csv", index=False)