In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error

# Загрузка данных

В бейзлайне реализована простейшая модель на эго-графе.

Для каждой пары вершин u и v эго-графа попытаемся найти всех общих "друзей" w. Силой связи между вершинами u и v будем считать средную силу связи между ними и общими друзьями.

Поскольку обучение для такой модели не требуется, будем пользоваться только тестовой выборкой.

In [2]:
%%time

test = pd.read_csv("train.csv")
submission = pd.read_csv("submission.csv")


CPU times: total: 11.7 s
Wall time: 14.1 s


In [26]:
submission.head()

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.0
1,8,0,143,0.0
2,8,0,151,1.938045
3,8,1,24,0.048046
4,8,5,4,0.24438


Проверочные данные - недоступны участникам, используются для подсчёта метрики:

In [29]:
control = pd.read_csv("control.csv")

Таблицы control и submission отличаются только последним столбцом x1. В таблице control в этом столбце содержатся истинные значения связей x1.

In [29]:
control[["ego_id", "u", "v"]].equals(submission[["ego_id", "u", "v"]])

NameError: name 'control' is not defined

Таблица submission отсортирована по возрастанию ego_id, u, v:

In [9]:
submission.equals(submission.sort_values(["ego_id", "u", "v"]))

True

# Модель

In [57]:
corrmatrix = test.corr()
corrmatrix

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
ego_id,1.0,0.001169,0.003036,-0.012366,0.000762,0.000752,0.003607
u,0.001169,1.0,0.216931,-0.062228,0.011462,0.002271,-0.064409
v,0.003036,0.216931,1.0,-0.088513,-0.020211,-0.022287,-0.021533
t,-0.012366,-0.062228,-0.088513,1.0,-0.191695,-0.074891,-0.026805
x1,0.000762,0.011462,-0.020211,-0.191695,1.0,0.681151,0.089145
x2,0.000752,0.002271,-0.022287,-0.074891,0.681151,1.0,0.136583
x3,0.003607,-0.064409,-0.021533,-0.026805,0.089145,0.136583,1.0


Константное предсказание:

In [19]:
%%time

submission_dummy = submission.copy()
submission_dummy["x1"] = np.nanmean(test["x1"].values)

CPU times: total: 172 ms
Wall time: 274 ms


In [21]:
%%time

from tqdm import tqdm


ego_id_list = submission["ego_id"].drop_duplicates().values
for ego_id in tqdm(ego_id_list):
    submission_ego_net = submission[submission["ego_id"] == ego_id]
    test_ego_net = test[test["ego_id"] == ego_id]
    friendship = np.zeros_like(submission_ego_net["x1"].values)
    for i, (u, v) in enumerate(zip(submission_ego_net["u"], submission_ego_net["v"])):
        
        u_x1 = test_ego_net.loc[test_ego_net["u"] == u, ["v", "x1"]].dropna()
        v_x1 = test_ego_net.loc[test_ego_net["u"] == v, ["v", "x1"]].dropna()
        
        common_friends = u_x1.merge(v_x1, on="v")
        
        if common_friends.shape[0] > 0:
            friendship[i] = np.mean(common_friends.drop("v", axis=1).values)
            # friendship[i] = np.sqrt(np.mean(np.square(common_friends.drop("v", axis=1).values)))
    submission.loc[submission["ego_id"] == ego_id, "x1"] = friendship

100%|██████████| 20586/20586 [39:35<00:00,  8.67it/s] 

CPU times: total: 28min 36s
Wall time: 39min 35s





In [23]:
submission.to_csv('result.csv')

# Посчёт метрик

In [51]:
def RMSE(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

In [54]:
print("Dummy model RMSE: {}".format(RMSE(control["x1"], submission_dummy["x1"])))

Dummy model RMSE: 1.3604058861047796


In [63]:
print("Baseline model RMSE: {}".format(RMSE(control["x1"], submission["x1"])))

Baseline model RMSE: 1.353040933001075
