In [59]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# Загрузка данных

В бейзлайне реализована простейшая модель на эго-графе.

Для каждой пары вершин u и v эго-графа попытаемся найти всех общих "друзей" w. Силой связи между вершинами u и v будем считать среднюю силу связи между ними и общими друзьями.

Поскольку обучение для такой модели не требуется, будем пользоваться только тестовой выборкой.

In [60]:
%%time

# Edge table that the correlation, constant-prediction and model cells read from.
# NOTE(review): the variable is called `test` but is loaded from "train.csv" — confirm which file is intended.
test = pd.read_csv("train.csv")
# Rows (ego_id, u, v) whose x1 column the model cell overwrites with predictions.
submission = pd.read_csv("submission.csv")


CPU times: total: 33.5 s
Wall time: 45.4 s


In [61]:
# Preview the first five rows (the default of `head`) of the submission template.
submission.head(n=5)

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.0
1,8,0,143,0.0
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857


Проверочные данные недоступны участникам и используются только для подсчёта метрики:

In [29]:
# Hold-out table whose x1 column the metric cells compare predictions against.
control = pd.read_csv("control.csv")

Таблицы control и submission отличаются только последним столбцом x1. В таблице control в этом столбце содержатся истинные значения связей x1.

In [29]:
# The identifying columns must agree between the hold-out table and the submission.
key_columns = ["ego_id", "u", "v"]
control[key_columns].equals(submission[key_columns])

NameError: name 'control' is not defined

Таблица submission отсортирована по возрастанию ego_id, u, v:

In [9]:
# True only if sorting by (ego_id, u, v) leaves the table exactly as it is.
sort_keys = ["ego_id", "u", "v"]
submission.equals(submission.sort_values(by=sort_keys))

True

# Модель

In [62]:
# Pairwise Pearson correlation (the pandas default method) between the columns of `test`.
corrmatrix = test.corr(method="pearson")
corrmatrix

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
ego_id,1.0,0.001828,0.00184,0.00142,0.000228,-0.001061,-0.000929
u,0.001828,1.0,0.21833,-0.065169,0.011669,0.001498,-0.064033
v,0.00184,0.21833,1.0,-0.092008,-0.019556,-0.022957,-0.022212
t,0.00142,-0.065169,-0.092008,1.0,-0.190351,-0.07375,-0.02547
x1,0.000228,0.011669,-0.019556,-0.190351,1.0,0.67887,0.08936
x2,-0.001061,0.001498,-0.022957,-0.07375,0.67887,1.0,0.136608
x3,-0.000929,-0.064033,-0.022212,-0.02547,0.08936,0.136608,1.0


Константное предсказание:

In [63]:
%%time

submission_dummy = submission.copy()
submission_dummy["x1"] = np.nanmean(test["x1"].values)

CPU times: total: 312 ms
Wall time: 442 ms


In [64]:
%%time

from tqdm import tqdm


ego_id_list = submission["ego_id"].drop_duplicates().values
for ego_id in tqdm(ego_id_list):
    submission_ego_net = submission[submission["ego_id"] == ego_id]
    test_ego_net = test[test["ego_id"] == ego_id]
    friendship = np.zeros_like(submission_ego_net["x1"].values)
    for i, (u, v) in enumerate(zip(submission_ego_net["u"], submission_ego_net["v"])):
        
        u_x1 = test_ego_net.loc[test_ego_net["u"] == u, ["v", "x2"]].dropna()
        v_x1 = test_ego_net.loc[test_ego_net["u"] == v, ["v", "x2"]].dropna()
        
        common_friends = u_x1.merge(v_x1, on="v")
        
        if common_friends.shape[0] > 0:
            friendship[i] = np.mean(common_friends.drop("v", axis=1).values)
            # friendship[i] = np.sqrt(np.mean(np.square(common_friends.drop("v", axis=1).values)))
    submission.loc[submission["ego_id"] == ego_id, "x1"] = friendship

 10%|▉         | 2057/20586 [04:37<41:41,  7.41it/s]  


KeyboardInterrupt: 

In [23]:
# Persist the predictions; with default arguments the DataFrame index is written as the first CSV column.
submission.to_csv('result.csv')

# Подсчёт метрик

In [51]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between targets and predictions.

    Computed with NumPy instead of ``mean_squared_error(..., squared=False)``:
    the ``squared`` keyword was deprecated in scikit-learn 1.4 and removed in
    1.6, so that call raises ``TypeError`` on current releases.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground-truth and predicted values. They are compared by position
        (any pandas index is ignored) and must have identical shapes.

    Returns
    -------
    float
        The RMSE; for 2-D input, the unweighted mean of the per-column RMSEs.

    Raises
    ------
    ValueError
        If the shapes differ, the input is empty, or it contains NaN/inf.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred have different shapes: {} vs {}".format(
                y_true.shape, y_pred.shape
            )
        )
    if y_true.size == 0:
        raise ValueError("RMSE is undefined for empty input")
    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
        raise ValueError("Input contains NaN or infinity")
    # Reduce over the sample axis first so multi-output input yields one RMSE
    # per column, then average those uniformly.
    per_output = np.sqrt(np.mean(np.square(y_true - y_pred), axis=0))
    return float(np.mean(per_output))

In [54]:
# Score the constant prediction against the hold-out x1 values.
dummy_rmse = RMSE(control["x1"], submission_dummy["x1"])
print("Dummy model RMSE: {}".format(dummy_rmse))

Dummy model RMSE: 1.3604058861047796


In [63]:
# Score the common-neighbour predictions against the hold-out x1 values.
baseline_rmse = RMSE(control["x1"], submission["x1"])
print("Baseline model RMSE: {}".format(baseline_rmse))

Baseline model RMSE: 1.353040933001075
