In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import matplotlib.pyplot as plt
import seaborn as sns

# Make the sibling "Modules" directory (next to the current working
# directory) importable. The membership check keeps sys.path free of duplicate
# entries when this cell is executed more than once.
module_path = (Path().resolve().parent / "Modules")
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# NOTE(review): IPAexGothic is presumably chosen so Japanese labels render in
# plots — confirm the font is installed on the machine running the notebook.
plt.rcParams["font.family"] = "IPAexGothic"

import my_modules, model_tuner, features  # project-local (hand-written) modules

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the two source CSV files; both are decoded as Shift-JIS.
df = pd.read_csv(
    "../Data/train_data_tmp.csv",
    encoding="shift-jis",
)
odds_df = pd.read_csv(
    "../Data/Time_Series_Odds_win_odds.csv",
    encoding="shift-jis",
)

In [3]:
# Run the two project preprocessing stages in order:
# my_modules.preprocessing first, then my_modules.common_process on its result.
df = my_modules.common_process(my_modules.preprocessing(df))

  df["place_num"] = df["place"].replace(place_dict).astype(int)


In [4]:
# Print column names, non-null counts and dtypes of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 252411 entries, 252634 to 0
Data columns (total 67 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   race_id              252411 non-null  int64         
 1   year                 252411 non-null  int64         
 2   month                252411 non-null  int64         
 3   day                  252411 non-null  int64         
 4   times                252411 non-null  int64         
 5   place                252411 non-null  object        
 6   daily                252411 non-null  object        
 7   race_num             252411 non-null  int64         
 8   horse                252411 non-null  object        
 9   jockey_id            252411 non-null  object        
 10  trainer_id           252411 non-null  int64         
 11  horse_N              252411 non-null  object        
 12  waku_num             252411 non-null  int64         
 13  horse_num          

### node2vecを試してみる

In [57]:
import networkx as nx
from node2vec import Node2Vec      # pip install node2vec

DIM = 32  # embedding size; passed to Node2Vec(dimensions=...) and add_graph_embedding(dim=...)

# --- 1)  Build the pedigree graph  ----------------------------
def build_pedigree_graph(df):
    """Build a directed pedigree graph from one row per horse.

    Rows are de-duplicated on ``horse`` (the first occurrence is kept). Every
    edge points from an ancestor to its descendant and carries a ``rel``
    attribute:

    * ``father -> horse``          (``rel="father"``)
    * ``mother -> horse``          (``rel="mother"``)
    * ``broodmare_sire -> mother`` (``rel="bloodmare_sire"``)

    Missing values (NaN / None) are never inserted into the graph: rows
    without a horse name are skipped, and a broodmare sire whose mother is
    unknown is added as a node but left unconnected, because there is no
    mother node to attach it to.

    Args:
        df: DataFrame with the columns ``horse``, ``father``, ``mother`` and
            ``broodmare_sire``. It is not modified.

    Returns:
        The populated ``nx.DiGraph``.
    """
    # drop_duplicates returns a new frame, so no defensive copy is needed.
    unique_horses = df.drop_duplicates(subset="horse")
    g = nx.DiGraph()  # directed graph

    # Iterating the four columns directly avoids building a Series per row.
    pedigree = zip(
        unique_horses["horse"],
        unique_horses["father"],
        unique_horses["mother"],
        unique_horses["broodmare_sire"],
    )
    for h, f, m, bm in pedigree:
        if pd.isna(h):
            # A row without a horse name has nothing to anchor its edges to.
            continue
        g.add_node(h)
        # add_edge creates any endpoint that is not in the graph yet.
        if pd.notna(f):
            g.add_edge(f, h, rel="father")
        if pd.notna(m):
            g.add_edge(m, h, rel="mother")
        if pd.notna(bm):
            g.add_node(bm)
            if pd.notna(m):
                # NOTE(review): the label spelling "bloodmare_sire" is kept
                # unchanged in case other code reads this edge attribute.
                g.add_edge(bm, m, rel="bloodmare_sire")
    return g


# --- 2)  Node2vec  -------------------------------------------
#  * No "directed" flag is passed; G is built as a DiGraph whose edges run
#    parent -> child, so the walks presumably follow that direction
#    (confirm against the node2vec package).
G = build_pedigree_graph(df)
n2v = Node2Vec(
    G,
    dimensions=DIM,    # size of each node vector
    walk_length=30,
    num_walks=30,
    workers=4,
    p=1,
    q=0.5,
    weight_key=None,
)

# NOTE(review): no random seed is set anywhere in this cell, so the embedding
# is presumably not reproducible from run to run — confirm whether that matters.
model = n2v.fit(
    window=5,
    min_count=1,
    batch_words=256,
)


# --- 3)  Attach the embedding to the DataFrame  ---------------
def add_graph_embedding(df, model, horse_col="horse", dim=32):
    """Append one embedding column per dimension to ``df``.

    Each row's value in ``horse_col`` is looked up in ``model.wv``; values
    that are not present get an all-zero vector.

    Args:
        df: Input DataFrame. It is not modified.
        model: Object exposing ``wv``, which must support ``key in wv`` and
            ``wv[key]`` returning a length-``dim`` vector.
        horse_col: Name of the column holding the lookup keys.
        dim: Embedding dimensionality; must equal the length of the vectors
            stored in ``model.wv``.

    Returns:
        ``(frame, new_cols)``: ``frame`` is ``df`` with its index reset to
        0..n-1 and the columns ``pedigree_g2v_0`` .. ``pedigree_g2v_{dim-1}``
        appended; ``new_cols`` is the list of those column names.

    Raises:
        ValueError: if the looked-up vectors do not have length ``dim``.
    """
    wv = model.wv
    missing = np.zeros(dim)  # shared placeholder; np.array below copies it
    rows = [wv[h] if h in wv else missing for h in df[horse_col]]

    # Reshaping pins the (n_rows, dim) shape: an empty frame yields a (0, dim)
    # matrix instead of a 1-D array that the DataFrame constructor rejects,
    # and a dim that disagrees with the model fails here.
    vec = np.array(rows).reshape(len(df), dim)

    new_cols = [f"pedigree_g2v_{i}" for i in range(dim)]
    embedding = pd.DataFrame(vec, columns=new_cols)
    # The index is reset so both frames align positionally in the concat.
    return pd.concat([df.reset_index(drop=True), embedding], axis=1), new_cols

# Attach the DIM-dimensional vectors to df; df_emb comes back with a fresh
# 0..n-1 index and graph_cols lists the names of the added columns.
df_emb, graph_cols = add_graph_embedding(df, model, dim=DIM)

Computing transition probabilities:   0%|          | 0/46470 [00:00<?, ?it/s]

In [58]:
# Cosine similarity between the pedigree embeddings of two named horses.
emb_cols = [c for c in df_emb.columns if c.startswith("pedigree_g2v")]

# Every row of a given horse carries the same vector, so the first match is used.
horse1 = df_emb.loc[df_emb["horse"] == "ミスビアンカ", emb_cols].to_numpy()[0]
horse2 = df_emb.loc[df_emb["horse"] == "イクイノックス", emb_cols].to_numpy()[0]

norm_product = np.linalg.norm(horse1, ord=2) * np.linalg.norm(horse2, ord=2)
cosine_sim = np.dot(horse1, horse2) / norm_product
print(cosine_sim)

0.1514917
