# インポート

In [5]:
import pandas as pd
from pathlib import Path

# Shared data lives two directories above this notebook, under common/data.
COMMON_DATA_DIR = Path("..") / ".." / "common" / "data"
# Raw scraped tables are kept in the "rawdf" subdirectory.
RAWDF_DIR = COMMON_DATA_DIR.joinpath("rawdf")

# データ加工

## レース結果テーブルの加工

In [41]:
# Load the raw race-results table; the file is tab-separated despite its .csv extension.
df = pd.read_csv(RAWDF_DIR.joinpath("results.csv"), delimiter="\t")

In [43]:
# Preview the columns at positions 10-19 of the results table.
df.iloc[:, 10:20]

Unnamed: 0,ﾀｲﾑ指数,通過,上り,単勝,人気,馬体重,調教ﾀｲﾑ,厩舎ｺﾒﾝﾄ,備考,調教師
0,**,2-2,34.3,1.2,1.0,452(-4),,,,[東] 鈴木伸尋
1,**,1-1,34.5,4.1,2.0,454(+2),,,,[東] 杉浦宏昭
2,**,5-4,34.5,59.9,6.0,438(-6),,,,[西] 羽月友彦
3,**,3-3,34.9,16.6,3.0,450(+2),,,,[西] 武幸四郎
4,**,8-8,34.5,23.9,5.0,434(-10),,,,[西] 今野貞一
...,...,...,...,...,...,...,...,...,...,...
47667,**,11-11-8-7,38.8,62.8,11.0,394(+2),,,,[西] 松永幹夫
47668,**,5-5-10-11,38.9,125.6,12.0,470(-4),,,,[西] 安田隆行
47669,**,3-3-4-5,39.9,9.8,5.0,496(+8),,,,[西] 森田直行
47670,**,2-2-4-7,40.0,37.2,10.0,440(-2),,,,[西] 大根田裕


In [11]:
# Inspect the raw 着順 (finishing order) column before converting it to numbers.
df["着順"]

0         1
1         2
2         3
3         4
4         5
         ..
47667     8
47668     9
47669    10
47670    11
47671    12
Name: 着順, Length: 47672, dtype: object

In [16]:
# Parse 着順 into a numeric "rank" column; errors="coerce" turns values that
# cannot be converted into missing values (NaN) instead of raising.
df["rank"] = pd.to_numeric(df["着順"], errors="coerce")
# Tally every rank, keeping NaN in the count to see how many rows failed to parse.
df["rank"].value_counts(dropna=False)

rank
3.0     3460
1.0     3459
2.0     3456
4.0     3456
5.0     3456
6.0     3443
7.0     3424
8.0     3367
9.0     3259
10.0    3101
11.0    2895
12.0    2637
13.0    2342
14.0    2047
15.0    1708
16.0    1279
NaN      399
17.0     280
18.0     204
Name: count, dtype: int64

In [None]:
# Drop the rows whose finishing position could not be parsed (rank is NaN).
# Rebinding the result instead of using inplace=True avoids mutating the frame
# in place across cells and keeps the step usable in a method chain.
df = df.dropna(subset=["rank"])

In [19]:
# Re-tally the ranks (NaN included) to confirm the missing rows are gone.
df["rank"].value_counts(dropna=False)

rank
3.0     3460
1.0     3459
2.0     3456
4.0     3456
5.0     3456
6.0     3443
7.0     3424
8.0     3367
9.0     3259
10.0    3101
11.0    2895
12.0    2637
13.0    2342
14.0    2047
15.0    1708
16.0    1279
17.0     280
18.0     204
Name: count, dtype: int64

In [24]:
# Check that 馬番 (horse number) casts to int; the result is displayed only, not stored.
df["馬番"].astype(int)

0        5
1        8
2        6
3        4
4        1
        ..
47667    1
47668    7
47669    2
47670    4
47671    5
Name: 馬番, Length: 47273, dtype: int64

In [30]:
# Tally the first character of 性齢 (sex + age), i.e. the sex marker.
df["性齢"].str[0].value_counts()

性齢
牡    25038
牝    19922
セ     2313
Name: count, dtype: int64

ラベルエンコーディング：カテゴリ変数を整数にマッピングするデータ加工手法

In [32]:
# Label-encoding table for the sex character that prefixes 性齢.
sex_mapping = {
    "牡": 0,  # male
    "牝": 1,  # female
    "セ": 2,  # gelding
}

In [33]:
# Apply the label encoding to the sex character and tally the encoded values
# (displayed only; nothing is written back to df here).
df["性齢"].str[0].map(sex_mapping).value_counts()

性齢
0    25038
1    19922
2     2313
Name: count, dtype: int64

In [36]:
# Everything after the first character of 性齢 is the age; cast it to int
# (displayed only; nothing is written back to df here).
df["性齢"].str[1:].astype(int)

0        2
1        2
2        2
3        2
4        2
        ..
47667    3
47668    5
47669    3
47670    4
47671    3
Name: 性齢, Length: 47273, dtype: int64

In [37]:
# Inspect the 斤量 (carried weight) column.
df["斤量"]

0        55.0
1        55.0
2        55.0
3        55.0
4        55.0
         ... 
47667    53.0
47668    55.0
47669    55.0
47670    55.0
47671    52.0
Name: 斤量, Length: 47273, dtype: float64

In [48]:
# Overwrite 単勝 (win odds) with its numeric form; any entry that cannot be
# parsed becomes NaN because of errors="coerce".
df["単勝"] = pd.to_numeric(df["単勝"], errors="coerce")

In [51]:
# Inspect the converted 単勝 column.
df["単勝"]

0          1.2
1          4.1
2         59.9
3         16.6
4         23.9
         ...  
47667     62.8
47668    125.6
47669      9.8
47670     37.2
47671      6.2
Name: 単勝, Length: 47672, dtype: float64