In [211]:
import pandas as pd
import re

def _split_sex_and_age(value, sex_codes):
    """Return ``(sex_code, age_text)`` parsed from one '性齢' cell.

    The first character is looked up in ``sex_codes`` and the first run of
    digits is returned as a string. Either element is ``None`` when it cannot
    be determined (non-string cell, empty string, unknown sex character, or
    no digits).
    """
    if not isinstance(value, str) or not value:
        return None, None
    digits = re.search(r'\d+', value)
    return sex_codes.get(value[0]), digits.group() if digits else None


def mapping(df):
    """
    Encode the text columns of a DataFrame as integer codes.

    For each of the following columns that exists in ``df``, new columns are
    added to ``df`` in place (the source columns are kept):

    * '性齢'     -> '性' (牝=0, 牡=1, セ=2, taken from the first character) and
                   '齢' (the first run of digits, kept as a string).
    * '天気'     -> '天気index'     (晴=0, 曇=1, 小雨=2, 小雪=3, 雨=4, 雪=5)
    * 'レース場' -> 'レース場index' (芝=0, ダート=1)
    * '場の状態' -> '場の状態index' (良=0, 稍重=1, 重=2, 不良=3)

    A value that is not in the corresponding table becomes a missing value
    (NaN/None) in the new column, so the new column always has one entry per
    row.

    Input: pandas.DataFrame (modified in place)

    Output: pandas.DataFrame (the same object that was passed in)
    """
    sex_codes = {'牝': 0, '牡': 1, 'セ': 2}
    category_codes = {
        '天気': {'晴': 0, '曇': 1, '小雨': 2, '小雪': 3, '雨': 4, '雪': 5},
        'レース場': {'芝': 0, 'ダート': 1},
        '場の状態': {'良': 0, '稍重': 1, '重': 2, '不良': 3},
    }

    # Walk a snapshot of the labels: columns are added during the loop, and
    # walking in frame order keeps the new columns in a predictable order.
    for column in list(df.columns):
        if column == '性齢':
            parsed = [_split_sex_and_age(value, sex_codes) for value in df[column]]
            df['性'] = [sex for sex, _ in parsed]
            df['齢'] = [age for _, age in parsed]
        elif column in category_codes:
            # Series.map leaves NaN for values missing from the table.
            df[f'{column}index'] = df[column].map(category_codes[column])

    return df

def jockey_recent_score(jockey_df, year):
    """
    Summarise every jockey's recent results as of ``year``.

    ``jockey_df`` is grouped by its index (one group per jockey id). Each row
    is expected to carry a season label under '年度' and counts under '1着',
    '2着', '3着' and '着外'; the first sub-column under each of those labels
    is the one that is read. The season labels are compared as strings.
    NOTE(review): the last row of each jockey's group is assumed to hold the
    earliest season, and its label must be parseable by ``int`` — confirm
    against the scraper that builds this frame.

    Input:
        jockey_df: pandas.DataFrame indexed by jockey id.
        year: int, the reference season.

    Output:
        dict mapping jockey id to
        ``{'単勝': win rate, '複勝': top-3 rate, '経験': number of runs}``.
        Both rates cover the seasons ``year - 2`` .. ``year`` and are rounded
        to 3 decimals; '経験' counts runs from the jockey's first season up
        to ``year``. Jockeys with no runs in the recent three seasons are
        omitted.

    Unexpected data errors (e.g. non-numeric counts) propagate to the caller
    instead of silently truncating the result.
    """
    recent_years = {str(year), str(year - 1), str(year - 2)}
    recent_score = {}

    for jockey_id, df in jockey_df.groupby(jockey_df.index):
        # .iloc is used throughout: plain [0] on a non-integer index relies on
        # a positional fallback that pandas has deprecated.
        first_year = int(df['年度']['年度'].iloc[-1])
        career_years = {str(season) for season in range(first_year, year + 1)}

        recent_runs = 0
        recent_places = 0
        recent_wins = 0
        career_runs = 0
        for _, row in df.iterrows():
            season = row['年度'].iloc[0]
            is_recent = season in recent_years
            is_career = season in career_years
            if not (is_recent or is_career):
                continue

            wins = row['1着'].iloc[0]
            places = wins + row['2着'].iloc[0] + row['3着'].iloc[0]
            starts = places + row['着外'].iloc[0]

            # Win / top-3 rates are based on the latest three seasons only.
            if is_recent:
                recent_runs += starts
                recent_places += places
                recent_wins += wins
            if is_career:
                career_runs += starts

        # No recent runs means no meaningful rate. Checking explicitly also
        # covers NumPy integers, whose division by zero does not raise.
        if recent_runs == 0:
            continue

        recent_score[jockey_id] = {
            '単勝': round(recent_wins / recent_runs, 3),
            '複勝': round(recent_places / recent_runs, 3),
            '経験': career_runs,
        }
    return recent_score

def horse_recent_score(horse_df, year):
    """
    Placeholder for the per-horse counterpart of the jockey score.

    The original definition stopped after opening a loop over
    ``horse_df.groupby(horse_df.index)`` with an empty body, which is a syntax
    error that prevents the whole cell from being parsed. Until the real
    computation is written, the function fails loudly instead.

    Input:
        horse_df: pandas.DataFrame (presumably indexed by horse id — TODO confirm).
        year: int, the reference season.

    Raises:
        NotImplementedError: always.
    """
    # TODO: build a {horse_id: score} dict, one entry per group of
    # horse_df.groupby(horse_df.index).
    raise NotImplementedError('horse_recent_score is not implemented yet')

def victory_ratio_to_race_results(race_df, jockey_df, horse_df=None):
    '''
    Add each jockey's recent form and age to the race results.

    The race rows are processed one '開催年' (race year) at a time. For every
    year, jockey_recent_score(jockey_df, year) is evaluated once and its
    values are attached to the rows of that year in four new columns:

    * '騎手直近単勝率' - recent win rate
    * '騎手直近複勝率' - recent top-3 rate
    * '騎手経験値'     - number of runs up to that year
    * '騎手年齢'       - race year minus the jockey's '生年' (birth year)

    The jockey id of a row is str(row['騎手id']) left-padded with zeros to
    5 characters. Rows whose jockey has no score for that year get None in
    all four columns.

    Input1: pd.DataFrame (race results; needs '開催年' and '騎手id')
    Input2: pd.DataFrame (jockey results, indexed by jockey id; needs '生年')
    Input3: pd.DataFrame or None (horse data; accepted for compatibility but
            not used yet)

    Output: pd.DataFrame (new frame, the per-year pieces concatenated in
            ascending year order; race_df itself is not modified)
    '''
    new_columns = ['騎手直近単勝率', '騎手直近複勝率', '騎手経験値', '騎手年齢']
    birth_years = {}  # jockey id -> birth year, so jockey_df is scanned once per jockey
    yearly_frames = []

    for year, df in race_df.groupby('開催年'):
        # Evaluate the (expensive) score table a single time per year.
        jockey_ratio = jockey_recent_score(jockey_df, year)
        tanshou_ratio = []
        fukushou_ratio = []
        run_experiences = []
        jockey_old = []

        # Iterating the column keeps the ids' own dtype; iterrows() would
        # upcast them to float if the frame holds any float column.
        for raw_jockey_id in df['騎手id']:
            str_jockey_id = str(raw_jockey_id).zfill(5)
            score = jockey_ratio.get(str_jockey_id)
            if score is None:
                tanshou_ratio.append(None)
                fukushou_ratio.append(None)
                run_experiences.append(None)
                jockey_old.append(None)
                continue

            # Only scored jockeys reach this point, and those are guaranteed
            # to be present in jockey_df, so the lookup cannot come up empty.
            if str_jockey_id not in birth_years:
                birth_years[str_jockey_id] = (
                    jockey_df.loc[jockey_df.index == str_jockey_id]['生年'].iloc[0]
                )
            tanshou_ratio.append(score['単勝'])
            fukushou_ratio.append(score['複勝'])
            run_experiences.append(score['経験'])
            jockey_old.append(year - birth_years[str_jockey_id])

        # Work on a copy so the assignment never writes into a view of race_df.
        df = df.copy()
        df['騎手直近単勝率'] = tanshou_ratio
        df['騎手直近複勝率'] = fukushou_ratio
        df['騎手経験値'] = run_experiences
        df['騎手年齢'] = jockey_old
        yearly_frames.append(df)

    if not yearly_frames:
        # pd.concat refuses an empty list; return an empty frame that still
        # has the expected columns.
        empty = race_df.iloc[0:0].copy()
        for column in new_columns:
            empty[column] = None
        return empty

    return pd.concat(yearly_frames)
    
# Compute the results of jockeys who rode in at least 100 races and embed
# them into the race data.
# NOTE: pd.read_pickle can execute arbitrary code; only load pickle files
# produced by this project.

df = pd.read_pickle('../DATA/race_results_of_5_years.pkl')
jockey_count = df.groupby('騎手id').size()
jockey_count_over_100 = jockey_count.loc[jockey_count >= 100]
new_df = df.loc[df['騎手id'].isin(jockey_count_over_100.index)].copy()
# Encode sex/age, weather, etc. as numbers at this point.
new_df = mapping(new_df)

jockey_df = pd.read_pickle('../DATA/jockey_results_of_5_years.pkl')
# The race year is the first 4-digit run of the race id used as the index.
# str() keeps this working when the ids are stored as integers.
race_year_list = [
    int(re.search(r'\d{4}', str(race_id)).group()) for race_id in new_df.index
]
new_df['開催年'] = race_year_list

# The function takes three frames; no horse data is loaded in this cell, so
# None is passed explicitly for the third one.
df_for_learning = victory_ratio_to_race_results(new_df, jockey_df, None)

# Label the top three finishers. Finishing positions that are not plain
# numbers become NaN and are labelled as losers, while numeric strings such
# as '1' are compared by value instead of being treated as losers.
finishing_position = pd.to_numeric(df_for_learning['着順'], errors='coerce')
judge = [
    'victory' if position <= 3 else "I'm a loser."
    for position in finishing_position
]
df_for_learning['勝敗'] = judge

df_for_learning.to_pickle('../DATA/recent_5_race_df_for_learning.pkl')
# The frame can be split back into per-race frames (as before the concat)
# by grouping on the index:
# for race_id, df in new_df.groupby(new_df.index):


In [172]:
import pandas as pd

# Small experiment: add a column built from a per-row condition.
data = {
    '名前': ['Alice', 'Bob', 'Charlie'],
    '年齢': [25, 30, 35],
}
df = pd.DataFrame(data)

# Independent copy of the rows whose age is above 28 (not used below).
older_than_28 = df['年齢'] > 28
new_df = df.loc[older_than_28].copy()

# Add the new column: a marker string for ages above 25, None otherwise.
list_a = ['んうぇ〜い' if age > 25 else None for age in df['年齢']]
df['謎'] = list_a

print(df)

        名前  年齢      謎
0    Alice  25   None
1      Bob  30  んうぇ〜い
2  Charlie  35  んうぇ〜い


In [200]:
import pandas as pd

# Check how to read one jockey's birth year ('生年') from the jockey results.
jockey_df = pd.read_pickle('../DATA/jockey_results_of_5_years.pkl')
# .iloc[0] takes the first matching row explicitly; plain [0] on this
# string-indexed Series relies on a deprecated positional fallback.
print((jockey_df.loc[jockey_df.index == '01115'])['生年'].iloc[0])

1988


In [205]:
import pandas as pd

# Load the saved learning dataset and display its jockey id column.
df = pd.read_pickle('../DATA/recent_5_race_df_for_learning.pkl')
print(df.loc[:, '騎手id'])

202001010101    1170
202001010101    5339
202001010101    1032
202001010101    1176
202001010101    1116
                ... 
202410020812    1168
202410020812    1208
202410020812    1216
202410020812    1025
202410020812    1201
Name: 騎手id, Length: 213090, dtype: int64
