In [1]:
#%cd C:/Users/Mathieu/Desktop/Projets/Benter
%cd /home/mathieu/Prose/Mathieu/Benter-Project

/home/mathieu/Prose/Mathieu/Benter-Project


In [2]:
%env ENVIRONMENT=development

env: ENVIRONMENT=development


In [3]:
from models.horse import Horse
from models.runner import Runner
from models.race import Race
from database.setup import create_sqlalchemy_session
import tqdm
import random
import re
from tabulate import tabulate
import os
import json
import datetime as dt
from typing import List
import sqlalchemy as sa
from collections import Counter
from  sqlalchemy.sql.expression import func

from utils.music import parse_music, parse_unibet_music
from utils.logger import setup_logger
from constants import UnibetHorseSex

# Number of horses fetched per database round-trip. Kept as an int (not the
# float 1e5) because it is used as a SQL LIMIT, compared with len() and handed
# to tqdm as `total` (a float there renders as "100000/100000.0").
BATCH_SIZE = 100_000
# Folder holding the Unibet JSON data files read by the appendix cells.
UNIBET_DATA_PATH = "./data/Unibet"
# Open a session and fetch a single Horse row.
# NOTE(review): presumably a connectivity check -- `horse` is reassigned by the
# audit loop further down before any visible cell reads it; confirm.
with create_sqlalchemy_session() as db_session:
    horse=db_session.query(Horse).first()
    
# Logger for this notebook, built by the project's setup_logger helper.
logger = setup_logger(__name__)
    

In [4]:
def is_wrong_age_sequence(ages:List[int])->bool:
    """Tell whether the known ages ever decrease along the sequence.

    ``None`` entries are dropped first; the remaining ages are expected to be
    in non-decreasing order.

    Returns:
        True when at least one age is smaller than the one right before it.
    """
    known_ages = [value for value in ages if value is not None]
    return any(
        earlier > later
        for earlier, later in zip(known_ages, known_ages[1:])
    )

# Ordered (earlier sex, later sex) pairs that are treated as forbidden when
# they appear back to back in a sequence of sexes.
forbidden_sex_transitions = [
    (UnibetHorseSex.MALE, UnibetHorseSex.FEMALE),
    (UnibetHorseSex.FEMALE, UnibetHorseSex.MALE),
    (UnibetHorseSex.GELDING, UnibetHorseSex.FEMALE),
    (UnibetHorseSex.GELDING, UnibetHorseSex.MALE),
    (UnibetHorseSex.FEMALE, UnibetHorseSex.GELDING),
]

# Regex alternation made of the concatenated enum values of each forbidden pair.
forbidden_sex_transitions_regex = r"|".join(
    f"{before.value}{after.value}"
    for before, after in forbidden_sex_transitions
)

def is_wrong_sex_sequence(sexes:List[UnibetHorseSex])->bool:
    """Tell whether the sequence of sexes contains a forbidden transition.

    ``UnibetHorseSex.UNKNOWN`` entries are ignored. The values of the
    remaining entries are concatenated into one string, which is searched with
    ``forbidden_sex_transitions_regex``.

    Returns:
        True when the concatenated values match a forbidden pair, False
        otherwise (including when no known sex is left).
    """
    known_values = [
        entry.value for entry in sexes if entry != UnibetHorseSex.UNKNOWN
    ]
    if len(known_values) == 0:
        return False
    match = re.search(forbidden_sex_transitions_regex, "".join(known_values))
    return match is not None
    

def is_wrong_stake_sequence(stakes:List[int])->bool:
    """Tell whether the known stakes ever decrease along the sequence.

    ``None`` entries are dropped first; the remaining stakes are expected to
    be in non-decreasing order.

    Returns:
        True when at least one stake is smaller than the one right before it.
    """
    known_stakes = list(filter(lambda amount: amount is not None, stakes))
    for previous, current in zip(known_stakes, known_stakes[1:]):
        if current < previous:
            return True
    return False

        
        

In [5]:
# Audit every horse in the database, batch by batch (pagination on Horse.id),
# and collect the ids of the horses whose runner history fails a check.
start_id = 0
# BATCH_SIZE may be defined as a float (e.g. 1e5); the SQL LIMIT, the len()
# comparison and the tqdm total all want a real integer.
batch_size = int(BATCH_SIZE)

wrong_horse_ids_age = set()
wrong_horse_ids_sex = set()
wrong_horse_ids_stake = set()
# Stays empty for now: the music check is not wired in yet.
# TODO: if is_wrong_music_sequence(runners=horse.runners): add horse.id here.
wrong_horse_ids_music = set()
n_seen_horses = 0


def _record_horse_inconsistencies(horse) -> None:
    """Add ``horse.id`` to each ``wrong_horse_ids_*`` set whose check fails."""
    runners = horse.runners
    if is_wrong_age_sequence(ages=[runner.age for runner in runners]):
        wrong_horse_ids_age.add(horse.id)

    if is_wrong_sex_sequence(sexes=[runner.sex for runner in runners]):
        wrong_horse_ids_sex.add(horse.id)

    if is_wrong_stake_sequence(stakes=[runner.stakes for runner in runners]):
        wrong_horse_ids_stake.add(horse.id)


with create_sqlalchemy_session() as db_session:
    while True:
        horses = (
            db_session.query(Horse)
            .filter(Horse.id > start_id)
            .order_by(Horse.id.asc())
            .limit(batch_size)
            .all()
        )
        # total is the real batch length, so a short batch still ends at 100%.
        for horse in tqdm.tqdm(horses, desc=f'{start_id}', total=len(horses)):
            _record_horse_inconsistencies(horse)
        n_seen_horses += len(horses)

        # A batch shorter than the limit means the table is exhausted.
        if len(horses) < batch_size:
            break
        # Resume right after the last id seen.
        start_id = horses[-1].id

0: 100%|██████████| 100000/100000.0 [01:28<00:00, 1126.78it/s]
100003: 100%|██████████| 100000/100000 [01:31<00:00, 1097.43it/s]
200003: 100%|██████████| 100000/100000 [01:42<00:00, 974.50it/s]
300003: 100%|██████████| 100000/100000 [01:34<00:00, 1062.10it/s]
400003: 100%|██████████| 79993/79993 [00:56<00:00, 1421.50it/s]


In [7]:
n_seen_horses

479993

In [8]:
len(wrong_horse_ids_age)

1195

In [9]:
len(wrong_horse_ids_sex)

331

In [10]:
len(wrong_horse_ids_stake)

29313

In [11]:
len(wrong_horse_ids_music)

0

In [None]:
# Look at 

In [1]:
# Appendix

In [2]:
import logging
from collections import Counter
import datetime as dt
from typing import Optional, Generator
from constants import UNIBET_MIN_DATE
from tqdm import tqdm
import os 
import json

# Standard-library logger for this cell (rebinds the name `logger`).
logger = logging.getLogger(__name__)
# Folder expected to contain one sub-folder per ISO date.
UNIBET_DATA_PATH = "./data/Unibet"

# Occurrences of each coat value read from the race files ("details" -> "coat").
coats=Counter()
# Occurrences of each coat value read from the runner statistics files ("robe").
coats_info = Counter()
# Occurrences of each (race-file coat, statistics-file coat) pair.
co_coats=Counter()
def date_countdown_generator(
    start_date: dt.date, end_date: Optional[dt.date]
) -> Generator[dt.date, None, None]:
    """Yield each day from ``start_date`` up to ``end_date``, both included.

    Despite the name, days are produced in ascending order. A falsy
    ``end_date`` (e.g. ``None``) means "today". Nothing is yielded when
    ``start_date`` is after the end date.
    """
    last_day = end_date if end_date else dt.date.today()
    one_day = dt.timedelta(days=1)
    day = start_date
    while True:
        if day > last_day:
            return
        yield day
        day = day + one_day
        

# Walk every stored day and count the coat values reported by the two sources:
# the race file ("details" -> "coat") and the per-runner statistics file
# ("fiche" -> "infos_generales" -> "robe").

# Evaluate "yesterday" once so the date range and the progress-bar total agree
# even if the cell runs across midnight.
last_date = dt.date.today() - dt.timedelta(days=1)
# List the data folder once instead of once per day.
available_days = set(os.listdir(UNIBET_DATA_PATH))

for date in tqdm(
    date_countdown_generator(start_date=UNIBET_MIN_DATE, end_date=last_date),
    # Both bounds are yielded, hence the +1 (otherwise the bar overruns).
    total=max((last_date - UNIBET_MIN_DATE).days + 1, 0),
    unit="days",
):
    if date.isoformat() not in available_days:
        logger.info("Could not find folder for date: %s", date.isoformat())
        continue
    day_folder_path = os.path.join(UNIBET_DATA_PATH, date.isoformat())
    programme_path = os.path.join(day_folder_path, "programme.json")
    if not os.path.isfile(programme_path):
        logger.info(
            "Could not find programme.json for date: %s", date.isoformat()
        )
        continue

    with open(programme_path, "r") as fp:
        programme = json.load(fp=fp)
    if "data" not in programme:
        logger.info("Can not import programme of %s", date.isoformat())
        continue

    horse_shows_dict = programme["data"]
    for horse_show_dict in horse_shows_dict:
        # A missing or empty "races" entry means there is nothing to read.
        for race_dict in horse_show_dict.get("races") or []:
            race_prefix = f"R{horse_show_dict['rank']}_C{race_dict['rank']}"
            race_path = os.path.join(day_folder_path, f"{race_prefix}.json")
            with open(race_path, "r") as fp:
                complete_race_dict = json.load(fp=fp)

            current_race_dict = complete_race_dict
            if complete_race_dict.get("note") == "server error, no json":
                # The detailed race file is unusable: fall back on the
                # programme's own race entry.
                current_race_dict = race_dict

            for runner_dict in current_race_dict["runners"]:
                details = runner_dict.get("details")
                coat = details.get("coat") if details else None
                coats[coat] += 1

                runner_statistics_path = os.path.join(
                    day_folder_path,
                    f"{race_prefix}_RN{runner_dict['rank']}.json",
                )
                with open(runner_statistics_path, "r") as fp:
                    runner_stats = json.load(fp=fp)

                # `or {}` also covers a "fiche" key that is present but null.
                runner_stats_info: Optional[dict] = (
                    runner_stats.get("fiche") or {}
                ).get("infos_generales")
                coat_info = (
                    runner_stats_info.get("robe") if runner_stats_info else None
                )
                coats_info[coat_info] += 1

                co_coats[(coat, coat_info)] += 1

5660days [16:58,  5.56days/s]                        


In [3]:
coats

Counter({'GR': 6809,
         'B. Fonce': 189960,
         'AL': 12396,
         'B': 33435,
         'BF': 6965,
         'B.': 1385065,
         'Al.': 475192,
         'Noi.P': 3342,
         'Noir': 114033,
         'Al. B': 823,
         'Bb.': 4774,
         'B. Fo': 5158,
         '': 379327,
         'B. Clair': 5836,
         'Al. Fonce': 6189,
         'Noi.Pan': 683,
         'BB': 4370,
         'Gr.': 91924,
         'RO': 292,
         'AB': 986,
         'AC': 48,
         'BC': 128,
         'GF': 291,
         'NO': 438,
         'B. Ce': 22,
         'Al. F': 438,
         None: 3579,
         'Bb. Fonce': 326,
         'Bb. F': 216,
         'Al. Brule': 244,
         'Al. Cuivre': 23,
         'Al. C': 151,
         'Bay': 414,
         'AF': 452,
         'Rouan': 2530,
         'B. Rubican': 36580,
         'Al. Rub.': 47,
         'Brown': 15152,
         'B. Fonc': 20,
         'BM': 79,
         'Al. Cr.Lav': 25,
         'Al': 10,
         'AM': 50,
         '

In [4]:
coats_info

Counter({'GR': 21926,
         'BF': 20836,
         'AL': 34468,
         'B': 109000,
         None: 207618,
         'Al.': 26889,
         'B.': 464628,
         'B. Fo': 13078,
         'B. FONCE': 53073,
         'Noi.P': 8486,
         'Noir': 3410,
         'B. FO': 7343,
         'Al. B': 1984,
         'Bb.': 1829,
         'Bb. F': 996,
         'B. Cl': 582,
         'B. Ru': 55,
         'AL.': 264221,
         'NOI.P': 5957,
         'Al. F': 793,
         'NOI.PAN': 70887,
         'BB': 13196,
         'RO': 1098,
         'AB': 2725,
         'Bai': 54056,
         'AC': 96,
         'NO': 1163,
         'B. Ce': 44,
         'NOIR': 17009,
         'Al. C': 416,
         'B. CLAIR': 1767,
         'BC': 512,
         'AF': 777,
         'GF': 976,
         'Gr': 5251,
         'Al': 18952,
         'B. CR': 725,
         'Al. R': 166,
         '': 905897,
         'Gr.': 202,
         'Rouan': 68,
         'b': 24293,
         'AM': 162,
         'AL. F': 648,
       

In [5]:
co_coats

Counter({('GR', 'GR'): 5958,
         ('B. Fonce', 'BF'): 14024,
         ('AL', 'AL'): 10809,
         ('B', 'B'): 29005,
         ('BF', 'BF'): 6208,
         ('B.', 'B'): 76700,
         ('B', None): 4171,
         ('Al.', 'Al.'): 26601,
         ('B.', 'B.'): 461073,
         ('Al.', 'B.'): 106,
         ('AL', 'B. Fo'): 24,
         ('B. Fonce', 'B. FONCE'): 52374,
         ('Noi.P', 'Noi.P'): 3203,
         ('Noir', 'Noir'): 3275,
         ('B. Fonce', 'B. FO'): 7204,
         ('Al. B', 'Al. B'): 772,
         ('Bb.', 'Bb.'): 1410,
         ('B. Fo', 'B. Fo'): 4873,
         ('Bb.', 'Bb. F'): 46,
         ('B.', None): 92134,
         ('', 'B.'): 2213,
         ('B. Fonce', 'B. Fo'): 7766,
         ('B. Clair', 'B. Cl'): 64,
         ('', 'B. Ru'): 18,
         ('Al.', 'AL.'): 263288,
         ('Noir', 'NOI.P'): 5909,
         ('Noir', 'Noi.P'): 4566,
         ('Al. Fonce', 'Al. F'): 54,
         ('Noi.Pan', 'Noi.P'): 650,
         ('Noir', 'NOI.PAN'): 70597,
         ('BB', 'BB'

In [13]:
list(set(coats.keys()).union(set(coats_info.keys())))

['',
 'Bai C',
 'AC',
 'BAI B',
 'BG',
 'grey-bay',
 'B. Clair',
 'BF',
 'NOIR',
 'B. CR',
 'A.',
 'ROUAN',
 'b/br',
 'AL.MELANGE',
 'B.Foncé',
 'CHOCOLAT',
 'BR',
 'AL.MEL.',
 'bb.',
 'B',
 'Noi.P',
 'AL. BRULE',
 'NO',
 'AR',
 'AL',
 'BAI',
 'AA',
 'AL. B',
 'AF',
 'BAI FONCE',
 'Noi.',
 'GR.',
 'BM',
 'GC',
 'gr',
 'B. Fonce',
 'Al. R',
 'Bay',
 'B.d',
 'Bai P',
 'Palomino',
 'R.',
 'Al. C',
 'B. Cerise',
 'AL. F',
 'dkb/br',
 'grey-blk',
 'BB. F',
 'AU',
 'GREYROAN',
 'CHOCO',
 'Chestnut',
 'Al. B',
 'Noi.Pan',
 ' Al.',
 'br',
 'Brown',
 'AB',
 'AL. CU',
 'B. CL',
 'Louve',
 'SM',
 'bai',
 'AL. P',
 'marron',
 's',
 'GRIS',
 None,
 'G.',
 'N.',
 'Bb. Fonce',
 'Chocolat',
 'Bb.',
 'B. Fo',
 'AM',
 'PALOMINO',
 'Gris',
 'Al. Clair',
 'Bl',
 'Choco',
 'grey-chest',
 'Bai',
 'grey-brown',
 'B.',
 'ro',
 'B. Foncé',
 'BAI CLAIR',
 'B. Ru',
 'Al. Aub.',
 'Al. Rub.',
 'AUB.',
 'b/gr',
 ' B.',
 'Gray',
 'BAYBROWN',
 'brown',
 'GR. F',
 'MO',
 'NOI.P',
 'AL. C',
 'Bf.',
 'RG',
 'BBF',
 'ALZ

In [14]:
import logging
from collections import Counter
import datetime as dt
from typing import Optional, Generator
from constants import UNIBET_MIN_DATE
from tqdm import tqdm
import os 
import json

# Standard-library logger for this cell (rebinds the name `logger`).
logger = logging.getLogger(__name__)
# Folder expected to contain one sub-folder per ISO date.
UNIBET_DATA_PATH = "./data/Unibet"

# Counts, keyed by True/False, of runners whose two stake values are equal.
stakes_equal = Counter()
# One tuple per mismatch:
# (race-file stake, statistics-file stake, ISO date, show rank, race rank, runner rank).
different_stakes=[]
def date_countdown_generator(
    start_date: dt.date, end_date: Optional[dt.date]
) -> Generator[dt.date, None, None]:
    """Yield each day from ``start_date`` up to ``end_date``, both included.

    Despite the name, days are produced in ascending order. A falsy
    ``end_date`` (e.g. ``None``) means "today". Nothing is yielded when
    ``start_date`` is after the end date.
    """
    last_day = end_date if end_date else dt.date.today()
    one_day = dt.timedelta(days=1)
    day = start_date
    while True:
        if day > last_day:
            return
        yield day
        day = day + one_day
        

# Walk every stored day and compare, runner by runner, the stake value of the
# race file ("details" -> "stakes") with the one of the per-runner statistics
# file ("fiche" -> "infos_generales" -> "gain").
# NOTE(review): the comparison is a plain ==/!=; a type or unit difference
# between the two sources would count as a mismatch -- confirm before reading
# too much into the totals.

# Evaluate "yesterday" once so the date range and the progress-bar total agree
# even if the cell runs across midnight.
last_date = dt.date.today() - dt.timedelta(days=1)
# List the data folder once instead of once per day.
available_days = set(os.listdir(UNIBET_DATA_PATH))

for date in tqdm(
    date_countdown_generator(start_date=UNIBET_MIN_DATE, end_date=last_date),
    # Both bounds are yielded, hence the +1 (otherwise the bar overruns).
    total=max((last_date - UNIBET_MIN_DATE).days + 1, 0),
    unit="days",
):
    if date.isoformat() not in available_days:
        logger.info("Could not find folder for date: %s", date.isoformat())
        continue
    day_folder_path = os.path.join(UNIBET_DATA_PATH, date.isoformat())
    programme_path = os.path.join(day_folder_path, "programme.json")
    if not os.path.isfile(programme_path):
        logger.info(
            "Could not find programme.json for date: %s", date.isoformat()
        )
        continue

    with open(programme_path, "r") as fp:
        programme = json.load(fp=fp)
    if "data" not in programme:
        logger.info("Can not import programme of %s", date.isoformat())
        continue

    horse_shows_dict = programme["data"]
    for horse_show_dict in horse_shows_dict:
        # A missing or empty "races" entry means there is nothing to read.
        for race_dict in horse_show_dict.get("races") or []:
            race_prefix = f"R{horse_show_dict['rank']}_C{race_dict['rank']}"
            race_path = os.path.join(day_folder_path, f"{race_prefix}.json")
            with open(race_path, "r") as fp:
                complete_race_dict = json.load(fp=fp)

            current_race_dict = complete_race_dict
            if complete_race_dict.get("note") == "server error, no json":
                # The detailed race file is unusable: fall back on the
                # programme's own race entry.
                current_race_dict = race_dict

            for runner_dict in current_race_dict["runners"]:
                details = runner_dict.get("details")
                stake = details.get("stakes") if details else None

                runner_statistics_path = os.path.join(
                    day_folder_path,
                    f"{race_prefix}_RN{runner_dict['rank']}.json",
                )
                with open(runner_statistics_path, "r") as fp:
                    runner_stats = json.load(fp=fp)

                # `or {}` also covers a "fiche" key that is present but null.
                runner_stats_info: Optional[dict] = (
                    runner_stats.get("fiche") or {}
                ).get("infos_generales")
                stake_info = (
                    runner_stats_info.get("gain") if runner_stats_info else None
                )
                stakes_equal[stake == stake_info] += 1
                if stake != stake_info:
                    different_stakes.append(
                        (
                            stake,
                            stake_info,
                            date.isoformat(),
                            horse_show_dict['rank'],
                            race_dict['rank'],
                            runner_dict['rank'],
                        )
                    )

5660days [17:19,  5.45days/s]                        


In [15]:
stakes_equal

Counter({False: 2286852, True: 503495})

In [None]:
# Show only the tail: the list holds one tuple per mismatching runner, far too
# many to render in full.
different_stakes[-5:]

In [125]:
import logging
from collections import Counter
import datetime as dt
from typing import Optional, Generator
from constants import UNIBET_MIN_DATE
from tqdm import tqdm
import os 
import json
from scripts.generate_unibet import _extract_name_country

# Standard-library logger for this cell (rebinds the name `logger`).
logger = logging.getLogger(__name__)
# Folder expected to contain one sub-folder per ISO date.
UNIBET_DATA_PATH = "./data/Unibet"

# Counts, keyed by True/False, of runners whose two extracted names are equal.
equal_names = Counter()
# One tuple per mismatch:
# (race-file name, statistics-file name, ISO date, show rank, race rank, runner rank).
different_names=[]
def date_countdown_generator(
    start_date: dt.date, end_date: Optional[dt.date]
) -> Generator[dt.date, None, None]:
    """Yield each day from ``start_date`` up to ``end_date``, both included.

    Despite the name, days are produced in ascending order. A falsy
    ``end_date`` (e.g. ``None``) means "today". Nothing is yielded when
    ``start_date`` is after the end date.
    """
    last_day = end_date if end_date else dt.date.today()
    one_day = dt.timedelta(days=1)
    day = start_date
    while True:
        if day > last_day:
            return
        yield day
        day = day + one_day
        

# Walk every stored day and compare, runner by runner, the horse name of the
# race file ("name") with the one of the per-runner statistics file
# ("fiche" -> "infos_generales" -> "nom"). Both are passed through
# _extract_name_country and only its first element is compared.

# Evaluate "yesterday" once so the date range and the progress-bar total agree
# even if the cell runs across midnight.
last_date = dt.date.today() - dt.timedelta(days=1)
# List the data folder once instead of once per day.
available_days = set(os.listdir(UNIBET_DATA_PATH))

for date in tqdm(
    date_countdown_generator(start_date=UNIBET_MIN_DATE, end_date=last_date),
    # Both bounds are yielded, hence the +1 (otherwise the bar overruns).
    total=max((last_date - UNIBET_MIN_DATE).days + 1, 0),
    unit="days",
):
    if date.isoformat() not in available_days:
        logger.info("Could not find folder for date: %s", date.isoformat())
        continue
    day_folder_path = os.path.join(UNIBET_DATA_PATH, date.isoformat())
    programme_path = os.path.join(day_folder_path, "programme.json")
    if not os.path.isfile(programme_path):
        logger.info(
            "Could not find programme.json for date: %s", date.isoformat()
        )
        continue

    with open(programme_path, "r") as fp:
        programme = json.load(fp=fp)
    if "data" not in programme:
        logger.info("Can not import programme of %s", date.isoformat())
        continue

    horse_shows_dict = programme["data"]
    for horse_show_dict in horse_shows_dict:
        # A missing or empty "races" entry means there is nothing to read.
        for race_dict in horse_show_dict.get("races") or []:
            race_prefix = f"R{horse_show_dict['rank']}_C{race_dict['rank']}"
            race_path = os.path.join(day_folder_path, f"{race_prefix}.json")
            with open(race_path, "r") as fp:
                complete_race_dict = json.load(fp=fp)

            current_race_dict = complete_race_dict
            if complete_race_dict.get("note") == "server error, no json":
                # The detailed race file is unusable: fall back on the
                # programme's own race entry.
                current_race_dict = race_dict

            for runner_dict in current_race_dict["runners"]:
                name = _extract_name_country(runner_dict["name"])[0]

                runner_statistics_path = os.path.join(
                    day_folder_path,
                    f"{race_prefix}_RN{runner_dict['rank']}.json",
                )
                with open(runner_statistics_path, "r") as fp:
                    runner_stats = json.load(fp=fp)

                # `or {}` also covers a "fiche" key that is present but null.
                runner_stats_info: Optional[dict] = (
                    runner_stats.get("fiche") or {}
                ).get("infos_generales")
                name_info = (
                    runner_stats_info.get("nom") if runner_stats_info else None
                )
                # name_info may be None here; it is passed to the helper as is.
                name_info = _extract_name_country(name_info)[0]
                equal_names[name == name_info] += 1
                if name != name_info:
                    different_names.append(
                        (
                            name,
                            name_info,
                            date.isoformat(),
                            horse_show_dict['rank'],
                            race_dict['rank'],
                            runner_dict['rank'],
                        )
                    )

5665days [17:26,  5.41days/s]                        


In [126]:
equal_names

Counter({True: 2564253, False: 226094})

In [128]:
different_names[-5:]

[('IKURO JIEL', None, '2020-11-21', 8, 1, 8),
 ('ECLAIR LUDOIS', None, '2020-11-21', 8, 2, 10),
 ('DIANE DU NORD', None, '2020-11-21', 8, 2, 14),
 ('EPONA DU PASSAGE', None, '2020-11-21', 8, 6, 15),
 ('DESTINO DODVILLE', None, '2020-11-21', 8, 7, 10)]

In [131]:
# Keep only the mismatches whose second element (the statistics-file name) is
# not None.
different_names_not_null_info_name = list(
    filter(lambda mismatch: mismatch[1] is not None, different_names)
)

In [133]:
len(different_names_not_null_info_name)

18426

In [135]:
different_names_not_null_info_name[-20:]

[('SUPER REGALONA', 'WONDER GOLLY', '2020-10-30', 7, 3, 10),
 ('ADRIANKA', 'BABY GOLD', '2020-10-30', 7, 3, 11),
 ('COMO UNA NINA', 'CAPACITA', '2020-10-30', 7, 3, 12),
 ('ESCOLTADORA', 'LARA CROFT', '2020-10-30', 7, 4, 1),
 ('DALE FLORA', 'CONTENIDA', '2020-10-30', 7, 4, 3),
 ('GALICADA STREAPS', 'MANDALA', '2020-10-30', 7, 4, 4),
 ('GRACIAS POR VENIR', 'MILKYWAY', '2020-10-30', 7, 4, 5),
 ('LA FLOR DE ORO', 'QUE COQUETA', '2020-10-30', 7, 4, 6),
 ('EASY CHARITY', 'DICHELLE', '2020-10-30', 7, 4, 7),
 ('WINDSTOÃF', 'WINDSTOÆ', '2020-11-01', 2, 5, 7),
 ('LIEBLING E  D4', 'LIEBLING', '2020-11-04', 5, 8, 5),
 ('GLAÇADA', 'GLAǁDA', '2020-11-08', 6, 5, 8),
 ('SEÃ\x91ORITA ROBERTA', 'SENORITA ROBERTA', '2020-11-13', 8, 8, 5),
 ('MALIKADO', 'EL GRAN CARATA SANT', '2020-11-13', 8, 11, 8),
 ('HÜBSCH TF', 'H܂SCH TF', '2020-11-14', 5, 7, 2),
 ('GLAÇADA', 'GLAǁDA', '2020-11-15', 2, 6, 14),
 ('NADA MÃIS', 'NADA MÉS', '2020-11-15', 3, 6, 2),
 ('JR WALKER E', 'JR WALKER', '2020-11-18', 5, 6, 4),
 ('L

In [138]:
#  ('LA FLOR DE ORO', 'QUE COQUETA', '2020-10-30', 7, 4, 6),

from Levenshtein import distance


In [139]:
distance('LA FLOR DE ORO','QUE COQUETA')

12

In [140]:
distance('HÜBSCH TF', 'H܂SCH TF')

2

In [141]:
distance('JR WALKER E', 'JR WALKER')

2

In [143]:
distance('WINDSTOÃF', 'WINDSTOÆ')

2

In [None]:
from  sqlalchemy.sql.expression import func
from utils.music import parse_unibet_music
from constants import UnibetRaceType

# Sample up to 100000 runners in random order, parse their music string and
# print every parsed event whose race type is UnibetRaceType.UNKNOWN.
with create_sqlalchemy_session() as db_session:
    runners = db_session.query(Runner).order_by(func.random()).limit(100000).all()
    # Only ever receives UnibetRaceType.UNKNOWN (see the condition below).
    res=set()
    for runner in runners:
        try:
            music_events, is_new = parse_unibet_music(current_year = runner.date.year, music=runner.music)
        except Exception as e:
            # Report the failing runner and keep going with the next one.
            print(f"ERROR with {e}\n runner {runner.id} {runner.date.year} '{runner.music}', {runner.horse.name}")
            print()
        else:
            # Nothing to inspect when is_new is truthy or None.
            if is_new or is_new is None:
                continue

            for event in music_events:
                # The member is spelled UNKNOWN, as in the following cell;
                # `UnibetRaceType.Unknown` does not match that spelling.
                if event.race_type == UnibetRaceType.UNKNOWN:
                    res.add(event.race_type)
                    print(event, runner.music, runner.race.type, runner.unibet_code,)
                    print(runner.race.friendly_URL, runner.horse.name)
                    print()

In [None]:
from  sqlalchemy.sql.expression import func
from utils.music import parse_unibet_music
from constants import UnibetRaceType

# Sample up to 100000 runners in random order, parse their music string and
# collect (and print) the music strings that contain an event whose race type
# is UnibetRaceType.UNKNOWN.
with create_sqlalchemy_session() as db_session:
    runners = (
        db_session.query(Runner)
        .order_by(func.random())
        .limit(100000)
        .all()
    )
    res = set()
    for runner in runners:
        try:
            music_events, is_new = parse_unibet_music(
                current_year=runner.date.year, music=runner.music
            )
        except Exception as e:
            # Report the failing runner and move on to the next one.
            print(f"ERROR with {e}\n runner {runner.id} {runner.date.year} '{runner.music}', {runner.horse.name}")
            print()
            continue

        # Nothing to inspect when is_new is truthy or None.
        if is_new or is_new is None:
            continue

        for event in music_events:
            if not event.race_type == UnibetRaceType.UNKNOWN:
                continue
            res.add(runner.music)
            print(event, runner.music, runner.race.type, runner.unibet_code,)
            print(runner.race.friendly_URL, runner.horse.name)
            print()