In [1]:
from datetime import datetime,  timedelta
import os
import re
import time
import pandas as pd
import numpy as np
from multiprocessing import Process, Manager, current_process

CATCH_DAYS = 14

In [131]:
#funcs
def generate_dates(
                    time_target: str
    ) -> list[datetime]:
    """
    створення сортированого списку дат csv-файлів з папки -  "past" або "future"
    """
    a = [
        datetime(
            year=int(f"20{file_name.split('_')[2]}"),
            month=int(file_name.split('_')[1]),
            day=int(file_name.split('_')[0]),
          )#.strftime("%d_%m_%y")
        for file_name in os.listdir(f'csv/{time_target}/')
    ]
    a.sort()
    return a


def filter_from_tuesday(
                        target_day: datetime, 
                        list_days: list[datetime],
                        target_df: bool  = False,
    ) -> list[datetime] :
    """
    отримання списку дат, з вівторка по понеділок, включно
    результат = список з CATCH_DAYS дат
    """
    count_dates = 7 if target_df is True else CATCH_DAYS
    return list(filter(
        lambda x: target_day <= x < target_day + timedelta(days=count_dates),
        list_days
    ))


def create_df_w_date(
                csv_name: str, 
                time_target: str
    ) -> pd.DataFrame :
    """
    створення датафрейму з колонкою дати 
    """
    df = pd.read_csv(f'csv/{time_target}/{csv_name}')
    df['date'] = csv_name[:8]
    return df


def get_final_df(
                time_target: str,
                day_target: datetime,
                target_df: bool  = False,
    ) -> pd.DataFrame :
    """
    """
    list_days = generate_dates(time_target)
    filtred_list_days = filter_from_tuesday(day_target, list_days, target_df)

    file_ends = "_p.csv" if time_target == 'past' else "_f.csv"

    return pd.concat(
        [create_df_w_date(f"{days.strftime('%d_%m_%y')}{file_ends}", time_target) for days in filtred_list_days],
        ignore_index=True
    ) 


def create_np_from_df(
                    df: pd.DataFrame,
                    time_target: str='past',
    ) -> np.ndarray:
    """
    - добавлення необхідних колонок до датафрейму
    - створення масиву даних готових для обробки
    """
    def add_column(
                    data: pd.Series, 
                    cur_team: pd.Series, 
                    target_team: pd.Series,
        ) -> list[int]:
        rez = []
        for index in data.index:
            try:
                z_h, p_h = data[index].split('-')
                z_h, p_h = (int(z_h), int(p_h)) if cur_team[index] == target_team[index] else (int(p_h), int(z_h))

                rez.append(z_h - p_h) 
            except ValueError:
                rez.append(-999) 
        return rez
    
    def add_rez_column(
                    data: pd.Series, 
                    cur_team: pd.Series, 
                    target_team: pd.Series,
        ) -> list[int]:
        """
        створення колонки тотала більше 1.5 
        коланка буде містити  - 1 або 0 або None
        """
        rez = []
        for index in data.index:
            try:
                z_h, p_h = data[index].split('-')
                z_h, p_h = (int(z_h), int(p_h)) if cur_team[index] == target_team[index] else (int(p_h), int(z_h))

                rez.append(
                    int(abs(z_h + p_h) > 1.5)
                ) 
            except ValueError:
                rez.append(None) 
        return rez


    df['rez_h_f_half'] = add_column(
                                    data=df['h_match_f_half'], 
                                    cur_team=df['cur_match_h_team'], 
                                    target_team=df['h_match_h_team']
                                )
    df['rez_h_match'] = add_column(
                                    data=df['h_match_score'], 
                                    cur_team=df['cur_match_h_team'], 
                                    target_team=df['h_match_h_team']
                                )

    df['rez_a_f_half'] = add_column(
                                    data=df['a_match_f_half'], 
                                    cur_team=df['cur_match_a_team'], 
                                    target_team=df['a_match_h_team']
                                )
    df['rez_a_match'] = add_column(
                                    data=df['a_match_score'], 
                                    cur_team=df['cur_match_a_team'], 
                                    target_team=df['a_match_h_team']
                                )
    df['index'] = df.index
    if time_target == 'past':
        df['rez_c_f_half'] = add_rez_column(
                                        data=df['cur_match_f_half'], 
                                        cur_team=df['cur_match_h_team'], 
                                        target_team=df['cur_match_h_team']
                                    )
        df['rez_c_match'] = add_rez_column(
                                        data=df['cur_match_score'], 
                                        cur_team=df['cur_match_h_team'], 
                                        target_team=df['cur_match_h_team']
                                    )
        df.dropna(inplace=True)
        return  df[[
                    'rez_h_f_half','rez_h_match',
                    'rez_a_f_half', 'rez_a_match',
                    'rez_c_f_half', 'rez_c_match',
                    'index'
                ]].to_numpy()
    
    df.dropna(inplace=True)
    return  df[[
                'rez_h_f_half','rez_h_match',
                'rez_a_f_half', 'rez_a_match',
                'index'
            ]].to_numpy()


def predict_past(
            df_np: np.ndarray,
            proc: float,
            count_games: int,
            verbose=True
    ) -> list[list[int]]:
    """
    """
    lists = range(-5,5+1)
    total_rez = []
    total_matches = 0
    
    for h_f in lists:
        for h_score in lists:
            for a_f in lists:
                for a_score in lists:
                    
                    rez = df_np[ 
                        (df_np[:,0] == h_f) &
                        (df_np[:,1] == h_score) &
                        (df_np[:,2] == a_f) &
                        (df_np[:,3] == a_score) 
                    ]
                    if rez.shape[0] == 0:
                        continue
                        
                    r_h_f_tb = rez[rez[:,4] == 1].shape[0]
                    r_h_f_tm = rez[rez[:,4] == 0].shape[0] 
                    proc_f_tb = r_h_f_tb / (r_h_f_tb + r_h_f_tm) if (r_h_f_tb + r_h_f_tm) > 0 else 1.0


                    # r_h_score_tb = rez[rez[:,5] == 1].shape[0]
                    # r_h_score_tm = rez[rez[:,5] == 0].shape[0] 
                    # proc_score = r_h_score_tb/(r_h_score_tb+r_h_score_tm)
                    # if proc_f_tb >= proc and (r_h_f_tb + r_h_f_tm) >= count_games:
                    if proc_f_tb >= proc and (r_h_f_tb + r_h_f_tm) >= count_games:
                    # if proc_score >= proc and (r_h_score+r_a_score) >= count_games:
                    # if proc_score <= proc and (r_h_score+r_a_score) >= count_games:
                        total_rez.append([h_f, h_score, a_f, a_score])
                        total_matches += rez.shape[0]

                        if verbose is True:
                            print(
                                f'{h_f:2d} {h_score:2d} {a_f:2d} {a_score:2d}'
                                f' ||| f_h =>{r_h_f_tb:3d} {r_h_f_tm:3d} = {proc_f_tb:.3f}'
                                # f' ||| score{r_h_score:3d} {r_a_score:3d}= {proc_score:.3f}'
                            )

    return total_rez, total_matches



def check_predict(
                df_np: np.ndarray,
                list_rez: list[list[int]],
                time_target: str='past'
    ) -> dict[str, int]:
    total_rez = {
                'tb': 0,
                'tm': 0,
                'total_games': 0,
                'indexes': []
    }
    for h_f, h_score, a_f, a_score in list_rez:
        r = df_np[
                (df_np[:,0] == h_f) &
                (df_np[:,1] == h_score) &
                (df_np[:,2] == a_f) &
                (df_np[:,3] == a_score) 
            ]
        if time_target == 'future':
            print(f'{r[:, 4]}',h_f, h_score, a_f, a_score)
            total_rez['indexes'] = [*total_rez['indexes'],*r[:, 4]]
            continue
            
        r_h_f_tb = r[r[:,4] == 1] 
        r_h_f_tm = r[r[:,4] == 0] 
        
 
        total_rez['tb'] += r_h_f_tb.shape[0]
        total_rez['tm'] += r_h_f_tm.shape[0]
        total_rez['total_games'] += r.shape[0]
        total_rez['indexes'] = [*total_rez['indexes'],*r[:, 6]]
        
#         print(f'{r[:, 6]}',h_f, h_score, a_f, a_score, '||', r_h_f_tb.shape[0], r.shape[0])
#         print(h_f, h_score, a_f, a_score, '||', r_h_f_tb.shape[0], r_h_f_tm.shape[0])
    return total_rez     

In [154]:
## отримання датафреймів(пачка з 7 днів)   для обробки, 
## df_work - для предсказання, 
## df_target - для перевірки 

time_target = 'past' 
# time_target = 'future' 
day_target = datetime(year=2023, month=4, day=11)

df_work = get_final_df(
                time_target=time_target, 
                day_target=day_target
            )
df_target = get_final_df(
                time_target=time_target, 
                day_target=day_target+timedelta(days=CATCH_DAYS),
                target_df=True
            )
# df_work
# df_target

In [155]:
## отримання numpy.ndarray готових до обробки 
## df_np_work - для предсказання, 
## df_np_target - для перевірки 

df_np_work = create_np_from_df(df_work)
df_np_target = create_np_from_df(df_target)

In [156]:
# отримання найкращих стат
for proc_work in np.arange(0.51, 0.9, 0.01):
    for tot_work in np.arange(2, 30, 1):
        
        rez, tot_matches = predict_past(df_np_work, proc_work, tot_work, verbose=False)
        print(proc_work, tot_work, tot_matches)
        continue
        
        if tot_matches >= 50: continue
            
        a = check_predict(df_np_target, rez) 
        
        if a['total_games'] == 0: continue
            
        tb = a['tb'] / a['total_games']
        tm = a['tm'] / a['total_games']
        if tb >= 0.5:
            
            print(
                f'matches = {tot_matches} '
                f'proc_work={proc_work:.3f}, tot_work={tot_work} => '
#                 f'tb {tb:.3f}  || '
                f'tb {tb:.3f} ',
                f'tot_target= {a["total_games"]} \n',

            )

0.51 2 590
0.51 3 530
0.51 4 425
0.51 5 401
0.51 6 321
0.51 7 303
0.51 8 268
0.51 9 220
0.51 10 193
0.51 11 153
0.51 12 131
0.51 13 131
0.51 14 92
0.51 15 78
0.51 16 63
0.51 17 63
0.51 18 63
0.51 19 63
0.51 20 44
0.51 21 44
0.51 22 23
0.51 23 23
0.51 24 0
0.51 25 0
0.51 26 0
0.51 27 0
0.51 28 0
0.51 29 0
0.52 2 590
0.52 3 530
0.52 4 425
0.52 5 401
0.52 6 321
0.52 7 303
0.52 8 268
0.52 9 220
0.52 10 193
0.52 11 153
0.52 12 131
0.52 13 131
0.52 14 92
0.52 15 78
0.52 16 63
0.52 17 63
0.52 18 63
0.52 19 63
0.52 20 44
0.52 21 44
0.52 22 23
0.52 23 23
0.52 24 0
0.52 25 0
0.52 26 0
0.52 27 0
0.52 28 0
0.52 29 0
0.53 2 550
0.53 3 490
0.53 4 385
0.53 5 361
0.53 6 281
0.53 7 263
0.53 8 228
0.53 9 180
0.53 10 153
0.53 11 113
0.53 12 91
0.53 13 91
0.53 14 52
0.53 15 38
0.53 16 23
0.53 17 23
0.53 18 23
0.53 19 23
0.53 20 23
0.53 21 23
0.53 22 23
0.53 23 23
0.53 24 0
0.53 25 0
0.53 26 0
0.53 27 0
0.53 28 0
0.53 29 0
0.54 2 496
0.54 3 436
0.54 4 331
0.54 5 307
0.54 6 227
0.54 7 209
0.54 8 174
0.54 9 

0.6600000000000001 11 0
0.6600000000000001 12 0
0.6600000000000001 13 0
0.6600000000000001 14 0
0.6600000000000001 15 0
0.6600000000000001 16 0
0.6600000000000001 17 0
0.6600000000000001 18 0
0.6600000000000001 19 0
0.6600000000000001 20 0
0.6600000000000001 21 0
0.6600000000000001 22 0
0.6600000000000001 23 0
0.6600000000000001 24 0
0.6600000000000001 25 0
0.6600000000000001 26 0
0.6600000000000001 27 0
0.6600000000000001 28 0
0.6600000000000001 29 0
0.6700000000000002 2 176
0.6700000000000002 3 116
0.6700000000000002 4 98
0.6700000000000002 5 74
0.6700000000000002 6 54
0.6700000000000002 7 48
0.6700000000000002 8 34
0.6700000000000002 9 10
0.6700000000000002 10 10
0.6700000000000002 11 0
0.6700000000000002 12 0
0.6700000000000002 13 0
0.6700000000000002 14 0
0.6700000000000002 15 0
0.6700000000000002 16 0
0.6700000000000002 17 0
0.6700000000000002 18 0
0.6700000000000002 19 0
0.6700000000000002 20 0
0.6700000000000002 21 0
0.6700000000000002 22 0
0.6700000000000002 23 0
0.67000000000

0.7800000000000002 17 0
0.7800000000000002 18 0
0.7800000000000002 19 0
0.7800000000000002 20 0
0.7800000000000002 21 0
0.7800000000000002 22 0
0.7800000000000002 23 0
0.7800000000000002 24 0
0.7800000000000002 25 0
0.7800000000000002 26 0
0.7800000000000002 27 0
0.7800000000000002 28 0
0.7800000000000002 29 0
0.7900000000000003 2 108
0.7900000000000003 3 48
0.7900000000000003 4 30
0.7900000000000003 5 26
0.7900000000000003 6 6
0.7900000000000003 7 0
0.7900000000000003 8 0
0.7900000000000003 9 0
0.7900000000000003 10 0
0.7900000000000003 11 0
0.7900000000000003 12 0
0.7900000000000003 13 0
0.7900000000000003 14 0
0.7900000000000003 15 0
0.7900000000000003 16 0
0.7900000000000003 17 0
0.7900000000000003 18 0
0.7900000000000003 19 0
0.7900000000000003 20 0
0.7900000000000003 21 0
0.7900000000000003 22 0
0.7900000000000003 23 0
0.7900000000000003 24 0
0.7900000000000003 25 0
0.7900000000000003 26 0
0.7900000000000003 27 0
0.7900000000000003 28 0
0.7900000000000003 29 0
0.8000000000000003 

In [164]:
## отримання  предсказання
# rez, tot_matches = predict_past(df_np_work, 0.83, 3)  # перевірено
rez, tot_matches = predict_past(df_np_work, 0.83, 3)  # тест
print(f'загальна кількість матчів = {tot_matches}')
rez

-3 -4  1  2 ||| f_h =>  3   0 = 1.000
-2 -4  2  2 ||| f_h =>  3   0 = 1.000
-2 -2  1 -1 ||| f_h =>  3   0 = 1.000
-1  1  0  2 ||| f_h =>  3   0 = 1.000
 0 -1 -1  2 ||| f_h =>  4   0 = 1.000
 0 -1  1  4 ||| f_h =>  3   0 = 1.000
 0  0  1  4 ||| f_h =>  5   1 = 0.833
 2  2  1  3 ||| f_h =>  5   0 = 1.000
 3  4  1  0 ||| f_h =>  3   0 = 1.000
загальна кількість матчів = 33


[[-3, -4, 1, 2],
 [-2, -4, 2, 2],
 [-2, -2, 1, -1],
 [-1, 1, 0, 2],
 [0, -1, -1, 2],
 [0, -1, 1, 4],
 [0, 0, 1, 4],
 [2, 2, 1, 3],
 [3, 4, 1, 0]]

In [165]:
a = check_predict(df_np_target, rez)  
if a['total_games'] > 0:
    tb = a['tb'] / a['total_games']
    tm = a['tm'] / a['total_games']
    print(
        a, '\n', 
        f'tb {tb:.3f}  || '
        f'tm {tm:.3f} \n',

    )

{'tb': 0, 'tm': 1, 'total_games': 1, 'indexes': [44]} 
 tb 0.000  || tm 1.000 



In [159]:
# df_target.loc[a['indexes']]['rez_c_f_half'].value_counts()
df_target.loc[a['indexes']] 

Unnamed: 0,cur_match_h_team,cur_match_a_team,cur_match_score,cur_match_f_half,cur_match_s_half,h_match_h_team,h_match_a_team,h_match_score,h_match_f_half,h_match_s_half,...,a_match_f_half,a_match_s_half,date,rez_h_f_half,rez_h_match,rez_a_f_half,rez_a_match,index,rez_c_f_half,rez_c_match
44,Манчестер Юнайтед U18,Вулверхэмптон U18,2-2,1 - 0,1 - 2,Мидлсбро U18,Манчестер Юнайтед U18,0-2,0 - 2,0 - 0,...,3 - 2,2 - 0,25_04_23,2,2,1,3,44,0,1


In [232]:
# df_work[
#         (df_work['rez_h_f_half'] == -1) &
#         (df_work['rez_h_match']  == -3) &
#         (df_work['rez_a_f_half'] == -1) &
#         (df_work['rez_a_match']  == -1) 
#     ]

df_target[
        (df_target['rez_h_f_half'] == -1) &
        (df_target['rez_h_match']  == -3) &
        (df_target['rez_a_f_half'] == -1) &
        (df_target['rez_a_match']  == -1) 
    ]

Unnamed: 0,cur_match_h_team,cur_match_a_team,cur_match_score,cur_match_f_half,cur_match_s_half,h_match_h_team,h_match_a_team,h_match_score,h_match_f_half,h_match_s_half,...,a_match_f_half,a_match_s_half,date,rez_h_f_half,rez_h_match,rez_a_f_half,rez_a_match,index,rez_c_f_half,rez_c_match
533,Витоша Бистрица,Спартак Плевен,2-2,2 - 2,0 - 0,Струмска Слава,Витоша Бистрица,3-0,1 - 0,2 - 0,...,0 - 1,0 - 0,14_04_23,-1,-3,-1,-1,533,1,1
883,АСМ Оран,КР Темушен,0-2,0 - 0,0 - 2,Бен Акнун,АСМ Оран,4-1,1 - 0,3 - 1,...,0 - 1,1 - 1,15_04_23,-1,-3,-1,-1,883,0,1
1689,Денизлиспор,Кечиоренгюджю,0-4,0 - 2,0 - 2,Тузласпор,Денизлиспор,3-0,1 - 0,2 - 0,...,0 - 1,2 - 2,15_04_23,-1,-3,-1,-1,1689,1,1
1882,Блэк Старз,Кёниц,2-1,2 - 0,0 - 1,Тун II,Блэк Старз,5-2,1 - 0,4 - 2,...,0 - 1,0 - 0,15_04_23,-1,-3,-1,-1,1882,1,1
2475,Giugliano,Потенца,3-2,2 - 2,1 - 0,Фоджа,Giugliano,3-0,1 - 0,2 - 0,...,0 - 1,1 - 1,16_04_23,-1,-3,-1,-1,2475,1,1
2945,Мутенице,Ратишковице,2-1,1 - 0,1 - 1,Kunstat,Мутенице,4-1,2 - 1,2 - 0,...,0 - 1,1 - 1,16_04_23,-1,-3,-1,-1,2945,0,1
2949,Bolatice,Долни Датыне,1-1,1 - 1,0 - 0,Вратимов,Bolatice,5-2,3 - 2,2 - 0,...,1 - 2,0 - 0,16_04_23,-1,-3,-1,-1,2949,1,1


In [168]:
## предсказання майбутніх матчів 
time_target = 'future' 
# day_target = datetime(year=2023, month=4, day=9)

df_future = pd.read_csv('csv/future/26_04_23_f.csv')
df_np_future = create_np_from_df(df_future ,time_target='future')
a = check_predict(
                df_np=df_np_future,
                list_rez=rez,
                time_target='future'
)
a

[] -3 -4 1 2
[] -2 -4 2 2
[] -2 -2 1 -1
[] -1 1 0 2
[] 0 -1 -1 2
[205] 0 -1 1 4
[22] 0 0 1 4
[] 2 2 1 3
[] 3 4 1 0


{'tb': 0, 'tm': 0, 'total_games': 0, 'indexes': [205, 22]}

In [169]:
df_future.loc[a['indexes']]

Unnamed: 0,cur_match_h_team,cur_match_a_team,cur_match_score,cur_match_f_half,cur_match_s_half,h_match_h_team,h_match_a_team,h_match_score,h_match_f_half,h_match_s_half,a_match_h_team,a_match_a_team,a_match_score,a_match_f_half,a_match_s_half,rez_h_f_half,rez_h_match,rez_a_f_half,rez_a_match,index
205,Пелистер,Караорман,without_score,without_score,without_score,Беласица,Пелистер,2-1,1 - 1,1 - 0,Караорман,Борец,4-0,1 - 0,3 - 0,0,-1,1,4,205
22,ЦСКА Ереван,Ван,without_score,without_score,without_score,Ширак,ЦСКА Ереван,1-1,0 - 0,1 - 1,Ван,Лернаин Арцах,4-0,1 - 0,3 - 0,0,0,1,4,22


In [243]:
rez

[[-1, -3, -1, -1], [0, -2, -1, -1], [2, 2, 0, 0]]