In [9]:
import sys
import time
import os
import asyncio
import numpy as np
import pandas as pd
import json
import traceback
from typing import List, Dict

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

from fake_useragent import UserAgent

from IPython.display import clear_output

from parsers import DataParser
from functions import extract_text

In [40]:
def extract_attribute(dct: Dict, attr: str) -> List:
    """Collect the values stored under ``attr`` in a node and all of its descendants.

    The result mirrors the shape of the tree: the node's own value (when the
    key is present) comes first, followed by one nested list per child built
    by the same recursion. ``simplify_list`` can be used to collapse the
    nesting afterwards.

    Args:
        dct: Node dictionary; child nodes are read from its ``'children'`` key.
        attr: Key to look up on every node.

    Returns:
        A possibly nested list of the values that were found.
    """
    res = []
    if attr in dct:
        res.append(dct[attr])

    # A node may lack the 'children' key entirely, or hold None / an empty
    # list there; all three mean "no children" instead of raising KeyError.
    for child in dct.get('children') or []:
        res.append(extract_attribute(child, attr))

    return res


def extract_text(dct: Dict) -> List:
    """Collect the stripped ``'text'`` of a node and of all of its descendants.

    The result mirrors the shape of the tree: the node's own text (when the
    key is present) comes first, followed by one nested list per child built
    by the same recursion.

    Args:
        dct: Node dictionary; text is read from ``'text'`` and child nodes
            from ``'children'``.

    Returns:
        A possibly nested list of whitespace-stripped strings.
    """
    res = []
    if 'text' in dct:
        res.append(dct['text'].strip())

    # A node may lack the 'children' key entirely, or hold None / an empty
    # list there; all three mean "no children" instead of raising KeyError.
    for child in dct.get('children') or []:
        res.append(extract_text(child))

    return res


def simplify_list(lst):
    """Collapse redundant nesting in a nested list.

    Rules, applied recursively:
      * a string or any other non-list value is a leaf and is returned as is;
      * an empty list becomes ``None``;
      * a one-element list is replaced by its (simplified) only element;
      * any longer list is returned as a new list of simplified elements.

    Example:
        ``['', 'g', 'd', [[[['b']], 't']]]`` -> ``['', 'g', 'd', ['b', 't']]``
    """
    # Leaves: strings (including str subclasses) and scalars such as None or
    # numbers. Previously a non-str scalar crashed in len().
    if not isinstance(lst, (list, tuple)):
        return lst
    if lst == []:
        return None
    if len(lst) == 1:
        return simplify_list(lst[0])

    return [simplify_list(el) for el in lst]


def selenium_to_dict(web_elem):
    """Parse the inner HTML of a Selenium element with ``DataParser``.

    Returns whatever the parser exposes as ``.data`` after being fed the
    element's ``innerHTML`` attribute.
    """
    html_parser = DataParser()
    inner_html = web_elem.get_attribute('innerHTML')
    html_parser.feed(inner_html)
    return html_parser.data


def get_unique_headers(headers_row):
    """Return the headers in order, renaming repeats so every name is unique.

    A header that is already taken gets ``'*'`` appended until the name is
    free, so the second occurrence of ``'x'`` becomes ``'x*'``, the third
    ``'x**'`` and so on.

    Args:
        headers_row: Iterable of header strings, possibly with repeats.

    Returns:
        A new list of the same length with pairwise distinct names.
    """
    headers = []
    for el in headers_row:
        # Appending a single '*' only disambiguates the second occurrence;
        # loop so that third and later repeats stay unique as well.
        while el in headers:
            el = el + '*'
        headers.append(el)
    return headers


def fork_percentage(coeff_1, coeff_2):
    """Return ``1 - (1/coeff_1 + 1/coeff_2)`` as a percentage string.

    Both coefficients are converted with ``float`` first. The value is
    multiplied by 100, rounded to one decimal place and suffixed with ``%``.
    It is positive only when the two reciprocals sum to less than 1.
    """
    inverse_1 = 1 / float(coeff_1)
    inverse_2 = 1 / float(coeff_2)
    margin = 1 - (inverse_1 + inverse_2)
    percent = round(margin * 100, 1)
    return '{}%'.format(percent)


def check_event(event, opposite_event, frame=None):
    """Compute ``fork_percentage`` for every row of two coefficient columns.

    Args:
        event: Name of the column holding the first coefficient.
        opposite_event: Name of the column holding the second coefficient.
        frame: DataFrame to read from. When omitted, the notebook-level
            ``df_joined`` is used, which keeps existing two-argument calls
            working.

    Returns:
        A list with one entry per row: the percentage string, or ``None``
        when the pair could not be evaluated (the error is printed).
    """
    if frame is None:
        # Backward-compatible default: the merged table built by the notebook.
        frame = df_joined

    res = []

    for pair in frame.loc[:, [event, opposite_event]].values:
        try:
            res.append(fork_percentage(*pair))
        except Exception as exc:
            # Best effort: one bad pair must not abort the whole column.
            print(exc)
            res.append(None)

    return res


async def parse_block(root, features_cnt):
    """Build an empty odds table whose columns come from one parsed block.

    The previous body could not run: it called ``extractText``,
    ``simplifyList`` and ``getUniqueHeaders`` (the helpers in this notebook
    are snake_case), read an undefined ``block`` instead of the ``root``
    parameter, and tested the local ``columns_added_flg`` before assigning it.

    Args:
        root: Parsed node dictionary of the block, as accepted by
            ``extract_text``.
            NOTE(review): assumed to be the block itself, since the old body
            used an undefined ``block``; confirm against the intended caller.
        features_cnt: Currently unused; kept so the signature is unchanged.

    Returns:
        An empty DataFrame with columns ``'Player 1'``, ``'Player 2'``
        followed by the de-duplicated headers of the block.
    """
    text = simplify_list(extract_text(root))

    # Position of the header row inside the simplified text; per the original
    # comment this differs for every bookmaker.
    headers_row = text[1][3]

    headers = get_unique_headers(headers_row)

    return pd.DataFrame(columns=['Player 1', 'Player 2'] + headers)

In [42]:
# Sanity check: single-element wrappers collapse, longer lists keep their shape.
simplify_list(['', 'g', 'd', [[[['b']], 't']]])

['', 'g', 'd', ['b', 't']]

In [31]:
class Bookmaker:
    """Holds a bookmaker page URL and the Chrome session opened on it."""

    def __init__(self, url):
        """Remember the page URL; no browser is started yet."""
        self.url = url
        # Set by start_driver(); None means "no browser running".
        self.driver = None

    def start_driver(self):
        """Start Chrome via ``./chromedriver``, open ``self.url`` and maximize.

        The driver is stored in ``self.driver``. If loading the page or
        maximizing fails, the browser is shut down before the error is
        re-raised so no Chrome process is left behind.
        """
        chrome_options = Options()

        # NOTE(review): the original built a prefs dict
        # ({"profile.default_content_settings": {"popups": 1}}) but never
        # passed it to Chrome. It is dropped here to keep behavior identical;
        # to apply it use chrome_options.add_experimental_option("prefs", {...}).

        driver = webdriver.Chrome(options=chrome_options, executable_path="./chromedriver")
        try:
            driver.get(self.url)
            driver.maximize_window()
        except Exception:
            driver.quit()
            raise

        self.driver = driver

    def close_driver(self):
        """Shut the browser down; does nothing if no driver was started."""
        if self.driver is None:
            return
        # quit() ends the whole WebDriver session (all windows plus the
        # chromedriver process); close() only closes the current window.
        self.driver.quit()
        self.driver = None

In [32]:
def get_driver(url, proxy=False):
    """Start Chrome with a random user agent, open ``url`` and maximize.

    Args:
        url: Page to load.
        proxy: When true, the ``configures.zip`` extension is added to Chrome
            (presumably a proxy configuration; the archive is not part of
            this notebook).

    Returns:
        The running ``webdriver.Chrome`` instance.

    The chosen user agent is printed. If loading the page or maximizing
    fails, the browser is shut down before the error is re-raised so no
    Chrome process is left behind.
    """
    chrome_options = Options()
    user_agent = UserAgent().random
    print(user_agent)
    chrome_options.add_argument(f'user-agent={user_agent}')

    # NOTE(review): the original built a prefs dict
    # ({"profile.default_content_settings": {"popups": 1}}) but never passed
    # it to Chrome. It is dropped here to keep behavior identical; to apply
    # it use chrome_options.add_experimental_option("prefs", {...}).

    if proxy:
        chrome_options.add_extension("configures.zip")

    driver = webdriver.Chrome(options=chrome_options, executable_path="./chromedriver")
    try:
        driver.get(url)
        driver.maximize_window()
    except Exception:
        driver.quit()
        raise

    return driver

In [35]:
# Open one Chrome session per bookmaker on its live page.
# get_driver prints the random user agent picked for each session.
fonbet = get_driver('https://www.fonbet.ru/live/')
xstavka = get_driver('https://1xstavka.ru/en/live/')

Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36


In [38]:
# Wait up to 10 s for each odds table container to appear in the DOM.
# For fonbet the root is the first direct child of the flex container.
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
# Selenium releases no longer provide; this form works on old and new versions.
fonbet_root = WebDriverWait(fonbet, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div[@class="table__flex-container"]'))
    ).find_elements(By.XPATH, './*')[0]

# The class string, including its trailing space, must match the page exactly.
xstavka_root = WebDriverWait(xstavka, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div[@class="game_content_line on_main live-content "]/div/div/div/div'))
    )

In [41]:
# Read the 'title' attributes of the 1xstavka table and flatten the nesting.
data = selenium_to_dict(xstavka_root)
text_tmp = extract_attribute(data, 'title')
text = simplify_list(text_tmp)
# Layout-specific position of the header titles in the simplified structure.
canonical_headers = text[0][0][-1]
canonical_headers

['1',
 'Draw',
 '2',
 'W1 or Draw',
 'W1 or W2',
 'Draw or W2',
 'Total over',
 'Total parameter',
 'Total under',
 'Team 1 Handiсap',
 'Handicap parameter',
 'Team 2 Handicap',
 'Team 1 Total Over',
 'Team Total value',
 'Team 1 Total Under',
 'Team 2 Total Over',
 'Team Total value',
 'Team 2 Total Under']

In [43]:
def fonbet_table(root=None):
    """Build a DataFrame of players and coefficients from the fonbet table.

    Args:
        root: Selenium element holding the table. When omitted, the
            notebook-level ``fonbet_root`` is used, which keeps existing
            zero-argument calls working.

    Returns:
        A DataFrame with columns ``'Player 1'``, ``'Player 2'`` followed by
        the de-duplicated headers of the first block. A later block with
        different headers adds its extra columns at the end. If the table
        has no blocks, an empty DataFrame without columns is returned.

    Rows are collected in a list and the frame is built once at the end.
    Growing it with ``DataFrame.append`` is quadratic, and that method is no
    longer available in pandas 2.x.
    """
    if root is None:
        root = fonbet_root

    parsed = selenium_to_dict(root)
    blocks = parsed['children'][0]['children']

    # Number of trailing cells in a row that hold headers / coefficients.
    features_cnt = 14
    columns = None
    records = []

    for block in blocks:
        text = simplify_list(extract_text(block))

        headers = get_unique_headers(text[0][-features_cnt:])
        block_columns = ['Player 1', 'Player 2'] + headers

        if columns is None:
            # The first block fixes the base column order.
            columns = list(block_columns)

        for row in text[1:]:
            try:
                # Workaround for player names: "A — B" sits at a fixed
                # position in the row.
                players = row[1][0][1].split('—')
                coeffs = row[-features_cnt:]

                # Keep only rows that carry bets on the main match outcomes.
                if len(players) != 2:
                    continue
                if len(headers) != len(coeffs):
                    continue

                player_1, player_2 = (name.strip() for name in players)
                values = [player_1, player_2] + coeffs

                records.append(dict(zip(block_columns, values)))
                for name in block_columns:
                    if name not in columns:
                        columns.append(name)

            except Exception as exc:
                # Best effort: report rows with an unexpected layout and go on.
                print(exc)

    if columns is None:
        return pd.DataFrame()
    return pd.DataFrame(records, columns=columns)

In [44]:
def xstavka_table(root=None):
    """Build a DataFrame of players and coefficients from the 1xstavka table.

    Args:
        root: Selenium element holding the table. When omitted, the
            notebook-level ``xstavka_root`` is used, which keeps existing
            zero-argument calls working.

    Returns:
        A DataFrame with columns ``'Player 1'``, ``'Player 2'`` followed by
        the de-duplicated headers of the first block. A later block with
        different headers adds its extra columns at the end. If the table
        has no blocks, an empty DataFrame without columns is returned.

    Rows are collected in a list and the frame is built once at the end.
    Growing it with ``DataFrame.append`` is quadratic, and that method is no
    longer available in pandas 2.x.
    """
    if root is None:
        root = xstavka_root

    parsed = selenium_to_dict(root)
    blocks = parsed['children'][0]['children']

    columns = None
    records = []

    for block in blocks:
        text = simplify_list(extract_text(block))

        headers = get_unique_headers(text[1][-1])
        block_columns = ['Player 1', 'Player 2'] + headers

        if columns is None:
            # The first block fixes the base column order.
            columns = list(block_columns)

        for row in text[2:]:
            try:
                players_tmp = row[1][1][1][2]
                player_1_tmp, player_2_tmp = players_tmp[1][1], players_tmp[2][1]
                player_1, player_2 = player_1_tmp.strip(), player_2_tmp.strip()

                coeffs = row[1][-1]

                if len(headers) != len(coeffs):
                    continue

                values = [player_1, player_2] + coeffs
            except (IndexError, KeyError, TypeError, AttributeError):
                # Row does not follow the expected layout: skip it.
                # TODO (from the original author): this handling needs to be
                # refined.
                continue

            records.append(dict(zip(block_columns, values)))
            for name in block_columns:
                if name not in columns:
                    columns.append(name)

    if columns is None:
        return pd.DataFrame()
    return pd.DataFrame(records, columns=columns)

In [46]:
# Poll both live pages 50 times, rebuilding and redisplaying the joined table.
for _ in range(50):
    # Look the table roots up again on every pass (presumably because the
    # live pages re-render and old element handles go stale).
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
    # newer Selenium releases no longer provide.
    fonbet_root = WebDriverWait(fonbet, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="table__flex-container"]'))
        ).find_elements(By.XPATH, './*')[0]

    xstavka_root = WebDriverWait(xstavka, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="game_content_line on_main live-content "]/div/div/div/div'))
        )

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    df_fonbet = fonbet_table()
    df_xstavka = xstavka_table()

    # Keep the two player columns plus the first six coefficient columns.
    df_fonbet_short = df_fonbet.iloc[:, range(8)]
    df_xstavka_short = df_xstavka.iloc[:, range(8)]

    # Matches are paired by the first player's name only.
    df_joined = df_fonbet_short.merge(df_xstavka_short,  on='Player 1')

    # '1_x' is the fonbet column '1' (merge suffix of the left frame);
    # '2X' exists only in the 1xstavka frame, so it keeps its plain name.
    first, second = '1_x', '2X'
    df_joined.loc[:, f'Fork: {first} - {second}'] = check_event(first, second)
    display(df_joined)
    print(time.perf_counter() - start_time)
    # wait=True defers clearing until the next output arrives, avoiding flicker.
    clear_output(wait=True)

Unnamed: 0,Player 1,Player 2_x,1_x,X_x,2_x,1X_x,12_x,X2,Player 2_y,1_y,X_y,2_y,1X_y,12_y,2X,Fork: 1_x - 2X
0,Arsenal Tula,Akhmat,2.5,3.15,3.05,1.38,1.35,1.53,Akhmat,2.632,3.14,2.91,1.45,1.4,1.525,-5.6%
1,Ordabasy,Shakhter Karaganda,1.55,3.4,8.5,1.05,1.3,2.35,Shakhter Karagandy,1.53,3.32,8.68,1.05,1.305,2.416,-5.9%
2,Academica Clinceni,Sepsi OSK Sfantu Gheorghe,3.25,2.7,2.55,1.48,1.45,1.32,ACS Sepsi OSK Sfantul Gheorghe,3.47,2.65,2.55,1.504,1.475,1.304,-7.5%


1.4053239822387695
