In [1]:
import requests
import re
import numpy as np
import pandas as pd
import time

# URL of the HLTV match page; the script below downloads it with get_request(link)
# and creature_df() stores it in the 'link' column of every generated row.
link = 'https://www.hltv.org/matches/2346801/skade-vs-mibr-snow-sweet-snow-2'

def get_request(link, timeout=30):
    """Download a page and return its body as text.

    The request is sent with a fixed set of browser-like headers.

    Args:
        link: URL to fetch.
        timeout: Seconds ``requests`` waits for the server before raising a
            timeout error. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        The response body decoded to ``str`` (``Response.text``).
    """
    headers_initial = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'no-cache',
        'upgrade-insecure-requests': '1',
    }
    return requests.get(link, headers=headers_initial, timeout=timeout).text

def get_team_names(request):
    """Return the first two team names found in the match page HTML.

    Args:
        request: HTML text of the match page.

    Returns:
        A ``(first_name, second_name)`` tuple taken from the first two
        elements whose class attribute is ``teamName``.

    Raises:
        IndexError: if fewer than two team names are present.
    """
    name_regex = re.compile(r"class=\"teamName\">([\w\s\.-]+)<")
    found_names = name_regex.findall(request)
    first_name = found_names[0]
    second_name = found_names[1]
    return first_name, second_name
    
def get_team_pages(request):
    """Download the pages of both teams linked from the match page.

    Args:
        request: HTML text of the match page.

    Returns:
        A ``(team1_page, team2_page)`` tuple with the HTML text of each page.

    Raises:
        IndexError: if either team link is missing from the match page.
    """
    site_root = 'https://www.hltv.org'
    href_patterns = (
        r"<div class=\"team1-gradient\"><a href=\"([\w/-]+)\"",
        r"<div class=\"team2-gradient\"><a href=\"([\w/-]+)\"",
    )
    # Resolve both links first so a missing link fails before any download starts.
    team_urls = [site_root + re.findall(pattern, request)[0] for pattern in href_patterns]
    return get_request(team_urls[0]), get_request(team_urls[1])

def get_rank(team_page):
    """Extract the team's ranking from its team page.

    Args:
        team_page: HTML text of the team page.

    Returns:
        The text following ``#`` in the first ``/ranking/teams`` link, as a
        string (callers convert it to ``int``).

    Raises:
        IndexError: if the page contains no ranking link.
    """
    rank_matches = re.findall(r"<a href=\"/ranking/teams\">#([\w-]+)<", team_page)
    return rank_matches[0]

def get_analytics_page(request):
    """Download the analytics page linked from the match page.

    Args:
        request: HTML text of the match page.

    Returns:
        HTML text of the analytics page.

    Raises:
        IndexError: if the match page has no analytics-center link.
    """
    # The href capture is limited to non-quote characters. A greedy ``.*``
    # could begin at an earlier ``<a href="`` on the same line and swallow
    # everything up to the analytics anchor, producing a bogus URL.
    pattern_analytics_hltv = r"<a href=\"([^\"]*)\" class=\"matchpage-analytics-center-container\">"
    analytics_hltv_link = 'https://www.hltv.org' + re.findall(pattern_analytics_hltv, request)[0]
    return get_request(analytics_hltv_link)

def get_players_forms(analytics_page):
    """Cut the head-to-head player sections of both teams out of the analytics page.

    Args:
        analytics_page: HTML text of the analytics page.

    Returns:
        A ``(team1_section, team2_section)`` tuple of HTML fragments. Each
        fragment runs from the team's ``analytics-head-to-head-container``
        opening tag to the next ``analytics-last-matches`` block.

    Raises:
        IndexError: if either section is missing.
    """
    section_patterns = (
        r"<div class=\"analytics-head-to-head-container team1\">([\s\S]*?)<div class=\"analytics-last-matches\">",
        r"<div class=\"analytics-head-to-head-container team2\">([\s\S]*?)<div class=\"analytics-last-matches\">",
    )
    return tuple(re.findall(pattern, analytics_page)[0] for pattern in section_patterns)

def get_form_latest(players_form):
    """Collect the 3-month form values from a team's player section.

    Args:
        players_form: HTML fragment with one team's player table.

    Returns:
        A list of floats, one per ``table-3-months`` cell, in document order.
        The list is empty when no such cell exists.

    Raises:
        ValueError: if a cell's content cannot be parsed as a float.
    """
    cell_regex = r"<td class=\"table-3-months[\s\w]*?\">(.*)</td>"
    return [float(cell_text) for cell_text in re.findall(cell_regex, players_form)]

def get_form_event(players_form):
    """Collect the current-event form values from a team's player section.

    If the first ``table-event`` cell contains ``-``, the 3-month values from
    ``get_form_latest`` are returned instead.

    Args:
        players_form: HTML fragment with one team's player table.

    Returns:
        A list of floats, one per cell.

    Raises:
        IndexError: if the fragment has no ``table-event`` cell.
        ValueError: if a cell's content cannot be parsed as a float.
    """
    event_cells = re.findall(r"<td class=\"table-event[\s\w]*?\"[\s\S]*?>(.*)</td>", players_form)
    if event_cells[0] != '-':
        return [float(cell_text) for cell_text in event_cells]
    # No event data: use the 3-month values as a substitute.
    return get_form_latest(players_form)

def get_form_statistics(list_form):
    """Compute mean, range and sample standard deviation of form values.

    Args:
        list_form: Non-empty sequence of numbers.

    Returns:
        A ``(avg_form, form_range, sd_form)`` tuple: the arithmetic mean,
        ``max - min``, and the sample standard deviation (divisor ``n - 1``).
        For a single value the standard deviation is reported as ``0.0``,
        because the ``n - 1`` divisor would otherwise be zero.

    Raises:
        ValueError: if ``list_form`` is empty.
    """
    count = len(list_form)
    if count == 0:
        raise ValueError("list_form must contain at least one value")

    avg_form = sum(list_form) / count
    form_range = max(list_form) - min(list_form)

    # One observation has no spread; avoid dividing by (count - 1) == 0.
    if count == 1:
        return avg_form, form_range, 0.0

    diffs = 0
    for n in list_form:
        diffs += (n - avg_form) ** 2
    sd_form = (diffs / (count - 1)) ** 0.5
    return avg_form, form_range, sd_form

def get_avg_rounds(analytics_page):
    """Extract the first two handicap map-data tables from the analytics page.

    Args:
        analytics_page: HTML text of the analytics page.

    Returns:
        A tuple with the inner HTML of the first and second
        ``analytics-handicap-map-data-table-container`` blocks, each ending
        just before its ``</table>``.

    Raises:
        IndexError: if fewer than two such blocks exist.
    """
    table_regex = r"<div class=\"analytics-handicap-map-data-table-container\">([\s\S]*?)</table>"
    tables = re.findall(table_regex, analytics_page)
    first_table, second_table = tables[0], tables[1]
    return first_table, second_table

def get_avg_rounds_maps(avg_rounds):
    """Map each map name in a handicap table to its average-round cells.

    Args:
        avg_rounds: HTML fragment of one handicap table (as returned by
            ``get_avg_rounds``).

    Returns:
        A dict ``{map_name: [cell_text, ...]}``. Values are the raw strings
        of the row's ``analytics-handicap-map-data-avg`` cells. Rows without
        a ``mapname`` element are skipped.
    """
    name_regex = r"<div class=\"mapname\">([\s\S]*?)</div>"
    avg_regex = r"<td class=\"analytics-handicap-map-data-avg\">([\s\S]*?)</td>"
    rounds_by_map = {}
    for row_html in re.findall(r"<tr>([\s\S]*?)</tr>", avg_rounds):
        names_in_row = re.findall(name_regex, row_html)
        if not names_in_row:
            continue
        rounds_by_map[names_in_row[0]] = re.findall(avg_regex, row_html)
    return rounds_by_map

def score_page(maps_name, team_page):
    """Download the team's statistics page for one map.

    The first ``/stats/teams/map/...`` "moreButton" link on the team page is
    used as a template: every two-digit ``/NN/`` path segment in it is
    replaced with the code of the requested map, and ``amp;`` is stripped
    from the resulting URL.

    Args:
        maps_name: Lower-case map name; must be a key of the code table below.
        team_page: HTML text of the team page.

    Returns:
        HTML text of the per-map statistics page.

    Raises:
        IndexError: if the team page has no matching stats link.
        KeyError: if ``maps_name`` is not a known map.
    """
    map_codes = {'dust2':'/31/', 'mirage':'/32/', 'inferno':'/33/', 'nuke':'/34/', 'train':'/35/', 'overpass':'/40/', 'vertigo':'/46/'}
    stats_hrefs = re.findall(r"<a href=\"(/stats/teams/map/.*)\" class=\"moreButton\"", team_page)
    template_url = 'https://www.hltv.org' + stats_hrefs[0]
    map_url = re.sub(r"(/\d\d/)", map_codes[maps_name], template_url).replace("amp;", "")
    return get_request(map_url)

def win_percent_map(score_page):
    """Read the "Win percent" value from a per-map statistics page.

    Args:
        score_page: HTML text of the per-map statistics page.

    Returns:
        The percentage as a float with the ``%`` sign removed, or ``0.0``
        when the page has no "Win percent" row.
    """
    found = re.findall(
        r"<div class=\"stats-row\"><span class=\"strong\">Win percent</span><span class=\"\w*\">([\w\.%]*)<",
        score_page,
    )
    raw_value = found[0].replace('%','') if found else 0
    return float(raw_value)

def win_pistol_rounds(score_page):
    """Read the "Pistol round win percent" value from a per-map statistics page.

    Args:
        score_page: HTML text of the per-map statistics page.

    Returns:
        The percentage as a float with the ``%`` sign removed, or ``0.0``
        when the page has no such row.
    """
    found = re.findall(
        r"<div class=\"stats-row\"><span class=\"strong\">Pistol round win percent</span><span class=\"\w*\">([\w\.%]*)<",
        score_page,
    )
    raw_value = found[0].replace('%','') if found else 0
    return float(raw_value)

def _first_int(matches):
    """Return the first regex capture as an int, or 0 if it is absent or not numeric."""
    if matches and matches[0].isdecimal():
        return int(matches[0])
    return 0


def distribution_of_won_rounds(score_page):
    """Compute round-win percentages from a per-map statistics page.

    Args:
        score_page: HTML text of the per-map statistics page.

    Returns:
        A list ``[win_round_percent, win_ct, win_t]``:

        * ``win_round_percent`` is "Rounds won" / "Total rounds played" * 100,
          rounded to 1 decimal.
        * ``win_ct`` and ``win_t`` are the shares (in percent, rounded to 2
          decimals) of the first and second values of the embedded chart
          data; the first entry is the one coloured ``2C6EA4``.

        All three are ``0`` when no rounds were played. ``win_ct`` and
        ``win_t`` are ``0`` when the chart data is missing, non-numeric, or
        sums to zero; previously these cases raised.
    """
    pattern_total_rounds_played = r"<div class=\"stats-row\"><span class=\"strong\">Total rounds played</span><span>(\w*)<"
    pattern_rounds_won = r"<div class=\"stats-row\"><span class=\"strong\">Rounds won</span><span>(\w+)<"
    pattern_distribution_of_won_rounds = r"data&quot;:\[\{&quot;label&quot\;\:&quot;\w*&quot;,&quot;value&quot;:&quot;(\w*)&quot;,&quot;color&quot;:&quot;2C6EA4&quot;\},\{&quot;label&quot;:&quot;\w*&quot;,&quot;value&quot;:&quot;(\w*)&quot;"

    total_rounds_played = _first_int(re.findall(pattern_total_rounds_played, score_page))
    if total_rounds_played == 0:
        return [0, 0, 0]

    rounds_won = _first_int(re.findall(pattern_rounds_won, score_page))
    win_round_percent = round(rounds_won / total_rounds_played * 100, 1)

    win_ct = 0
    win_t = 0
    side_matches = re.findall(pattern_distribution_of_won_rounds, score_page)
    if side_matches and all(value.isdecimal() for value in side_matches[0]):
        first_side = int(side_matches[0][0])
        second_side = int(side_matches[0][1])
        both_sides = first_side + second_side
        # Guard against a chart that reports zero rounds on both sides.
        if both_sides != 0:
            win_ct = round(first_side / both_sides * 100, 2)
            win_t = round(second_side / both_sides * 100, 2)
    return [win_round_percent, win_ct, win_t]

def creature_info(map_name, team_page):
    """Build the per-map statistics entry for one team.

    Args:
        map_name: Map name; it is lower-cased before being passed to
            ``score_page`` and used unchanged as the dictionary key.
        team_page: HTML text of the team page.

    Returns:
        A one-item dict ``{map_name: [win_round_percent, win_ct, win_t,
        win_percent_map, win_pistol_rounds]}``.
    """
    map_stats_page = score_page(map_name.lower(), team_page)
    stats = distribution_of_won_rounds(map_stats_page) + [
        win_percent_map(map_stats_page),
        win_pistol_rounds(map_stats_page),
    ]
    return {map_name: stats}
    
def creature_df(map_name):
    """Build a one-row DataFrame with the features for ``map_name``.

    Reads the following module-level globals: ``link``, ``team{1,2}_rank``,
    the ``team{1,2}_form_{avg,range,sd}_{latest,event}`` statistics,
    ``team{1,2}_map_info`` and ``team{1,2}_avg_rounds``.

    Args:
        map_name: Key into the ``*_map_info`` and ``*_avg_rounds`` dicts.

    Returns:
        A ``pandas.DataFrame`` with a single row. The ``rounds`` and ``win``
        columns are set to 0 here.

    Raises:
        KeyError: if ``map_name`` is missing from any of the per-map dicts.
    """
    def as_number(cell_text):
        # A '-' in the cell text is replaced with '0' before conversion.
        return float(cell_text.replace('-', '0'))

    t1_info = team1_map_info[map_name]
    t1_rounds = team1_avg_rounds[map_name]
    t2_info = team2_map_info[map_name]
    t2_rounds = team2_avg_rounds[map_name]

    row = {'link': [link], 'rounds': [0], 'win': [0]}
    row.update({
        'team1_rank': [team1_rank],
        'team1_form_avg_latest': [team1_form_avg_latest],
        'team1_form_range_latest': [team1_form_range_latest],
        'team1_form_sd_latest': [team1_form_sd_latest],
        'team1_form_avg_event': [team1_form_avg_event],
        'team1_form_range_event': [team1_form_range_event],
        'team1_form_sd_event': [team1_form_sd_event],
        'team1_win_round_percent': t1_info[0],
        'team1_win_ct': t1_info[1],
        'team1_win_t': t1_info[2],
        'team1_win_percent_map': t1_info[3],
        'team1_win_pistol_rounds': t1_info[4],
        'team1_lost_in_wins': [as_number(t1_rounds[0])],
        'team1_won_in_losses': [as_number(t1_rounds[1])],
    })
    row.update({
        'team2_rank': [team2_rank],
        'team2_form_avg_latest': [team2_form_avg_latest],
        'team2_form_range_latest': [team2_form_range_latest],
        'team2_form_sd_latest': [team2_form_sd_latest],
        'team2_form_avg_event': [team2_form_avg_event],
        'team2_form_range_event': [team2_form_range_event],
        'team2_form_sd_event': [team2_form_sd_event],
        'team2_win_round_percent': t2_info[0],
        'team2_win_ct': t2_info[1],
        'team2_win_t': t2_info[2],
        'team2_win_percent_map': t2_info[3],
        'team2_win_pistol_rounds': t2_info[4],
        'team2_lost_in_wins': [as_number(t2_rounds[0])],
        'team2_won_in_losses': [as_number(t2_rounds[1])],
    })
    return pd.DataFrame(row)

def rewrite_csv(map_name):
    """Append the current match's row to ``<map_name>.csv``.

    The existing file is read, the row built by ``creature_df(map_name)`` is
    concatenated to it, and the file is rewritten without the index column.
    If the file does not exist yet (first run), it is created from the new
    row alone instead of failing with ``FileNotFoundError``.

    Args:
        map_name: Map name; also the CSV file's base name.
    """
    csv_path = map_name+'.csv'
    try:
        history = pd.read_csv(csv_path)
    except FileNotFoundError:
        history = None
    new_row = creature_df(map_name)
    combined = new_row if history is None else pd.concat([history, new_row])
    combined.to_csv(csv_path, index=False)

# Download the match page and both team pages.
# The names assigned below are module-level globals; creature_df() reads
# several of them (ranks, form statistics, average rounds) by name.
request = get_request(link)
team1_page, team2_page = get_team_pages(request)

# Team names
team1, team2 = get_team_names(request)

# Team rankings (get_rank returns the rank as a string)
team1_rank = int(get_rank(team1_page))
team2_rank = int(get_rank(team2_page))

# Download the analytics page and cut out each team's player form section
analytics_page = get_analytics_page(request)
players_form1, players_form2 = get_players_forms(analytics_page)

# Statistics (mean, range, standard deviation) of each team over the last 3 months
team1_form_latest = get_form_latest(players_form1)
team2_form_latest = get_form_latest(players_form2)
team1_form_avg_latest, team1_form_range_latest, team1_form_sd_latest = get_form_statistics(team1_form_latest)
team2_form_avg_latest, team2_form_range_latest, team2_form_sd_latest = get_form_statistics(team2_form_latest)

# Statistics (mean, range, standard deviation) of each team at the current event
team1_form_event = get_form_event(players_form1)
team2_form_event = get_form_event(players_form2)
team1_form_avg_event, team1_form_range_event, team1_form_sd_event = get_form_statistics(team1_form_event)
team2_form_avg_event, team2_form_range_event, team2_form_sd_event = get_form_statistics(team2_form_event)

# Per-map average-round statistics for each team
avg_rounds1, avg_rounds2 = get_avg_rounds(analytics_page)
team1_avg_rounds = get_avg_rounds_maps(avg_rounds1)
team2_avg_rounds = get_avg_rounds_maps(avg_rounds2)
    
# Reached only if every step above succeeded (the message means "All OK").
print('Все ок')

Все ок


In [2]:
# For every map: scrape both teams' per-map statistics and append one row to
# the map's CSV file.
for i in ['Dust2', 'Mirage', 'Inferno', 'Nuke', 'Train', 'Overpass', 'Vertigo']:
    # These two globals are read by creature_df(), which rewrite_csv() calls.
    team1_map_info = creature_info(i, team1_page)
    team2_map_info = creature_info(i, team2_page)
    rewrite_csv(i)
    # Read the file back; df is only used by the optional debug print below.
    df = pd.read_csv(i + '.csv')
    #print(df)
    # Pause 100 seconds between maps.
    # NOTE(review): presumably a rate limit for the scraped site — confirm.
    time.sleep(100)
    print(i, '- готово')
print('Парсинг завершен!')

Dust2 - готово
Mirage - готово
Inferno - готово
Nuke - готово
Train - готово
Overpass - готово
Vertigo - готово
Парсинг завершен!
