# Парсинг данных

In [34]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time
import random
from tqdm import tqdm
from tqdm.notebook import trange, tqdm

In [35]:
# Some links are broken
def remove_broken_urls(arr):
    """Drop player links whose slug segment is empty.

    A link counts as broken when it starts with
    ``https://www.futwiz.com/en/fifa23/player//``, i.e. nothing sits between
    ``player/`` and the next slash.

    Args:
        arr: Array-like of URL strings (a list or a numpy array).

    Returns:
        numpy.ndarray holding only the URLs that do not match the broken-link
        pattern. An empty input yields an empty array.
    """
    arr = np.asarray(arr)
    # Nothing to filter; returning early also avoids building a mask over
    # zero elements.
    if arr.size == 0:
        return arr
    # Dots are escaped so they match a literal "." instead of any character.
    pattern = re.compile(r'^https://www\.futwiz\.com/en/fifa23/player//')
    is_broken = np.array(
        [pattern.match(str(url)) is not None for url in arr.flat],
        dtype=bool,
    ).reshape(arr.shape)
    return arr[~is_broken]

In [36]:
# Parsing all info about the card
failed_hrefs = []

# Seconds to wait for a card page before treating the request as failed.
_REQUEST_TIMEOUT = 30

# Stat labels whose CSS class is not simply the lower-cased label.
_STAT_CLASS_ALIASES = {
    'Long Shots': 'longshot',
    'Interceptions': 'tactaware',
    'Def. Awareness': 'marking',
    'Stand Tackle': 'standingtackle',
}


def _stat_css_class(label):
    """Build the CSS class suffix used to look up the value of stat ``label``."""
    base = _STAT_CLASS_ALIASES.get(label, label)
    css_class = base.lower().replace(" ", "") + 'stat'
    # Strip punctuation such as the dot in "Heading Acc.".
    return re.sub(r'[^\w\s]', '', css_class)


def parsing_card_info(hrefs):
    """Download every card page in ``hrefs`` and extract the player's data.

    Links whose request fails, or whose page has no ``pname-h1`` block, are
    appended to the module-level ``failed_hrefs`` list so that they can be
    retried later. Goalkeeper cards are skipped because they have different
    stats.

    Args:
        hrefs: Iterable of card page URLs.

    Returns:
        List of dicts, one per parsed card, holding the general card info,
        the six headline ratings, the price and every individual stat keyed
        by its on-page label.
    """
    players = []
    for href in tqdm(hrefs):
        try:
            # Without a timeout a single stalled connection blocks forever.
            res = requests.get(href, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException:
            failed_hrefs.append(href)
            continue
        tree = BeautifulSoup(res.content, 'html.parser')

        title = tree.find('div', {'class': 'pname-h1'})
        if title is None:  # Sometimes it breaks, so we will remember the failed links
            failed_hrefs.append(href)
            continue

        pars = [item.div.text.strip() for item in tree.find_all('div', {'class': 'player-info-stat'})]

        # Skip goalkeepers (they have different stats) before parsing the rest
        if pars[8] == 'GK':
            continue

        rats = [item.text.strip() for item in tree.find_all('div', {'class': 'headline-stat-num'})]
        name = title.find('h1').text.split()
        # The price is rendered with thousands separators, e.g. "1,250,000".
        price = int(tree.find('div', {'class': 'price-num'}).text.replace(',', ''))
        cln = tree.find('div', {'class': 'pname-club'}).find_all('a')
        general_rat = int(tree.find('div', {'class': 'card-23-rating'}).text.strip())
        full_stats_labels = [item.text for item in tree.find_all('div', {'class': 'individual-stat-bar-label'})]

        full_stats = {}
        for stat_label in full_stats_labels:
            stat_class = _stat_css_class(stat_label)
            stat_rate = tree.find_all(
                'div', {'class': f'individual-stat-bar-stat textcolour {stat_class}'}
            )[0].text
            full_stats[stat_label] = stat_rate

        stats = {
            # The heading is split around the "FIFA" and "23" tokens:
            # the words before "FIFA" are the name, those after "23" the card type.
            'Name': ' '.join(name[:name.index('FIFA')]),
            'Card Type': ' '.join(name[name.index('23')+1:]),
            'Nation': cln[0].text.strip(),
            'Club': cln[1].text.strip(),
            'League': cln[-1].text.strip(),
            'Skills': pars[0],
            'W/F': pars[1],
            'W/R': pars[2],
            'Foot': pars[3],
            'Age': pars[4],
            'Height': pars[5],
            'Weight': pars[6],
            'Body Type': pars[7],
            'Position': pars[8],
            'Alt. Position': pars[9],
            'Rating': general_rat,
            'PAC': rats[0],
            'SHO': rats[1],
            'PAS': rats[2],
            'DRI': rats[3],
            'DEF': rats[4],
            'PHY': rats[5],
            'Price': price}
        stats.update(full_stats)
        players.append(stats)
    return players

In [43]:
# Get top five league hrefs
# The numbers are the league ids passed in the `leagues[]` query parameter;
# eight listing pages are requested per league.
top_five_league_hrefs = []
for league in tqdm([13, 16, 31, 19, 53]):
    league_hrefs = []
    for page in range(8):
        url = f'https://www.futwiz.com/en/fifa23/players?page={page}&leagues[]={league}'
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, timeout=30)
        tree = BeautifulSoup(res.content, 'html.parser')
        # The first and the last two "col-2" cells are dropped before links are read.
        my_list = tree.find_all('div', {'class': "col-2"})[1:-2]
        hrefs = [f'https://www.futwiz.com{item.a.get("href")}' for item in my_list]
        league_hrefs.extend(hrefs)
    top_five_league_hrefs.extend(league_hrefs)
top_five_league_players = parsing_card_info(remove_broken_urls(np.array(top_five_league_hrefs)))

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [37]:
# Get hrefs for the second group of leagues (two listing pages per league).
# NOTE(review): the variable names are kept as-is because later cells use them.
bitch_league_hrefs = []
for league in tqdm([10, 14, 308, 39, 68, 1003]):
    league_hrefs = []
    for page in range(2):
        url = f'https://www.futwiz.com/en/fifa23/players?page={page}&leagues[]={league}'
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, timeout=30)
        tree = BeautifulSoup(res.content, 'html.parser')
        # The first and the last two "col-2" cells are dropped before links are read.
        my_list = tree.find_all('div', {'class': "col-2"})[1:-2]
        hrefs = [f'https://www.futwiz.com{item.a.get("href")}' for item in my_list]
        league_hrefs.extend(hrefs)
    bitch_league_hrefs.extend(league_hrefs)
bitch_league_players = parsing_card_info(remove_broken_urls(np.array(bitch_league_hrefs)))

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

In [38]:
# Get icons hrefs (ten listing pages of league id 2118)
icons_hrefs = []
for page in trange(10):
    url = f'https://www.futwiz.com/en/fifa23/players?page={page}&leagues[]=2118'
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, timeout=30)
    tree = BeautifulSoup(res.content, 'html.parser')
    # The first and the last two "col-2" cells are dropped before links are read.
    my_list = tree.find_all('div', {'class': "col-2"})[1:-2]
    hrefs = [f'https://www.futwiz.com{item.a.get("href")}' for item in my_list]
    icons_hrefs.extend(hrefs)
icons_players = parsing_card_info(remove_broken_urls(np.array(icons_hrefs)))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

In [39]:
# Get base cards hrefs
# Each value is passed as the `release` query parameter; four listing pages
# are requested per card type.
base_cards_hrefs = []
for type_of_card in tqdm(['raregold', 'commongold', 'nifsilver', 'nifbronze']):
    type_hrefs = []
    for page in range(4):
        url = f'https://www.futwiz.com/en/fifa23/players?page={page}&release={type_of_card}'
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, timeout=30)
        tree = BeautifulSoup(res.content, 'html.parser')
        # The first and the last two "col-2" cells are dropped before links are read.
        my_list = tree.find_all('div', {'class': "col-2"})[1:-2]
        hrefs = [f'https://www.futwiz.com{item.a.get("href")}' for item in my_list]
        type_hrefs.extend(hrefs)
    base_cards_hrefs.extend(type_hrefs)
base_cards_players = parsing_card_info(remove_broken_urls(np.array(base_cards_hrefs)))

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/399 [00:00<?, ?it/s]

In [51]:
# Parsing failed links
# parsing_card_info() appends every link it could not parse to failed_hrefs,
# so each round re-queues whatever is still failing. The number of rounds is
# capped: a link that never parses would otherwise keep this loop running
# forever. Links still failing after the last round stay in failed_hrefs.
failed_players = []
max_retry_rounds = 5
for _ in range(max_retry_rounds):
    if not failed_hrefs:
        break
    pending_hrefs = failed_hrefs.copy()
    # Clear in place: parsing_card_info() writes to this same list object.
    failed_hrefs.clear()
    failed_players += parsing_card_info(remove_broken_urls(np.array(pending_hrefs)))

In [30]:
# Merge the cards collected by every crawl into a single list.
players = [
    *top_five_league_players,
    *bitch_league_players,
    *icons_players,
    *base_cards_players,
    *failed_players,
]

In [56]:
# Build the dataframe and remove cards that were collected more than once.
df = pd.DataFrame(players)
# drop_duplicates() returns a new frame; the result has to be assigned back,
# otherwise the duplicates stay in df.
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,Name,Card Type,Nation,Club,League,Skills,W/F,W/R,Foot,Age,...,Composure,Interceptions,Heading Acc.,Def. Awareness,Stand Tackle,Slide Tackle,Jumping,Stamina,Strength,Aggression
0,Erling Haaland,TOTS,Norway,Manchester City,ENG 1,4,5,H/M,Left,22,...,96,54,98,55,66,36,82,89,99,94
1,Kevin De Bruyne,TOTS,Belgium,Manchester City,ENG 1,4,5,H/H,Right,31,...,97,88,73,90,86,70,71,99,84,85
2,Kevin De Bruyne,TOTY,Belgium,Manchester City,ENG 1,4,5,H/H,Right,31,...,96,85,70,86,82,67,70,97,84,83
3,Mohamed Salah,TOTS,Egypt,Liverpool,ENG 1,5,3,H/M,Left,30,...,99,66,70,46,52,48,76,96,83,70
4,Ruben Dias,TOTS,Portugal,Manchester City,ENG 1,2,4,M/H,Right,25,...,99,94,96,99,98,94,87,92,99,99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789,Magnus Lekven,Non-Inform,Norway,Odds BK,NOR 1,2,4,M/M,Right,34,...,71,61,59,62,60,55,72,75,70,70
1790,Arda Kizildag,Non-Inform,Turkey,Gaziantep,TUR 1,2,3,M/M,Right,24,...,57,63,69,68,68,62,72,65,64,62
1791,Dane Scarlett,Non-Inform,England,Portsmouth,ENG 3,3,4,H/M,Right,18,...,54,13,64,17,16,18,76,51,56,57
1792,Marvin Stefaniak,Non-Inform,Germany,FC Erzgebirge Aue,GER 3,3,4,H/M,Right,27,...,57,27,40,30,25,26,69,67,64,41


In [None]:
# Save the dataframe to GitHub
import os
from getpass import getpass

# Ask for both credentials, then expose them through the process environment.
# getpass() does not echo what is typed.
username = input('Введите ваш логин на GitHub: ')
password = getpass('Введите ваш пароль на GitHub: ')
os.environ['GITHUB_USERNAME'] = username
os.environ['GITHUB_PASSWORD'] = password

In [123]:
import os
# Point GitPython at the git binary to use. The variable is set before
# `from git import Repo` — presumably because GitPython resolves the
# executable when the package is imported; confirm against its docs.
# NOTE(review): the path is specific to a Windows Git installation.
os.environ['GIT_PYTHON_GIT_EXECUTABLE'] = r'C:\Program Files\Git\cmd\git.exe'
from git import Repo

In [124]:
# Write the dataframe to players.csv in the current working directory.
# With the default to_csv settings the index is written as the first column.
df.to_csv('players.csv')

In [None]:
# Commit players.csv on a separate branch and force-push that branch.
path_to_repo = '\\Users\\ASUS\\python_copybooks\\AD_HSE\\project\\FIFA-Analysis\\'
file_name = 'players.csv'
# Keep the CSV in memory so it can be written back after the branch switch.
# Binary mode copies the bytes untouched: text mode would decode with the
# locale encoding and translate newlines, which can fail or alter the file.
with open(file_name, 'rb') as f:
    file_content = f.read()

repo = Repo(path_to_repo)
remote_url = repo.remote().url
new_branch = repo.create_head('new_branch')
new_branch.checkout()
# NOTE(review): the file is opened relative to the current working directory,
# while repo.index.add() takes a path inside the repository — this assumes the
# notebook runs from the repository root; confirm.
with open(file_name, 'wb') as f:
    f.write(file_content)
repo.index.add([file_name])
# The commit message names the file that is actually added.
repo.index.commit(f'Add {file_name}')
repo.remote().push(refspec=f'{new_branch}:{new_branch}', force=True)
# Reset the index and working tree to HEAD after the push.
repo.head.reset(index=True, working_tree=True)
