In [7]:
import requests
import urllib.parse
import xml.etree.ElementTree as ET
import pandas as pd
import os
import glob
import re

In [8]:
def process_tournament_statistics(tournament_no):
    # Define the API endpoint
    endpoint = "https://www.fivb.org/vis2009/XmlRequest.asmx"

    # Step i: Get a list of all matches in the tournament
    GetMatchRequest = f"""
    <Request Type="GetBeachMatchList" Fields="NoInTournament LocalDate LocalTime TeamAName TeamBName Court MatchPointsA MatchPointsB PointsTeamASet1 PointsTeamBSet1 PointsTeamASet2 PointsTeamBSet2 PointsTeamASet3 PointsTeamBSet3 DurationSet1 DurationSet2 DurationSet3">
        <Filter NoTournament="{tournament_no}" InMainDraw="true"/>
    </Request>
    """
    encoded_request = urllib.parse.quote(GetMatchRequest)
    url = f"{endpoint}?Request={encoded_request}"
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to fetch matches. HTTP Status Code: {response.status_code}")
        return None

    root = ET.fromstring(response.text)
    matches = []
    column_names = []
    first_match = root.find(".//BeachMatch")
    if first_match is not None:
        column_names = list(first_match.attrib.keys())
        for match in root.findall(".//BeachMatch"):
            row = {col: match.attrib.get(col, "") for col in column_names}
            matches.append(row)
    else:
        print("No matches found in the tournament.")
        return None

    matches_df = pd.DataFrame(matches)

    # Step ii: Get statistics for all matches
    match_statistics = []
    for match_no in matches_df["No"].unique():
        MatchStatisticRequest = f"""
        <Requests>
            <Request Type='GetBeachStatisticList' Fields='ItemType NoItem NoSet SpikeFault SpikePoint ServeFault ServePoint ServeTotal BlockPoint BlockTotal DigTotal NoMatch PointTotal ReceptionFault SpikeTotal TeamFault'>
                <Filter Type='VolleyStatisticFilter' NoMatches='{match_no}' />
            </Request>
        </Requests>
        """
        encoded_request = urllib.parse.quote(MatchStatisticRequest)
        url = f"{endpoint}?Request={encoded_request}"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to fetch statistics for match {match_no}. HTTP Status Code: {response.status_code}")
            continue

        root = ET.fromstring(response.text)
        first_stat = root.find(".//VolleyStatistic")
        if first_stat is not None:
            stat_columns = list(first_stat.attrib.keys())
            for stat in root.findall(".//VolleyStatistic"):
                row = {col: stat.attrib.get(col, "") for col in stat_columns}
                match_statistics.append(row)

    statistics_df = pd.DataFrame(match_statistics)

    # Step iii: Retrieve a list of all players and teams
    unique_items = statistics_df["NoItem"].unique()
    players_teams = []
    for item_no in unique_items:
        GetPlayerRequest = f"""
        <Request Type="GetPlayer" No="{item_no}" Fields="FederationCode FirstName Gender LastName Nationality PlaysBeach PlaysVolley TeamName" />
        """
        encoded_request = urllib.parse.quote(GetPlayerRequest)
        url = f"{endpoint}?Request={encoded_request}"
        response = requests.get(url)

        if response.status_code == 200:
            root = ET.fromstring(response.text)
            if root.attrib:
                players_teams.append(root.attrib)
        else:
            GetTeamRequest = f"""
            <Request Type="GetBeachTeam" No="{item_no}" Fields="NoPlayer1 NoPlayer2 Name Rank EarnedPointsTeam" />
            """
            encoded_request = urllib.parse.quote(GetTeamRequest)
            url = f"{endpoint}?Request={encoded_request}"
            response = requests.get(url)

            if response.status_code == 200:
                root = ET.fromstring(response.text)
                if root.attrib:
                    players_teams.append(root.attrib)

    players_teams_df = pd.DataFrame(players_teams)

    # Step iv: Match the player/team dataframe to the statistics dataframe
    merged_df = statistics_df.merge(players_teams_df, left_on="NoItem", right_on="No", how="left")

    #return [merged_df, matches_df]
    return merged_df

In [9]:
tournament_statistics = process_tournament_statistics(7642)
print(tournament_statistics)
#im tournament 7497 sind daten enthalten
# if tournament_statistics is not None:
#     # Write the DataFrame to a CSV file
#     tournament_statistics.to_csv("tournament_statistics.csv", index=False, sep="\t", quoting=1)  # quoting=1 ensures fields are enclosed in quotes
#     print("Tournament statistics saved to 'tournament_statistics.csv'.")
# else:
#     print("No tournament statistics to save.")

    ItemType   NoItem NoSet SpikeFault SpikePoint ServeFault ServePoint  \
0         30   147073                5         17          0          1   
1         30   139087                3         10          1          2   
2         30   151097                4         15          1          1   
3         30   162679                0          6          3          1   
4         11  3137838                8         27          1          3   
..       ...      ...   ...        ...        ...        ...        ...   
319       30   139087                4         20          2          1   
320       30   124979                5         12          1          3   
321       30   141868                3         25          2          1   
322       11  3137838               10         40          3          1   
323       11  3137841                8         37          3          4   

    ServeTotal BlockPoint BlockTotal  ... PlaysBeach PlaysVolley  \
0           19          0      

In [6]:
tournament_statistics[0]

Unnamed: 0,ItemType,NoItem,NoSet,SpikeFault,SpikePoint,ServeFault,ServePoint,ServeTotal,BlockPoint,BlockTotal,...,PlaysBeach,PlaysVolley,TeamName,No_y,Version,NoPlayer1,NoPlayer2,Name,Rank,EarnedPointsTeam
0,30,139087,,2,10,3,3,28,0,0,...,1,0,Duda,139087,1478422,,,,,
1,30,141983,,5,18,3,4,27,1,4,...,1,0,Tina,141983,1481315,,,,,
2,30,141984,,4,14,6,0,21,0,1,...,1,1,Anastasija,141984,1480638,,,,,
3,30,147073,,6,20,1,0,25,2,9,...,1,0,Ana Patrícia,147073,1477837,,,,,
4,11,1004185,,0,0,0,0,0,0,0,...,,,,1004185,3399535,147073,139087,Ana Patrícia/Duda,1,1200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,30,139087,,6,29,3,3,34,0,0,...,1,0,Duda,139087,1478422,,,,,
212,30,104079,,5,16,3,0,24,3,8,...,1,0,Carol,104079,1478401,,,,,
213,30,147073,,3,16,2,1,30,3,12,...,1,0,Ana Patrícia,147073,1477837,,,,,
214,11,1004184,,0,0,0,0,0,0,0,...,,,,1004184,3399516,104079,104505,Carol/Barbara,2,1100


In [11]:
tournament_statistics[1]

Unnamed: 0,NoInTournament,LocalDate,LocalTime,TeamAName,TeamBName,Court,MatchPointsA,MatchPointsB,PointsTeamASet1,PointsTeamBSet1,PointsTeamASet2,PointsTeamBSet2,PointsTeamASet3,PointsTeamBSet3,DurationSet1,DurationSet2,DurationSet3,No,Version
0,1,2023-11-23,10:00:00,Ana Patrícia/Duda,Tina/Anastasija,CC,2,1,21,19,16,21,15.0,9.0,1078,1084,628.0,413083,45208897
1,2,2023-11-23,10:00:00,Anouk/Mäder,Talita/Thamela,2,0,2,17,21,19,21,,,1177,1042,,413084,45208898
2,3,2023-11-23,14:00:00,Nuss/Brasher,Tainá/Victoria,CC,2,0,21,15,21,17,,,972,1024,,413085,45208899
3,4,2023-11-23,14:00:00,Ludwig/Lippmann,Andressa/Vitoria,2,1,2,21,19,18,21,10.0,15.0,995,1244,653.0,413086,45208900
4,5,2023-11-23,15:00:00,Stam/Schoon,Agatha/Rebecca,CC,2,1,12,21,21,18,16.0,14.0,851,964,872.0,413087,45208901
5,6,2023-11-23,15:00:00,Müller/Tillmann,Bansley/Bukovec,2,2,0,21,18,21,19,,,950,1248,,413088,45208902
6,7,2023-11-23,09:00:00,Xue/X. Y. Xia,Sliwka/Wachowicz,2,2,0,21,13,21,13,,,943,972,,413089,45208903
7,8,2023-11-23,09:00:00,Carol/Barbara,Placette/Richard,CC,1,2,21,15,18,21,16.0,18.0,1122,1387,1114.0,413090,45208904
8,9,2023-11-23,18:00:00,Ana Patrícia/Duda,Anouk/Mäder,CC,2,1,21,16,16,21,15.0,10.0,965,1050,741.0,413091,45208905
9,10,2023-11-23,18:00:00,Tina/Anastasija,Talita/Thamela,2,2,1,21,17,18,21,15.0,13.0,1010,1189,820.0,413092,45208906


## Informationen zum Tournament

In [3]:
import os
os.chdir('C:/Users/Katharina/Desktop/Weiterbildung/Bootcamp/Bootcamp/Final_project/data')
print(os.getcwd())

C:\Users\Katharina\Desktop\Weiterbildung\Bootcamp\Bootcamp\Final_project\data


### Part1 Tournament - Aufbereitung

In [5]:
Df_tourn1 = pd.read_csv('Tournaments2024_all.csv', sep=';')
Df_tourn2 = pd.read_csv('Tournaments2024_all_part2.csv', sep=';')

DF_tournament_all = pd.concat([Df_tourn1, Df_tourn2])
cols = ['Name','DefaultCity', 'No','Title', 'StartDate', 'StartDateMainDraw',  'EndDateMainDraw', 'CountryName']
DF_tournament_short = DF_tournament_all.loc[:,cols]

In [6]:
#Datei bearbeiten, sodass nur Future, Challenger und Elite enthalten sind
#Der String darf die folgenden Wort enthalten
words = ['Challenge', 'Future', 'Elite', 'BPT']
regex_pattern = '|'.join(words)

df_clean = DF_tournament_short[DF_tournament_short['Title'].str.contains(regex_pattern, case=False, na=False)]
#neu nur BPT -> 2 tournaments


### Part2

In [4]:
tourn1 = pd.read_csv('test_tournament6000_6400.csv', sep=';')
tourn2 = pd.read_csv('test_tournament6400_6900.csv', sep=';')
tourn3 = pd.read_csv('test_tournament6900_7481.csv', sep=';')

tourn6000_7400 = pd.concat([tourn1, tourn2, tourn3], )

In [None]:
words = ['Challenge', 'Future', 'Elite']
regex_pattern = '|'.join(words)
df_clean3 = tourn6000_7400[~tourn6000_7400['Title'].str.contains(regex_pattern, case=False, na=False)]
# cols2 = ['Name','DefaultCity', 'No','Title', 'StartDate', 'StartDateMainDraw',  'EndDateMainDraw']
# df_clean33 = df_clean3.loc[:,cols2]

In [80]:
#anpassen, welche Worte im Namen nicht vorkommen dürfen
#Title
# words2 = ['Etapa', 'AVC', 'Asian', 'National Tour', 'cancelled', 'QTS', 'Timmendorfer Strand', 'Flic en Flac', 'King of the Court', 'Russian', 'Norceca',
# 'National', 'EEVZA', 'Commonwealth', 'Swiss', 'Cavb', 'Hungarian', 'Russia', 'Nacional', 'Geberit', 'Zonal', 'U19', 'France', 'Sudamericano', 'Zonal', 'Pacific', 'Estonian',
# 'OPEN','Italian', 'LIDL', 'African', 'U23', 'NEVZA', 'U18', 'U20', 'Austrian', 'German', 'TVF', 'Summer', 'Hellenic', 'Trinbago', 'Rock', 'Polish', 'Australian', 'U21', 'U22', 'yuh', 'A1',
# 'Japanese', 'Asia', 'CAVA', 'New', 'Senior', 'Australia', 'Military', 'Nations', 'JPN', 'Rannavolle', 'Betcity', 'Zone 5', 'Cyprus', 'Thompsons', 'Mollymook', '1st', '2nd', 'CSVP', 'FISU',
# 'Bolivarianos', 'Heraklion', 'Argentino', 'Test', 'Queen', 'Caribbean', 'Suramericanos']
# regex_patt2 = '|'.join(words2)
# clean4 = df_clean3[~df_clean33['Title'].str.contains(regex_patt2, case=False, na=False)]

filter1 = [0,1,2,3,4,5,6,7,8,10,33,39,40,41,42,]
#clean44 = clean4.loc[clean4['Type'].isin(filter1)]
clean45 = df_clean3.loc[df_clean3['Type'].isin(filter1)]

In [None]:
# no1 = df_clean2['No']
# tourn_numbers =no1[50:]
# tourn_numbers
#tourn_numbers = [6355,6845]
# tourn_numbers = df_clean['No']
#tourn_numbers = [7575,7576]


### Part3

In [75]:
tourn4 = pd.read_csv('test_tournament4000_5000.csv', sep=';')
tourn5 = pd.read_csv('test_tournament5000_6000.csv', sep=';')
tourn6 = pd.read_csv('test_tournament6000.csv', sep=';')

tourn4000_6000 = pd.concat([tourn4, tourn5, tourn6] )

In [76]:
#Unterscheiden nach Type: 
#https://www.fivb.org/VisSDK/VisWebService/BeachTournamentType.html
#tourn4000_6000
filter1 = [0,1,2,3,4,5,6,7,8,10,33,39,40,41,42,]
clean5 = tourn4000_6000.loc[tourn4000_6000['Type'].isin(filter1)]


als erstes df_clean2 tuniere, dann df_clean, dann clean45, dann clean5

In [None]:
# clean5.to_csv('clean5.csv', sep=';', index=False, quoting=1)
# clean45.to_csv('clean45.csv', sep=';', index=False, quoting=1)
# df_clean.to_csv('df_clean.csv', sep=';', index=False, quoting=1)
# df_clean2.to_csv('df_clean2.csv', sep=';', index=False, quoting=1)
#index=False, sep="\t", quoting=1

In [83]:
# no1 = df_clean2['No']
# tourn_numbers =no1[50:]
# tourn_numbers
#tourn_numbers = [6355,6845]
# tourn_numbers = df_clean['No']
#tourn_numbers = [7575,7576]
#tourn_numbers = clean45['No']
tourn_numbers = clean5['No']

#hier nochmal die fehlenden TUniere versuchen zu scrapen



In [18]:
import os
os.chdir('C:/Users/Katharina/Desktop/Weiterbildung/Bootcamp/Bootcamp/Final_project/data')
MissingT = pd.read_csv('missingTournaments.csv', sep=';')

In [21]:
MissingT1 = MissingT['No'].drop_duplicates()


In [23]:
tourn_numbers = MissingT1

In [24]:
# liste von tuniernummern
#tourn_numbers = []
import time
import random 
stat_list = []

#Schleife über tuniere
for no in tourn_numbers:
    try:
        tournament_statistics_df = process_tournament_statistics(no)
        filename = f"tournament_statistics_{no}.csv"

        tournament_statistics_df.to_csv(filename, index=False, sep="\t", quoting=1) 
        print("Tournament statistics saved to 'tournament_statistics_df.csv'.")

        sleep_time = random.uniform(1,8)
        time.sleep(sleep_time)
    except Exception as e:
        print(f"Fehler bei Tunier-No{no}: {e}")

#metadaten abgespeichert unter df_clean_metadaten_Datum

No matches found in the tournament.
Fehler bei Tunier-No7595: 'NoneType' object has no attribute 'to_csv'
No matches found in the tournament.
Fehler bei Tunier-No7596: 'NoneType' object has no attribute 'to_csv'
Tournament statistics saved to 'tournament_statistics_df.csv'.
Tournament statistics saved to 'tournament_statistics_df.csv'.
Tournament statistics saved to 'tournament_statistics_df.csv'.
No matches found in the tournament.
Fehler bei Tunier-No6873: 'NoneType' object has no attribute 'to_csv'
No matches found in the tournament.
Fehler bei Tunier-No6874: 'NoneType' object has no attribute 'to_csv'
No matches found in the tournament.
Fehler bei Tunier-No6875: 'NoneType' object has no attribute 'to_csv'
No matches found in the tournament.
Fehler bei Tunier-No6876: 'NoneType' object has no attribute 'to_csv'
Tournament statistics saved to 'tournament_statistics_df.csv'.
Tournament statistics saved to 'tournament_statistics_df.csv'.
Tournament statistics saved to 'tournament_statis

prüfen, welche dateien erstellt worden sind und welche fehlen

tuniernummer als neue spalte mithinzunehmen

prüfen von welchen tunieren die Statistik-Daten eingelesen worden sind

In [85]:
# Alle passenden CSV-Dateien im aktuellen Verzeichnis finden
csv_files = glob.glob("tournament_statistics_*.csv")

# Leere Liste für alle DataFrames
df_list = []

for file in csv_files:
    df = pd.read_csv(file, sep='\t')
    
    # Zahl aus dem Dateinamen extrahieren
    match = re.search(r'tournament_statistics_(\d+)\.csv', os.path.basename(file))
    if match:
        df["TournamentNo"] = match.group(1)  # Nur die Zahl
    else:
        df["TournamentNo"] = None  # Falls das Muster nicht passt (zur Sicherheit)

    df_list.append(df)

# Alle DataFrames zu einem großen zusammenfügen
combined_df = pd.concat(df_list, ignore_index=True)

# Ergebnis prüfen
#print(combined_df.head())


In [37]:
#Variable NoSet entfernen
MatchStat = combined_df.drop('NoSet', axis=1)

Datensatz MatchStat mit BeachMatchList_full mergen über die MatchNumber

In [38]:
BeachMatches = pd.read_csv('BeachMatchList_full.csv', sep=';')

In [None]:
Combined_Stat = MatchStat.merge(BeachMatches, left_on = 'NoMatch', right_on='@No', how= 'left')

In [89]:
#angepasst
Tournament_2325 = pd.concat([df_clean, df_clean2, clean45, clean5])

In [None]:
#Combined_Stat.info()#object

#Tournament_2325.info()#int
#TournamentNummer in Tournament-Dataset to object umcodieren
Tournament_2325['No'] = Tournament_2325['No'].astype(str)

#Die Variablen, die benötigt werden rausziehen
Tournament_2325.columns

# cols1 = ['Name','DefaultCity', 'No','Title', 'StartDate', 'StartDateMainDraw',  'EndDateMainDraw', 'Code', 'CountryCode', 'CountryName', 'DefaultTimeZone',
#          'DefaultVenue', 'EndDateQualification', 'Gender', 'NoEvent', 'Season', 'StartDateQualification', 'Status', 'WebSite']
# Tournament_SS_2325 = Tournament_2325.loc[:,cols1]


In [61]:
#Dat_comp = Combined_Stat.merge(Tournament_SS_2325, left_on='TournamentNo', right_on='No', how='left')

Dat_comp = Combined_Stat.merge(Tournament_2325, left_on='TournamentNo', right_on='No', how='left')