In [None]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.core.display import HTML
from tqdm.notebook import tqdm

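References:

- Logging in to a website with `requests`: https://stackoverflow.com/questions/11892729/how-to-log-in-to-a-website-using-pythons-requests-module
- pandas crashing on repeated `DataFrame.reset_index`: https://stackoverflow.com/questions/12203901/pandas-crashes-on-repeated-dataframe-reset-index/12204428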

In [None]:
# Root of the site being scraped; tournament hrefs and the '/login' path are appended to it.
# NOTE(review): this is plain http, so the login form is posted unencrypted — confirm whether https works.
base_url = "http://www.squashinfo.com"

In [None]:
def get_tournament_data_from_url(url, timeout=30):
    """
    Scrape one page of the tournament listing.

    Parameters
    ----------
    url : str
        Address of a page that lists tournaments.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the whole scrape.

    Returns
    -------
    list of list
        One ``[tournament_type, name, location, date, tournament_url]`` entry
        per usable table row. Rows with fewer than five cells, or without a
        link in the name cell, are skipped.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains fewer than two tables.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    tables = BeautifulSoup(response.text, 'html.parser').find_all('table')
    # The listing is read from the second table; its first row is skipped
    # (presumably the header row).
    rows = tables[1].find_all('tr')[1:]
    data = []

    for row in rows:
        entries = row.find_all('td')
        if len(entries) < 5:
            # Too few cells to hold the fields read below: not a tournament row.
            continue

        link = entries[1].find('a', href=True)
        if link is None:
            # Without a tournament page there is nothing to download later.
            continue

        tournament_type = entries[0].text
        name = entries[1].text
        tournament_url = base_url + link['href']
        location = entries[3].text
        date = entries[4].text

        data.append([tournament_type, name, location, date, tournament_url])

    return data


def create_frame_of_tournaments(f_type=3, last_page=78, csv_path='tournaments_female.csv'):
    """
    Scrape every listing page and save the tournaments as a CSV file.

    Parameters
    ----------
    f_type : int, optional
        Value of the ``f_type`` query parameter. 3 (default) is the women's
        listing; 1 is the men's.
    last_page : int, optional
        Number of the last listing page to fetch, starting from page 1.
        78 (default) for women; 141 for men.
    csv_path : str, optional
        Where the resulting table is written.

    Returns
    -------
    pandas.DataFrame
        Columns ``index``, ``tournament_type``, ``name``, ``location``,
        ``date``, ``url`` and ``matches_downloaded`` (initialised to False).
    """
    columns = ['tournament_type', 'name', 'location', 'date', 'url']

    # Collect one frame per page and concatenate once: DataFrame.append no
    # longer exists in pandas >= 2.0 and re-copied the whole table every page.
    frames = []
    for i in tqdm(range(1, last_page + 1)):
        url = f"{base_url}/results?f_type={f_type}&start={i}"
        data = get_tournament_data_from_url(url)
        if data:
            frames.append(pd.DataFrame(data, columns=columns))

    if frames:
        df_tournaments = pd.concat(frames)
    else:
        df_tournaments = pd.DataFrame([], columns=columns)

    # reset_index() without drop=True keeps each row's position within its
    # page as an 'index' column, so the CSV layout stays what it was.
    df_tournaments = df_tournaments.reset_index()
    df_tournaments['matches_downloaded'] = False
    df_tournaments.to_csv(csv_path)
    return df_tournaments


def extract_information_from_row(row):
    """
    Classify one table row of a tournament page and extract its content.

    Parameters
    ----------
    row : object with ``find_all('td')``
        A table row (e.g. a BeautifulSoup ``<tr>`` tag).

    Returns
    -------
    tuple
        ``('round', text)`` when the first cell has class ``match_type``;
        ``text`` is that cell's text with surrounding colons stripped
        (final, semi-final, etc.).
        ``('match', [first, second])`` when the first cell has class
        ``indv_col_1``; the list holds the stripped text of the first two cells.
        ``('empty', None)`` for every other row, including rows with no
        ``<td>`` cells or whose first cell has no (or an empty) class.
    """
    cols = row.find_all('td')
    if not cols:
        # Rows without any <td> (e.g. header rows) used to raise IndexError,
        # which made the caller discard the whole tournament.
        return 'empty', None

    first_cell = cols[0]
    try:
        row_class = first_cell['class']
    except KeyError:
        return 'empty', None

    if not row_class:
        # class="" yields an empty list; there is nothing to dispatch on.
        return 'empty', None

    kind = row_class[0]
    if kind == 'match_type':
        return 'round', first_cell.text.strip(':')
    if kind == 'indv_col_1':
        return 'match', [cols[i].text.strip() for i in range(2)]
    return 'empty', None


def get_match_data_from_url(session, url):
    """
    Download one tournament page and list the matches found in its first table.

    The page is fetched through ``session``. Rows are walked in order:
    a 'round' row sets the round that applies to the 'match' rows after it,
    and every other row is ignored.

    Returns a list of ``[round, players, result]`` lists; ``round`` is None
    for matches that appear before any round row.
    Raises IndexError when the page contains no table at all.
    """
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    first_table = soup.find_all('table')[0]

    matches = []
    current_round = None

    for tr in first_table.find_all('tr'):
        kind, payload = extract_information_from_row(tr)

        if kind == 'round':
            current_round = payload
        elif kind == 'match':
            players, result = payload[0], payload[1]
            matches.append([current_round, players, result])

    return matches

        

def create_frame_of_matches_from_tournament_frame(
    session,
    load_temp=False,
    tournaments_path='tournaments_female.csv',
    temp_path='temp_matches.csv',
    skip_indices=None,
):
    """
    Download the matches of every tournament not yet marked as downloaded.

    Progress is checkpointed after each tournament: the accumulated matches
    are written to ``temp_path``, and the tournament's ``matches_downloaded``
    flag is set and saved back to ``tournaments_path``. An interrupted run can
    therefore be resumed with ``load_temp=True``.

    Parameters
    ----------
    session : requests.Session
        Session used to fetch the tournament pages.
    load_temp : bool, optional
        If True, start from the matches already stored in ``temp_path``
        instead of from an empty table.
    tournaments_path : str, optional
        CSV of tournaments; it is read with its first column as the index
        and rewritten as tournaments complete.
    temp_path : str, optional
        CSV used as the running checkpoint of downloaded matches.
    skip_indices : iterable, optional
        Tournament indices to leave untouched (known-bad pages).

    Returns
    -------
    pandas.DataFrame
        Columns ``tournament_index``, ``round``, ``players``, ``result``.
    """
    df_tournament = pd.read_csv(tournaments_path, index_col=0)
    dodgy = list(skip_indices) if skip_indices is not None else []

    columns = ['tournament_index', 'round', 'players', 'result']
    columns_minus_tournament = ['round', 'players', 'result']

    if load_temp:
        df_matches = pd.read_csv(temp_path, index_col=0)
    else:
        df_matches = pd.DataFrame([], columns=columns)

    # Walk the tournaments actually present in the file instead of a fixed
    # count, which broke whenever the CSV had a different number of rows.
    for index in tqdm(df_tournament.index):
        if df_tournament.loc[index, 'matches_downloaded']:
            continue

        if index in dodgy:
            continue

        url = df_tournament.loc[index, 'url']
        print(f'{index}, {url}')

        try:
            data = get_match_data_from_url(session, url)
        except IndexError:
            # Raised while parsing the page (e.g. it has no table); leave the
            # tournament unmarked so it can be retried later.
            print(index)
            dodgy.append(index)
            continue

        df_data = pd.DataFrame(data, columns=columns_minus_tournament)
        df_data['tournament_index'] = index
        df_data = df_data[columns]

        # DataFrame.append no longer exists in pandas >= 2.0. Empty frames are
        # left out of the concat so they cannot influence the result dtypes.
        non_empty = [frame for frame in (df_matches, df_data) if not frame.empty]
        if non_empty:
            df_matches = pd.concat(non_empty, ignore_index=True)
        df_matches.to_csv(temp_path)

        df_tournament.loc[index, 'matches_downloaded'] = True
        df_tournament.to_csv(tournaments_path)

    return df_matches

In [None]:
# Scrape the listing pages and write the tournament table to tournaments_female.csv.
df = create_frame_of_tournaments()

In [None]:
# Reload the saved tournament table (first CSV column becomes the index) and display it.
df = pd.read_csv('tournaments_female.csv', index_col = 0)
df

In [None]:
# Tournaments whose matches_downloaded flag is set.
df[df['matches_downloaded']]

In [None]:
login_url = base_url + '/login'
# Credentials come from the environment when set and are prompted for
# otherwise, so they never have to be typed into (and saved with) the notebook.
payload = {
    'l_email': os.environ.get('SQUASHINFO_EMAIL') or input('squashinfo e-mail: '),
    'l_password': os.environ.get('SQUASHINFO_PASSWORD') or getpass('squashinfo password: '),
}

with requests.Session() as s:
    p = s.post(login_url, data=payload, timeout=30)
    # Stop here on an HTTP error instead of scraping with a session that is not logged in.
    p.raise_for_status()
    # Resume from the checkpoint of a previous run when there is one;
    # on a first run the file does not exist yet and we start from scratch.
    df_matches = create_frame_of_matches_from_tournament_frame(
        s, load_temp=os.path.exists('temp_matches.csv')
    )

In [None]:
# Load the match checkpoint written during the download (first CSV column becomes the index).
matches_temp = pd.read_csv('temp_matches.csv', index_col = 0)

In [None]:
# Display the checkpointed matches.
matches_temp

In [None]:
# Save the checkpointed matches under their final file name.
matches_temp.to_csv('womens_matches_all.csv')

## Testing functions on individual URLs

In [None]:
login_url = base_url + '/login'
# Credentials come from the environment when set and are prompted for
# otherwise, so they never have to be typed into (and saved with) the notebook.
payload = {
    'l_email': os.environ.get('SQUASHINFO_EMAIL') or input('squashinfo e-mail: '),
    'l_password': os.environ.get('SQUASHINFO_PASSWORD') or getpass('squashinfo password: '),
}

with requests.Session() as s:
    p = s.post(login_url, data=payload, timeout=30)
    # Stop here on an HTTP error instead of inspecting a page fetched without a login.
    p.raise_for_status()
#     print(p.text)

    # Page under inspection: the tournament stored at index 98 of `df`.
    url = df.loc[98, 'url']
    print(url)
#     url = 'http://www.squashinfo.com/events/7520-mens-british-national-championship-2018'
    html = s.get(url, timeout=30).text
    tables = BeautifulSoup(html, 'html.parser').find_all('table')
#     rows = tables[0].find_all('tr')

In [None]:
# Dump the raw HTML of the fetched page for manual inspection.
print(html)

In [None]:
# Rows of the first table in `tables`; empty when the page has no table,
# so this cell runs on a fresh kernel instead of failing on an undefined `rows`.
rows = tables[0].find_all('tr') if tables else []

# Show how each row is classified by the parser.
for row in rows:
    row_type, text = extract_information_from_row(row)
    print(f'{row_type}, {text}')

## Troublesome links
- Tournament index 2412: http://www.squashinfo.com/events/1307-mens-international-tirolean-championship-1996