# Scraping for Go Columbia Go Website

In [3]:
from bs4 import BeautifulSoup
import os
import json
import requests

from pymongo import MongoClient
from dotenv import load_dotenv, find_dotenv
from pprint import pprint

In [None]:
# Load environment variables
load_dotenv(os.getenv('MDB_PASSWORD'))

# Connect to MongoDB
MDB_USERNAME = os.getenv('MDB_USERNAME')
MDB_PASSWORD = os.getenv('MDB_PASSWORD')

MDB_URI = f'mongodb+srv://{MDB_USERNAME}:{MDB_PASSWORD}@goco-scraping.bwqwr.mongodb.net/goco?retryWrites=true&w=majority'
client = MongoClient(MDB_URI)

golf_db = client["golf"]
golf_roster = golf_db["roster"]

mydict = { "name": "John", "address": "Highway 37" }
x = golf_roster.insert_one(mydict)

# Roster Scraping

In [26]:
def get_avaliable_years(soup, num_years=3):
    options = soup.find(id="ddl_past_rosters").text.replace("  ", "").replace("\r", "").split('\n')
    options = [option for option in options if option][:num_years]
    return [year.split(" ")[0] for year in options]

def get_athlete_data(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    res = soup.find('script', type='application/ld+json')
    return json.loads(res.contents[0])

def get_profile_soup(sport, name, id):
    profile_url = 'https://gocolumbialions.com/sports/{}/roster/{}/{}'.format(sport, name.lower().replace(' ', '-'), id)
    athlete_page = requests.get(profile_url)
    return BeautifulSoup(athlete_page.content, 'html.parser')
    
def get_athlete_player_fields(soup):
    # find player field section and get field data
    player_fields = soup.find("div", class_="sidearm-roster-player-fields")
    return player_fields.find_all("li")

def get_athlete_active_years(current_year, soup):
    player_active_years = soup.find_all("span", class_="sidearm-roster-player-first-name")
    years = [year.text for year in player_active_years]
    return years if len(years) > 0 else [current_year]
    

In [27]:
def get_roster_data(sport, specified_year=None):
    URL = 'https://gocolumbialions.com/sports/{}/roster'.format(sport)
    page = requests.get(URL)
    
    # get page content in soup object
    soup = BeautifulSoup(page.content, 'html.parser')
    years = get_avaliable_years(soup) if not specified_year else [specified_year]
    athletes_info = []
    seen_names = {}
    
    for year in years:
        # get url of each year/season
        year_data_url = '{}/{}'.format(URL, year)
        athletes = get_athlete_data(year_data_url)

        for athlete in athletes['item']:
            # make sure we aren't repeating players
            if athlete['name'] not in seen_names:
                athlete_info = {
                    "name": athlete['name'],
                    "gender": athlete['gender'],
                    "id": int(athlete['url'].split("=")[1])
                }

                # add in player fields from their profile page
                if athlete['url']:
                    profile_soup = get_profile_soup(sport, athlete_info['name'], athlete_info['id'])
                    athlete_info["active_years"] = get_athlete_active_years(year, profile_soup)
                    player_fields = get_athlete_player_fields(profile_soup)
                    for field in player_fields:
                        athlete_info[field.find("dt").text.lower()] = field.find("dd").text.lower()

                # add image if avaliable
                if athlete['image']:
                    athlete_info['image_url'] = athlete['image']['url']
                    
                athletes_info.append(athlete_info)
                seen_names[athlete_info['name']] = 1
                
    return athletes_info

In [102]:
print(get_roster_data("mens-basketball"))

[{'name': 'Luke Bolster', 'gender': 'M', 'id': 13486, 'active_years': ['2017-18', '2018-19', '2019-20', '2020-21'], 'position': 'guard', 'height': '6-0', 'class': 'senior', 'weight': '175', 'hometown': 'new york, n.y.', 'high school': 'trinity school', 'school': 'columbia college', 'major': 'american studies'}, {'name': 'Ben Milstein', 'gender': 'M', 'id': 13490, 'active_years': ['2018-19', '2019-20', '2020-21'], 'position': 'guard', 'height': '5-10', 'class': 'junior', 'weight': '168', 'hometown': 'boca raton, fla.', 'high school': 'saint andrews school', 'school': 'columbia college', 'major': 'political science'}, {'name': 'Zavian McLean', 'gender': 'M', 'id': 14079, 'active_years': ['2020-21'], 'height': '6-4', 'class': 'first year', 'weight': '193', 'hometown': 'spring lake, n.c.', 'high school': 'village christian academy', 'school': 'columbia college'}, {'name': 'Liam Murphy', 'gender': 'M', 'id': 14080, 'active_years': ['2020-21'], 'height': '6-7', 'class': 'first year', 'weight

# Coaches Scraping

# Schedule Scraping

In [4]:
from bs4 import BeautifulSoup
import requests

def get_years(soup, num_years=3):
    #starts at 2022 -- is that ok?
    options = soup.find(id="sidearm-schedule-select-season").text.replace("  ", "").replace("\r", "").split('\n')
    options = [option for option in options if option][:num_years]
    return [year.split(" ")[0] for year in options]

def clean_text(s):
    if s==None: return(None)
    s = s.replace('\n', "").replace('\r', "").strip()
    words = s.split(" ")
    result = ""
    notFirst = False
    # remove everything after and including 'on'
    for word in words:
        if word!="on":
            if notFirst:
                result+=" "
            result+=word
            notFirst = True
        else:
            break
    return(result)

def get_games(url, year):
    # info included: Year, Opponent, Date, Home/Away, Location, Result
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    games = soup.find_all('li', class_="sidearm-schedule-game")
    games_in_year = []
    for game in games:
        game_info = {"Year": year}
        # opponent
        opp = game.find('div', class_="sidearm-schedule-game-opponent-name")
        if opp!=None:
            a = opp.find("a")
            if a!=None:
                game_info["Opponent/Event"] = clean_text(a["aria-label"])
            else:
                game_info["Opponent/Event"] = clean_text(opp.text)
        else:
            game_info["Opponent/Event"] = ""
        # date
        date = game.find('div', class_="sidearm-schedule-game-opponent-date")
        game_info["Date"] = date.find("span").text if date!=None else ""
        # home or away
        vs_at = game.find('span', class_="sidearm-schedule-game-conference-vs")
        if vs_at!=None:
            found = False
            a = vs_at.find('span', class_="sidearm-schedule-game-away")
            if a!=None:
                game_info["Home/Away"] = "Away"
                found = True
            a = vs_at.find('span', class_="sidearm-schedule-game-home")
            if a!=None:
                game_info["Home/Away"] = "Home"
                found = True
            if not found:
                game_info["Home/Away"] = ""
        else:
            game_info["Home/Away"] = ""
        # location
        location = game.find('div', class_="sidearm-schedule-game-location")
        game_info["Location"] = location.find("span").text if location!=None else ""
        # result
        result = game.find('div', class_="sidearm-schedule-game-result")
        if result!=None:   
            result_line = ""
            for line in result.find_all('span'):
                if(line.text!=None):
                    result_line += line.text
            game_info["Result"] = result_line
        else:
            game_info["Result"] = ""
        games_in_year.append(game_info)
    return games_in_year

In [5]:
def get_schedule_data(sport, specified_year=None):
    URL = 'https://gocolumbialions.com/sports/{}/schedule/'.format(sport)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    years = get_years(soup) if not specified_year else [specified_year]
    all_games = []
    for year in years:
        games = get_games('https://gocolumbialions.com/sports/{}/schedule/{}'.format(sport, year), year)
        all_games+=games
    return all_games

In [6]:
print(get_schedule_data("mens-rowing"))

[{'Year': '2020', 'Opponent/Event': '55th Head of the Charles', 'Date': 'Oct 19 (Sat)', 'Home/Away': '', 'Location': 'Cambridge, Mass.', 'Result': ''}, {'Year': '2020', 'Opponent/Event': '55th Head of the Charles', 'Date': 'Oct 20 (Sun)', 'Home/Away': '', 'Location': 'Cambridge, Mass.', 'Result': ''}, {'Year': '2020', 'Opponent/Event': 'Princeton Chase', 'Date': 'Oct 27 (Sun)', 'Home/Away': 'Away', 'Location': 'Princeton, N.J.', 'Result': "V8+: 'B': 24th; 'C': 34th; 'A': 35th"}, {'Year': '2020', 'Opponent/Event': 'Dartmouth', 'Date': 'Apr 4 (Sat)', 'Home/Away': 'Home', 'Location': 'Leonia, N.J.', 'Result': 'Canceled'}, {'Year': '2020', 'Opponent/Event': 'Holy Cross', 'Date': 'Apr 4 (Sat)', 'Home/Away': 'Home', 'Location': 'Leonia, N.J.', 'Result': 'Canceled'}, {'Year': '2020', 'Opponent/Event': 'MIT', 'Date': 'Apr 4 (Sat)', 'Home/Away': 'Home', 'Location': 'Leonia, N.J.', 'Result': 'Canceled'}, {'Year': '2020', 'Opponent/Event': 'Penn', 'Date': 'Apr 11 (Sat)', 'Home/Away': 'Away', 'Loc

# Statistics Scraping

In [86]:
from bs4 import BeautifulSoup

def get_table_headers(table):
    """Given a table soup, returns all the headers"""
    headers = []
    for th in table.find("tr").find_all("th"):
        headers.append(th.text.strip().split("\n")[0])
    return headers


def get_stat_data(sport, specified_year=None):
    URL = 'https://gocolumbialions.com/sports/{}/stats'.format(sport)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.findAll("table")
    for table in tables:
        print("\n\n\n\n NEW TABLE \n\n\n\n")
        print(get_table_headers(table))
        output_rows = []
        for table_row in table.findAll('tr'):
            sub_headers=[]
            for th in table_row.find_all("th"):
                sub_headers.append(th.text.strip())
            #print(sub_headers)

            columns = table_row.findAll('td')
            if len(columns)!=0:
                output_row = []
                for column in columns:
                    output_row.append(column.text)
                output_rows.append(output_row)
                #print(output_row)

                output_row = sub_headers + output_row
                print(output_row)

In [87]:
get_stat_data('mens-soccer')





 NEW TABLE 




['Statistic', 'Columbia', 'Opponents']
['Goals\nG', '23', '32']
['Goals Per Game\nG/AVG', '1.35', '1.88']
['Shots\nSH', '184', '222']
['Shots Per Game\nSH/AVG', '10.8', '13.1']
['Shots Percentage\nSH%', '0.125', '0.144']
['Shots On Goal\nSOG', '84-184', '94-222']
['Shots On Goal Percentage\nSOG%', '0.457', '0.423']
['Yellow Cards\nYC', '17', '17']
['Red Cards\nRC', '2', '0']
['Assists\nA', '21', '25']
['Saves\nS', '57', '62']
['Fouls\nF', '192', '185']
['Corner Kicks\nCK', '61', '67']
['Penalty Kicks: Goals-Attempts\nPG-PA', '0-1', '0-3']




 NEW TABLE 




['#', 'Player', 'GP', 'GS', 'G', 'A', 'PTS', 'SH', 'SH%', 'SOG', 'SOG%', 'YC-RC', 'GW', 'PG-PA', 'Bio Link']
['Denis, John\n10Denis, John', '10', '16', '14', '7', '4', '18', '41', '0.171', '18', '0.439', '1-0', '2', '0-1', 'View Bio']
['Zeitz, Uri\n7Zeitz, Uri', '7', '17', '11', '5', '4', '14', '39', '0.128', '22', '0.564', '1-0', '1', '0-0', 'View Bio']
['Gunbeyi, Sebastian\n15Gunbeyi, Sebastian', '15', '17', '

In [88]:
def get_team_stats(sport, specified_year=None):
    URL = 'https://gocolumbialions.com/sports/{}/stats'.format(sport)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    tables = soup.findAll("table")
    table_stats = {}
    for table in tables:
        if table.find("caption", text = "Overall Team Statistics")!=None:
            headers = get_table_headers(table)
            current_header = ""
            for table_row in table.findAll('tr'):
                columns = table_row.findAll('td')
                sub_header = table_row.find("th").text.strip().split("\n")[0]
                if len(columns) == 0:
                    table_stats[sub_header] = {}
                    current_header = sub_header
                else:
                    table_stats[current_header][sub_header] = {}
                    for i in range(len(columns)):
                        table_stats[current_header][sub_header][headers[i+1]] = columns[i].text
    return(table_stats)

In [89]:
print(get_team_stats('mens-basketball'))

{'Statistic': {}, 'Scoring': {'Total Points': {'Columbia': '1988', 'Opponents': '2181'}, 'Points Per Game': {'Columbia': '66.3', 'Opponents': '72.7'}, 'Scoring Margin': {'Columbia': '-6.4', 'Opponents': '--'}}, 'Shooting': {'FG: Made-Attempted': {'Columbia': '744-1819', 'Opponents': '820-1818'}, 'FG: Percentage': {'Columbia': '.409', 'Opponents': '.451'}, 'FG: Per Game': {'Columbia': '24.8', 'Opponents': '27.3'}, '3PT: Made-Attempted': {'Columbia': '202-671', 'Opponents': '236-712'}, '3PT: Percentage': {'Columbia': '.301', 'Opponents': '.331'}, '3PT: Per Game': {'Columbia': '6.7', 'Opponents': '7.9'}, 'FT: Made-Attempted': {'Columbia': '298-408', 'Opponents': '305-452'}, 'FT: Percentage': {'Columbia': '.730', 'Opponents': '.675'}, 'FT: Per Game': {'Columbia': '9.9', 'Opponents': '10.2'}}, 'Rebounding': {'Total': {'Columbia': '1031', 'Opponents': '1158'}, 'Per Game': {'Columbia': '34.4', 'Opponents': '38.6'}, 'Margin': {'Columbia': '-4.2', 'Opponents': '--'}}, 'Assists': {'Total': {'Col