# Scraping for Go Columbia Go Website

In [1]:
from bs4 import BeautifulSoup
import os
import json
import requests

from pymongo import MongoClient
from dotenv import load_dotenv, find_dotenv
from pprint import pprint

In [2]:
# Load environment variables from the nearest .env file.
# load_dotenv's first positional argument is the *path* of the .env file, so it
# must be given find_dotenv()'s result rather than the value of a secret.
load_dotenv(find_dotenv())

# Connect to MongoDB
from urllib.parse import quote_plus

MDB_USERNAME = os.getenv('MDB_USERNAME')
MDB_PASSWORD = os.getenv('MDB_PASSWORD')

# Fail early with a readable message instead of building a URI around "None".
if not MDB_USERNAME or not MDB_PASSWORD:
    raise RuntimeError(
        'MDB_USERNAME and MDB_PASSWORD must be set in the environment or in a .env file'
    )

# Credentials are percent-escaped so that characters such as '@', ':' or '/'
# in the username/password cannot corrupt the connection string.
MDB_URI = (
    f'mongodb+srv://{quote_plus(MDB_USERNAME)}:{quote_plus(MDB_PASSWORD)}'
    '@goco-scraping.bwqwr.mongodb.net/goco?retryWrites=true&w=majority'
)
client = MongoClient(MDB_URI)

golf_db = client["golf"]
golf_roster = golf_db["roster"]

# NOTE(review): this inserts a placeholder document into golf.roster every time
# the cell runs; it looks like a connectivity smoke test -- confirm whether it
# should stay.
mydict = { "name": "John", "address": "Highway 37" }
x = golf_roster.insert_one(mydict)


# Roster Scraping

In [88]:
def get_avaliable_years(soup, num_years=3):
    """Return the leading year labels from the past-rosters dropdown.

    The text of the element with id "ddl_past_rosters" is stripped of
    double-space runs and carriage returns, split into lines, and the first
    ``num_years`` non-empty lines are kept. For each kept line, the part
    before the first space is returned.
    """
    dropdown_text = soup.find(id="ddl_past_rosters").text
    cleaned_text = dropdown_text.replace("  ", "").replace("\r", "")

    labels = []
    for line in cleaned_text.split('\n'):
        if line:
            labels.append(line)

    return [label.split(" ")[0] for label in labels[:num_years]]

def get_athlete_data(url):
    """Download ``url`` and return its embedded JSON-LD payload, decoded.

    The payload is taken from the first child of the first
    ``<script type="application/ld+json">`` element found on the page.
    """
    response = requests.get(url)
    document = BeautifulSoup(response.content, 'html.parser')
    ld_json_tag = document.find('script', type='application/ld+json')
    raw_json = ld_json_tag.contents[0]
    return json.loads(raw_json)

def get_profile_soup(sport, name, id):
    """Fetch an athlete's profile page and return it as a BeautifulSoup object.

    The profile URL is built from the sport, the athlete's name (lower-cased,
    spaces turned into hyphens) and the athlete id.
    """
    slug = name.lower().replace(' ', '-')
    response = requests.get(f'https://gocolumbialions.com/sports/{sport}/roster/{slug}/{id}')
    return BeautifulSoup(response.content, 'html.parser')
    
def get_athlete_player_fields(soup):
    """Return every ``li`` inside the "sidearm-roster-player-fields" div."""
    fields_section = soup.find("div", class_="sidearm-roster-player-fields")
    field_items = fields_section.find_all("li")
    return field_items

def get_athlete_active_years(current_year, soup):
    """Return the seasons listed on an athlete's profile page.

    The text of every ``span`` with class "sidearm-roster-player-first-name"
    is collected; when no such span exists, ``[current_year]`` is returned.
    """
    season_spans = soup.find_all("span", class_="sidearm-roster-player-first-name")
    seasons = []
    for span in season_spans:
        seasons.append(span.text)
    return seasons or [current_year]
    

In [101]:
def get_roster_data(sport, specified_year=None):
    """Scrape roster records for ``sport`` from gocolumbialions.com.

    Args:
        sport: Sport slug as it appears in the site's URLs,
            e.g. "mens-basketball".
        specified_year: Season to scrape; it is appended to the roster URL as
            the last path segment. When falsy, the seasons returned by
            ``get_avaliable_years`` for the roster page are scraped instead.

    Returns:
        A list of dicts, one per distinct athlete name across the scraped
        seasons. Each dict has "name", "gender" and "id" (``None`` when the
        listing carries no profile URL). When a profile URL is present it also
        has "active_years" plus one lower-cased key/value pair per profile
        field; "image_url" is added when the listing includes an image.
    """
    URL = 'https://gocolumbialions.com/sports/{}/roster'.format(sport)
    page = requests.get(URL)

    # get page content in soup object
    soup = BeautifulSoup(page.content, 'html.parser')
    years = get_avaliable_years(soup) if not specified_year else [specified_year]
    athletes_info = []
    seen_names = set()

    for year in years:
        # get url of each year/season
        year_data_url = '{}/{}'.format(URL, year)
        athletes = get_athlete_data(year_data_url)

        for athlete in athletes['item']:
            # make sure we aren't repeating players
            name = athlete['name']
            if name in seen_names:
                continue

            athlete_info = {
                "name": name,
                "gender": athlete['gender'],
                "id": None,
            }

            # The id can only be parsed once we know a profile URL exists, so
            # the parsing lives inside the guard rather than before it.
            profile_url = athlete.get('url')
            if profile_url:
                athlete_info["id"] = int(profile_url.split("=")[1])

                # add in player fields from their profile page
                profile_soup = get_profile_soup(sport, name, athlete_info['id'])
                athlete_info["active_years"] = get_athlete_active_years(year, profile_soup)
                for field in get_athlete_player_fields(profile_soup):
                    label = field.find("dt")
                    value = field.find("dd")
                    # skip list items that are not a label/value pair
                    if label is None or value is None:
                        continue
                    athlete_info[label.text.lower()] = value.text.lower()

            # add image if available
            image = athlete.get('image')
            if image:
                athlete_info['image_url'] = image['url']

            athletes_info.append(athlete_info)
            seen_names.add(name)

    return athletes_info

In [102]:
# Demo: scrape the men's basketball roster and print the resulting records.
mens_basketball_roster = get_roster_data("mens-basketball")
print(mens_basketball_roster)

[{'name': 'Luke Bolster', 'gender': 'M', 'id': 13486, 'active_years': ['2017-18', '2018-19', '2019-20', '2020-21'], 'position': 'guard', 'height': '6-0', 'class': 'senior', 'weight': '175', 'hometown': 'new york, n.y.', 'high school': 'trinity school', 'school': 'columbia college', 'major': 'american studies'}, {'name': 'Ben Milstein', 'gender': 'M', 'id': 13490, 'active_years': ['2018-19', '2019-20', '2020-21'], 'position': 'guard', 'height': '5-10', 'class': 'junior', 'weight': '168', 'hometown': 'boca raton, fla.', 'high school': 'saint andrews school', 'school': 'columbia college', 'major': 'political science'}, {'name': 'Zavian McLean', 'gender': 'M', 'id': 14079, 'active_years': ['2020-21'], 'height': '6-4', 'class': 'first year', 'weight': '193', 'hometown': 'spring lake, n.c.', 'high school': 'village christian academy', 'school': 'columbia college'}, {'name': 'Liam Murphy', 'gender': 'M', 'id': 14080, 'active_years': ['2020-21'], 'height': '6-7', 'class': 'first year', 'weight

# Coaches Scraping

# Schedule Scraping

# Statistics Scraping