# Scraping for Go Columbia Go Website

In [7]:
import json
import os
from pprint import pprint
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv
from pymongo import MongoClient

In [8]:
# Load environment variables from the project's .env file.
# The path argument must be the location of the .env file; the previous code
# passed the *value* of MDB_PASSWORD, which only worked while that variable
# happened to be unset.
load_dotenv(find_dotenv())

# Connect to MongoDB
MDB_USERNAME = os.getenv('MDB_USERNAME')
MDB_PASSWORD = os.getenv('MDB_PASSWORD')

# Fail early with a clear message instead of building a "None:None" URI that
# only errors out later, at the first database operation.
if not MDB_USERNAME or not MDB_PASSWORD:
    raise RuntimeError(
        'MDB_USERNAME and MDB_PASSWORD must be set in the environment or in a .env file'
    )

# Credentials are percent-escaped so that characters such as '@', ':', '/'
# or '%' in the password cannot corrupt the connection string.
MDB_URI = (
    f'mongodb+srv://{quote_plus(MDB_USERNAME)}:{quote_plus(MDB_PASSWORD)}'
    '@goco-scraping.bwqwr.mongodb.net/goco?retryWrites=true&w=majority'
)
client = MongoClient(MDB_URI)

golf_db = client["golf"]
golf_roster = golf_db["roster"]

# NOTE(review): connectivity smoke test. This inserts a placeholder document
# into the roster collection every time the cell runs; remove it (or point it
# at a scratch collection) before storing real roster data.
mydict = { "name": "John", "address": "Highway 37" }
x = golf_roster.insert_one(mydict)


# Roster Scraping

In [94]:
def _parse_athlete_id(url):
    """Return the integer between the first and second '=' of ``url``, or None."""
    if not url or '=' not in url:
        return None
    try:
        return int(url.split('=')[1])
    except ValueError:
        return None


def _scrape_profile_fields(profile_url, timeout):
    """Return the lower-cased dt/dd pairs listed on an athlete's profile page.

    Returns an empty dict when the page cannot be fetched successfully or does
    not contain the player-fields list, so one bad profile does not abort the
    whole roster scrape.
    """
    response = requests.get(profile_url, timeout=timeout)
    if not response.ok:
        return {}

    profile_soup = BeautifulSoup(response.content, 'html.parser')
    container = profile_soup.find("div", class_="sidearm-roster-player-fields")
    if container is None:
        return {}

    fields = {}
    for item in container.find_all("li"):
        label = item.find("dt")
        value = item.find("dd")
        if label is None or value is None:
            # Skip list items that are not a label/value pair.
            continue
        fields[label.text.lower()] = value.text.lower()
    return fields


def get_roster_data(sport, timeout=10):
    """Scrape the roster of ``sport`` from gocolumbialions.com.

    The roster page's JSON-LD ``<script>`` is parsed for the athlete list; for
    every athlete whose URL yields a numeric id, the individual profile page is
    fetched as well and its dt/dd fields are merged into the record.

    Args:
        sport: URL slug of the sport, e.g. ``"mens-golf"``.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list of dicts, one per athlete, with the keys ``name``, ``gender``
        and ``id`` (``None`` when no id could be parsed), plus any profile
        fields that were found and ``image_url`` when an image is listed.

    Raises:
        requests.HTTPError: If the roster page returns an error status.
        ValueError: If the roster page has no JSON-LD script to parse.
    """
    roster_url = f'https://gocolumbialions.com/sports/{sport}/roster'
    page = requests.get(roster_url, timeout=timeout)
    page.raise_for_status()

    roster_soup = BeautifulSoup(page.content, 'html.parser')
    ld_json = roster_soup.find('script', type='application/ld+json')
    if ld_json is None or not ld_json.contents:
        raise ValueError(f'No JSON-LD roster data found at {roster_url}')
    athletes = json.loads(ld_json.contents[0])

    athletes_info = []
    for athlete in athletes.get('item') or []:
        # Parse the id defensively: an empty or unexpected URL must not abort
        # the scrape of the remaining athletes.
        athlete_id = _parse_athlete_id(athlete.get('url'))
        athlete_info = {
            "name": athlete['name'],
            "gender": athlete.get('gender'),
            "id": athlete_id,
        }

        if athlete_id is not None:
            # Profile pages live at e.g. sports/mens-golf/roster/daniel-core/13879
            slug = athlete_info["name"].lower().replace(' ', '-')
            profile_url = f'{roster_url}/{slug}/{athlete_id}'
            athlete_info.update(_scrape_profile_fields(profile_url, timeout))

        image = athlete.get('image')
        if isinstance(image, dict) and 'url' in image:
            athlete_info['image_url'] = image['url']

        athletes_info.append(athlete_info)
    return athletes_info

In [95]:
# Run the scraper for the men's golf roster and print the raw list of records.
print(get_roster_data(sport="mens-golf"))

[{'name': 'Daniel Core', 'gender': 'M', 'id': 13879, 'class': 'junior', 'hometown': 'sorrento, fla.', 'high school': 'circle christian h.s.', 'school': 'enrolled at columbia college', 'major': 'economics', 'image_url': 'https://gocolumbialions.com/images/2019/9/20/MGOLF_CoreDaniel.JPG'}, {'name': 'DJ Francey', 'gender': 'M', 'id': 13888, 'height': '6-1', 'class': 'first year', 'hometown': 'weston, fla.', 'high school': 'st. thomas aquinas h.s.', 'school': 'enrolled at columbia college', 'major': 'enrolled at columbia college'}, {'name': 'Derek Kim', 'gender': 'M', 'id': 13882, 'class': 'junior', 'hometown': 'irvine, calif.', 'high school': 'northwood h.s.', 'school': 'enrolled at columbia college', 'major': 'film studies', 'image_url': 'https://gocolumbialions.com/images/2019/9/20/MGOLF_KimDerek.JPG'}, {'name': 'Arjun Puri', 'gender': 'M', 'id': 13885, 'class': 'senior', 'hometown': 'new delhi, india', 'high school': 'heritage academy, s.c.', 'school': 'enrolled at columbia college', '

# Coaches Scraping

# Schedule Scraping

# Statistics Scraping