# Extraction and Transformation

### This notebook scrapes data from Stathead Baseball. The main metric we are interested in is Wins Above Replacement (WAR) for each player, along with other identifiers such as birth location, years active, and position.

In [None]:
#import dependencies
import pandas as pd
import matplotlib.pyplot as plt
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from pprint import pprint

In [None]:
# Web scraping with splinter: launch a visible Chrome window (headless=False)
# so the Stathead login can be completed by hand in that window before the
# per-country player scrapes run.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# Batting season-finder page; the next cell reads its `sr_preset pob` <select>.
# The URL is assembled from adjacent string literals so it contains no embedded
# newlines, indentation, or stray quote characters.
url = (
    "https://stathead.com/baseball/player-batting-season-finder.cgi"
    "?request=1&match=player_season_combined&order_by_asc=0"
    "&order_by=b_war&year_min=1900&comp_type=reg&exactness=anymarked"
    "&games_min_max=min&minpasVal=502&mingamesVal=100"
    "&season_start=1&season_end=-1&weight_min=0&weight_max=500"
    "&location=pob&locationMatch=is&date_type=dob&date_comp=%3D"
    "&month_val=0&day_val=0&year_val=0"
    "&num_franchises_comp=gt&all_stars_comp=gt"
)
browser.visit(url)

In [None]:
# Web scrape #1: collect the country codes offered by the `sr_preset pob`
# drop-down; they are used as the `pob` URL parameter of the later scrapes.
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
presets = soup.find('select', class_='sr_preset pob')
if presets is None:
    raise RuntimeError(
        "Could not find the 'sr_preset pob' <select> on the page; "
        "make sure the season-finder page has finished loading."
    )

# The first <option> is skipped (as before); every following sibling is read.
# Walking the actual siblings replaces the fixed 56-iteration loop, so the
# scrape neither crashes when there are fewer options nor stalls on a
# duplicate value.
options = presets.find('option')
country_codes = []
if options is not None:
    for option in options.find_next_siblings("option"):
        code = option.get("value")
        # keep first-seen order and drop repeats
        if code is not None and code not in country_codes:
            country_codes.append(code)

# list of country codes
print(country_codes)

In [None]:
# Hitter records keyed by player_id. Each value is the list
# [player_id, <texts of the wanted cells...>, country_code, "Hitter"].
main_dict = {}

# `data-stat` attribute values of the table cells we are interested in
columns_we_want = ['ranker', 'name_display', 'b_war', 'year_min', 'year_max', 'age_range', 'birth_location']
# running player_id; starts at -1 so the first scraped player gets id 0
count = -1

# `class` values of the leading <th> that mark a data row; rows whose <th> has
# any other class (such as header rows) are skipped
data_row_classes = [["left"], ["right"], ["center"]]

# Scrape one results page per country code.
# When web scraping for player data, make sure to log in with your subscribed
# account on the browser window opened with splinter, or else you are limited
# to 20 entries for each country code and the first 10 won't have any data.
for country_code in country_codes:
    unique_url = f"https://stathead.com/baseball/player-batting-season-finder.cgi?request=1&match=player_season_combined&order_by_asc=0&order_by=b_war&comp_type=reg&exactness=anymarked&games_min_max=min&minpasVal=502&mingamesVal=100&season_start=1&season_end=-1&weight_min=0&weight_max=500&location=pob&locationMatch=is&pob={country_code}&date_type=dob&date_comp=%3D&month_val=0&day_val=0&year_val=0&num_franchises_comp=gt&all_stars_comp=gt"
    browser.visit(unique_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:  # no results table on this page: nothing to scrape
        continue

    # `trs` is a cursor over the <tr> rows; the first row is always processed
    trs = tbody.find('tr')
    while trs is not None:
        count += 1
        player_list = [count]  # record starts with the assigned player_id
        # only direct tag children are inspected; cells without a
        # `data-stat` attribute are ignored instead of raising
        for cell in trs.find_all(True, recursive=False):
            if cell.get("data-stat") in columns_we_want:
                for piece in cell:
                    entry = piece.text
                    # NOTE(review): this also drops a cell whose text equals a
                    # value already collected (e.g. identical year_min and
                    # year_max), leaving the record short - confirm intended.
                    if entry not in player_list:
                        player_list.append(entry)
        player_list.append(country_code)  # adds country to end of the record
        player_list.append("Hitter")  # position
        main_dict[player_list[0]] = player_list  # keyed by player_id

        # advance to the next data row, skipping every non-data row in between
        trs = trs.find_next_sibling('tr')
        while trs is not None:
            row_header = trs.find('th')
            if row_header is not None and row_header.get("class") in data_row_classes:
                break
            trs = trs.find_next_sibling('tr')

In [None]:
# Pitcher records keyed by player_id. Each value is the list
# [player_id, <texts of the wanted cells...>, country_code, "Pitcher"].
pitcher_dict = {}
# `data-stat` attribute values of the cells we keep; pitchers use p_war instead of b_war
columns_we_want = ['ranker', 'name_display', 'p_war', 'year_min', 'year_max', 'age_range', 'birth_location']

# `class` values of the leading <th> that mark a data row; rows whose <th> has
# any other class (such as header rows) are skipped
data_row_classes = [["left"], ["right"], ["center"]]

# Scrape one results page per country code. `count` is not reset here, so the
# ids assigned below continue after the ones handed out before this cell.
# When web scraping for player data, make sure to log in with your subscribed
# account on the browser window opened with splinter, or else you are limited
# to 20 entries for each country code and the first 10 won't have any data.
for country_code in country_codes:
    pitcher_url = f"https://stathead.com/baseball/player-pitching-season-finder.cgi?request=1&match=player_season_combined&order_by_asc=0&order_by=p_war&comp_type=reg&games_started=60&games_relieved=80&qualifiers=nomin&minIpVal=162&minDecVal=14&mingamesVal=40&p_g=x&season_start=1&season_end=-1&weight_min=0&weight_max=500&location=pob&locationMatch=is&pob={country_code}&date_type=dob&date_comp=%3D&month_val=0&day_val=0&year_val=0&num_franchises_comp=gt&all_stars_comp=gt"
    browser.visit(pitcher_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:  # some countries contain zero records
        continue

    # `trs` is a cursor over the <tr> rows; the first row is always processed
    trs = tbody.find('tr')
    while trs is not None:
        count += 1
        player_list = [count]  # record starts with the assigned player_id
        # only direct tag children are inspected; cells without a
        # `data-stat` attribute are ignored instead of raising
        for cell in trs.find_all(True, recursive=False):
            if cell.get("data-stat") in columns_we_want:
                for piece in cell:
                    entry = piece.text
                    # NOTE(review): this also drops a cell whose text equals a
                    # value already collected (e.g. identical year_min and
                    # year_max), leaving the record short - confirm intended.
                    if entry not in player_list:
                        player_list.append(entry)
        player_list.append(country_code)  # adds country to end of the record
        player_list.append("Pitcher")  # position
        pitcher_dict[player_list[0]] = player_list  # keyed by player_id

        # advance to the next data row, skipping every non-data row in between
        trs = trs.find_next_sibling('tr')
        while trs is not None:
            row_header = trs.find('th')
            if row_header is not None and row_header.get("class") in data_row_classes:
                break
            trs = trs.find_next_sibling('tr')

In [None]:
# Build the combined hitter + pitcher table in one constructor call instead of
# growing the frame one row at a time.
# The column order mirrors the layout of each scraped record list.
baseball_columns = ["player_id", "rank_in_country", "player_name", "war", "year_min", "year_max", "age_range", "birth_location", "country", "position"]

# Only records with exactly one value per column are kept. The original note
# says some players don't have a birth location recorded, so shorter records
# are treated as invalid entries and dropped.
complete_rows = [
    record
    for source in (main_dict, pitcher_dict)  # hitters first, then pitchers
    for record in source.values()
    if len(record) == len(baseball_columns)
]

baseball_df = pd.DataFrame(complete_rows, columns=baseball_columns)

In [None]:
# Promote player_id from a regular column to the row index, then echo the
# frame as the cell output.
baseball_df = baseball_df.set_index(keys="player_id")
baseball_df

In [None]:
# Write the final table to disk; with pandas defaults the index is written as
# the first CSV column.
baseball_df.to_csv(path_or_buf="wardata.csv")