In [1]:
#allow output from every line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# standard library
import time
from io import StringIO
from urllib.request import urlopen

# third-party
import matplotlib.pyplot as plt
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from bs4 import BeautifulSoup
from pybaseball import lahman

In [2]:
def _scrape_team_table(season, category):
    """Download one league-wide team table from Baseball-Reference.

    Fetches ``.../leagues/MLB/<season>-standard-<category>.shtml``, locates the
    wrapper div whose id is ``all_teams_standard_<category>``, parses the first
    table inside it, drops the last three rows and adds a ``Season`` column.

    Parameters
    ----------
    season : int
        Season year; it is interpolated into the URL with ``str()``.
    category : str
        URL/div suffix: ``"pitching"``, ``"batting"`` or ``"fielding"``.

    Returns
    -------
    pandas.DataFrame
        The parsed table minus its last three rows, with ``Season`` set to
        ``season`` on every row.

    Raises
    ------
    ValueError
        If the wrapper div is missing from the page, or it contains no table.
    """
    url = "https://www.baseball-reference.com/leagues/MLB/" + str(season) + "-standard-" + category + ".shtml"
    print(url)

    # Use a context manager so the HTTP response is always closed, even when
    # parsing fails, instead of leaking the connection.
    with urlopen(url) as response:
        soup = BeautifulSoup(response.read(), 'lxml')

    div_id = "all_teams_standard_" + category
    wrapper = soup.find('div', {"id": div_id})
    if wrapper is None:
        # Without this guard the literal text "None" would be handed to
        # read_html, producing a confusing "No tables found" style error.
        raise ValueError("div with id '" + div_id + "' not found at " + url)

    # Recent pandas versions deprecate passing literal HTML to read_html;
    # wrapping the markup in StringIO works on old and new versions alike.
    tables = pd.read_html(StringIO(str(wrapper)))

    # The last three rows are discarded -- presumably footer/summary rows
    # rather than teams; TODO confirm this holds for every season scraped.
    # .copy() gives an independent frame before the new column is added.
    team_table = tables[0].iloc[:-3].copy()
    team_table['Season'] = season
    return team_table


def scrape_pitching_for_season(season):
    """Return the team standard-pitching table for ``season`` (see ``_scrape_team_table``)."""
    return _scrape_team_table(season, "pitching")


def scrape_batting_for_season(season):
    """Return the team standard-batting table for ``season`` (see ``_scrape_team_table``)."""
    return _scrape_team_table(season, "batting")


def scrape_fielding_for_season(season):
    """Return the team standard-fielding table for ``season`` (see ``_scrape_team_table``)."""
    return _scrape_team_table(season, "fielding")

def scrape_standings_for_season(season):
    """Download the divisional standings for one season from Baseball-Reference.

    Fetches ``.../leagues/MLB/<season>-standings.shtml`` and, for each of the
    wrapper div ids ``all_standings_W``, ``all_standings_E`` and
    ``all_standings_C`` (in that order), parses the tables found and keeps the
    first two. The six tables are stacked in the order W[0], W[1], E[0], E[1],
    C[0], C[1]; each table keeps its own row index, so index labels repeat in
    the result.

    Parameters
    ----------
    season : int
        Season year; it is interpolated into the URL with ``str()``.

    Returns
    -------
    pandas.DataFrame
        The concatenated standings with an added ``Season`` column.

    Raises
    ------
    ValueError
        If a division wrapper div is missing from the page (or holds no table).
    IndexError
        If a division yields fewer than two tables.
    """
    url = "https://www.baseball-reference.com/leagues/MLB/" + str(season) + "-standings.shtml"
    print(url)

    # Use a context manager so the HTTP response is always closed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response.read(), 'lxml')

    division_tables = []
    for division in ("W", "E", "C"):
        div_id = "all_standings_" + division
        wrappers = soup.find_all('div', {"id": div_id})
        if not wrappers:
            raise ValueError("div with id '" + div_id + "' not found at " + url)

        # Recent pandas versions deprecate passing literal HTML to read_html;
        # wrapping the markup in StringIO works on old and new versions alike.
        tables = pd.read_html(StringIO(str(wrappers)))
        if len(tables) < 2:
            raise IndexError(
                "expected two standings tables in '" + div_id + "', found " + str(len(tables))
            )

        # First table then second table of the division -- in the 2019 page
        # these are the AL and NL divisions respectively; presumably the same
        # for other seasons (TODO confirm).
        division_tables.extend(tables[:2])

    season_standings = pd.concat(division_tables)
    season_standings['Season'] = season
    return season_standings

# Smoke-test each scraper against the 2019 season.
# InteractiveShell.ast_node_interactivity is set to "all" in the first cell,
# so every bare expression below is rendered; keep them as separate top-level
# statements. Note that this issues four live HTTP requests back-to-back.
scrape_pitching_for_season(2019)
scrape_batting_for_season(2019)
scrape_fielding_for_season(2019)
scrape_standings_for_season(2019)

https://www.baseball-reference.com/leagues/MLB/2019-standard-pitching.shtml


Unnamed: 0,Tm,#P,PAge,RA/G,W,L,W-L%,ERA,G,GS,...,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W,LOB,Season
0,ARI,27,28.6,4.59,85,77,0.525,4.25,162,162,...,106,4.4,1.308,8.6,1.4,3.2,8.8,2.77,1092,2019
1,ATL,32,27.5,4.59,97,65,0.599,4.19,162,162,...,113,4.39,1.357,8.8,1.3,3.4,8.6,2.54,1148,2019
2,BAL,39,27.3,6.06,54,108,0.333,5.59,162,162,...,83,5.56,1.459,9.6,1.9,3.5,7.8,2.22,1086,2019
3,BOS,28,29.0,5.11,84,78,0.519,4.7,162,162,...,104,4.28,1.379,8.7,1.3,3.7,10.0,2.7,1159,2019
4,CHC,33,31.1,4.43,84,78,0.519,4.1,162,162,...,106,4.25,1.325,8.6,1.2,3.3,9.0,2.7,1147,2019
5,CHW,27,27.6,5.17,72,89,0.447,4.9,161,161,...,94,4.89,1.43,9.2,1.5,3.7,8.4,2.25,1089,2019
6,CIN,26,28.2,4.39,75,87,0.463,4.18,162,162,...,114,4.23,1.256,7.9,1.3,3.4,9.7,2.9,1032,2019
7,CLE,30,28.3,4.06,93,69,0.574,3.76,162,162,...,127,4.06,1.223,8.2,1.3,2.8,9.4,3.35,1038,2019
8,COL,31,27.3,5.91,71,91,0.438,5.56,162,162,...,92,5.23,1.494,9.8,1.7,3.7,7.9,2.15,1119,2019
9,DET,32,27.8,5.68,47,114,0.292,5.24,161,161,...,90,4.84,1.459,9.8,1.6,3.4,8.6,2.55,1127,2019


https://www.baseball-reference.com/leagues/MLB/2019-standard-batting.shtml


Unnamed: 0,Tm,#Bat,BatAge,R/G,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,LOB,Season
0,ARI,45,28.7,5.02,162,6315,5633,813,1419,288,...,0.757,94,2447,120,70,31,40,36,1119,2019
1,ATL,50,28.0,5.28,162,6302,5560,855,1432,277,...,0.789,97,2514,104,60,25,35,39,1138,2019
2,BAL,58,26.5,4.5,162,6189,5596,729,1379,252,...,0.725,92,2320,111,71,22,37,8,1063,2019
3,BOS,47,27.3,5.56,162,6475,5770,901,1554,345,...,0.806,107,2688,127,49,20,44,36,1170,2019
4,CHC,52,27.7,5.02,162,6195,5461,814,1378,270,...,0.783,103,2468,127,83,30,39,33,1071,2019
5,CHW,47,27.6,4.4,161,6042,5529,708,1443,260,...,0.728,92,2289,114,66,36,32,13,1071,2019
6,CIN,47,27.8,4.33,162,6100,5450,701,1328,235,...,0.736,84,2298,111,89,30,33,25,1073,2019
7,CLE,54,27.7,4.75,162,6124,5425,769,1354,286,...,0.756,94,2345,110,50,40,46,30,1072,2019
8,COL,50,28.2,5.15,162,6288,5660,835,1502,323,...,0.782,90,2579,111,43,51,43,25,1075,2019
9,DET,53,27.6,3.61,161,6039,5549,582,1333,292,...,0.682,79,2154,108,48,9,42,14,1069,2019


https://www.baseball-reference.com/leagues/MLB/2019-standard-fielding.shtml


Unnamed: 0,Tm,#Fld,RA/G,DefEff,G,GS,CG,Inn,Ch,PO,A,E,DP,Fld%,Rtot,Rtot/yr,Rdrs,Rdrs/yr,Rgood,Season
0,ARI,44,4.59,0.692,162,1458,1005,13185.0,6042,4395,1561,86,136,0.986,49,4,112,3,9,2019
1,ATL,50,4.59,0.685,162,1458,1127,13056.0,5952,4352,1522,78,154,0.987,3,0,41,2,-2,2019
2,BAL,57,6.06,0.688,162,1458,989,12987.0,5879,4329,1442,108,155,0.982,6,1,-95,-5,-9,2019
3,BOS,47,5.11,0.673,162,1458,1106,13239.0,5937,4413,1436,88,115,0.985,-21,-2,-28,0,-8,2019
4,CHC,52,4.43,0.681,162,1458,969,12978.0,6063,4326,1619,118,141,0.981,14,1,-14,1,5,2019
5,CHW,47,5.17,0.678,161,1449,1132,12714.0,5880,4238,1525,117,171,0.98,-17,-2,-56,-2,0,2019
6,CIN,47,4.39,0.699,162,1458,972,12942.0,5865,4314,1460,91,124,0.984,16,1,58,0,9,2019
7,CLE,53,4.06,0.695,162,1458,1139,12939.0,5718,4313,1322,83,110,0.985,45,4,82,0,10,2019
8,COL,50,5.91,0.677,162,1458,1112,13038.0,6201,4346,1758,97,165,0.984,16,1,9,1,10,2019
9,DET,53,5.68,0.666,161,1449,1214,12897.0,5861,4299,1452,110,127,0.981,-96,-9,-84,-1,-8,2019


https://www.baseball-reference.com/leagues/MLB/2019-standings.shtml


Unnamed: 0,Tm,W,L,W-L%,GB,Season
0,Houston Astros,107,55,0.66,--,2019
1,Oakland Athletics,97,65,0.599,10.0,2019
2,Texas Rangers,78,84,0.481,29.0,2019
3,Los Angeles Angels,72,90,0.444,35.0,2019
4,Seattle Mariners,68,94,0.42,39.0,2019
0,Los Angeles Dodgers,106,56,0.654,--,2019
1,Arizona Diamondbacks,85,77,0.525,21.0,2019
2,San Francisco Giants,77,85,0.475,29.0,2019
3,Colorado Rockies,71,91,0.438,35.0,2019
4,San Diego Padres,70,92,0.432,36.0,2019


In [3]:
# Scrape batting, fielding, pitching and standings for every season from
# 2000 through 2020 (np.arange excludes the stop value 2021).
seasons = np.arange(2000,2021)

# Collect the per-season frames in lists and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all previously
# collected rows on every iteration.
batting_frames = []
pitching_frames = []
fielding_frames = []
standings_frames = []

for season in seasons:
    team_batting = scrape_batting_for_season(season)
    team_fielding = scrape_fielding_for_season(season)
    team_pitching = scrape_pitching_for_season(season)
    standings = scrape_standings_for_season(season)
    batting_frames.append(team_batting)
    pitching_frames.append(team_pitching)
    fielding_frames.append(team_fielding)
    standings_frames.append(standings)
    # Pause a random 2-5 seconds after each season's four requests.
    time.sleep(np.random.uniform(2,5))

# Each frame keeps its own row index, so index labels repeat across seasons.
team_batting_stats = pd.concat(batting_frames)
team_pitching_stats = pd.concat(pitching_frames)
team_fielding_stats = pd.concat(fielding_frames)
season_standings = pd.concat(standings_frames)


team_batting_stats
team_pitching_stats
team_fielding_stats
season_standings

https://www.baseball-reference.com/leagues/MLB/2000-standard-batting.shtml
https://www.baseball-reference.com/leagues/MLB/2000-standard-fielding.shtml
https://www.baseball-reference.com/leagues/MLB/2000-standard-pitching.shtml
https://www.baseball-reference.com/leagues/MLB/2000-standings.shtml
https://www.baseball-reference.com/leagues/MLB/2001-standard-batting.shtml
https://www.baseball-reference.com/leagues/MLB/2001-standard-fielding.shtml
https://www.baseball-reference.com/leagues/MLB/2001-standard-pitching.shtml
https://www.baseball-reference.com/leagues/MLB/2001-standings.shtml
https://www.baseball-reference.com/leagues/MLB/2002-standard-batting.shtml
https://www.baseball-reference.com/leagues/MLB/2002-standard-fielding.shtml
https://www.baseball-reference.com/leagues/MLB/2002-standard-pitching.shtml
https://www.baseball-reference.com/leagues/MLB/2002-standings.shtml
https://www.baseball-reference.com/leagues/MLB/2003-standard-batting.shtml
https://www.baseball-reference.com/leagu

Unnamed: 0,Tm,#Bat,BatAge,R/G,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,LOB,Season
0,ANA,45,27.6,5.33,162,6373,5628,864,1574,309,...,.825,105,2659,126,47,47,43,43,1173,2000
1,ARI,41,30.8,4.89,162,6241,5527,792,1466,282,...,.763,88,2373,114,59,61,58,37,1128,2000
2,ATL,47,30.8,5.00,162,6275,5489,810,1490,274,...,.775,95,2353,127,59,87,45,38,1192,2000
3,BAL,50,32.1,4.90,162,6238,5549,794,1508,310,...,.776,100,2414,148,49,27,54,34,1129,2000
4,BOS,52,29.3,4.89,162,6371,5630,792,1503,316,...,.764,90,2384,115,42,40,48,40,1226,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,STL,44,28.8,4.14,58,2011,1752,240,410,73,...,.694,92,650,38,33,4,16,3,358,2020
26,TBR,43,27.1,4.82,60,2261,1975,289,470,105,...,.753,112,839,38,28,0,14,9,416,2020
27,TEX,49,27.4,3.73,60,2147,1936,224,420,80,...,.648,75,704,33,24,2,18,3,350,2020
28,TOR,49,25.9,5.03,60,2263,2023,302,516,104,...,.766,109,892,39,12,8,14,4,392,2020


Unnamed: 0,Tm,#P,PAge,RA/G,W,L,W-L%,ERA,G,GS,...,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W,LOB,Season
0,ANA,25,28.9,5.36,82,80,.506,5.00,162,162,...,101,5.46,1.517,9.5,1.4,4.1,5.3,1.28,1188,2000
1,ARI,17,31.6,4.65,85,77,.525,4.35,162,162,...,110,4.28,1.344,9.0,1.2,3.1,7.6,2.44,1073,2000
2,ATL,22,31.4,4.41,95,67,.586,4.05,162,162,...,114,4.19,1.327,8.9,1.0,3.0,6.8,2.26,1130,2000
3,BAL,22,29.1,5.64,74,88,.457,5.37,162,162,...,88,5.01,1.543,9.7,1.3,4.2,6.4,1.53,1220,2000
4,BOS,24,30.2,4.60,85,77,.525,4.23,162,162,...,120,4.29,1.329,8.9,1.1,3.1,6.9,2.25,1122,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,STL,26,27.6,3.95,30,28,.517,3.90,58,58,...,110,4.58,1.226,7.2,1.3,3.9,8.8,2.27,341,2020
26,TBR,25,28.1,3.82,40,20,.667,3.56,60,60,...,116,3.94,1.219,8.1,1.2,2.9,9.4,3.29,400,2020
27,TEX,25,28.7,5.20,22,38,.367,5.02,60,60,...,94,4.88,1.384,8.3,1.4,4.1,8.5,2.07,411,2020
28,TOR,29,29.5,5.20,32,28,.533,4.60,60,60,...,95,4.73,1.462,8.9,1.4,4.3,8.9,2.08,423,2020


Unnamed: 0,Tm,#Fld,RA/G,DefEff,G,GS,CG,Inn,Ch,PO,A,E,DP,Fld%,Rtot,Rtot/yr,Season,Rdrs,Rdrs/yr,Rgood
0,ANA,45,5.36,.699,162,1458,1143,13032.0,6224,4344,1746,134,182,.978,71,7,2000,,,
1,ARI,41,4.65,.687,162,1458,1132,12993.0,5979,4331,1541,107,138,.982,17,2,2000,,,
2,ATL,46,4.41,.692,162,1458,1114,12963.0,6183,4321,1733,129,138,.979,36,3,2000,,,
3,BAL,48,5.64,.684,162,1458,1186,12900.0,5994,4300,1578,116,151,.981,-21,-2,2000,,,
4,BOS,50,4.60,.696,162,1458,1071,13074.0,6114,4358,1647,109,120,.982,38,3,2000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,STL,44,3.95,.731,58,522,374,4257.0,1941,1419,489,33,46,.983,28,8,2020,33,1,3
26,TBR,43,3.82,.693,60,540,355,4749.0,2154,1584,537,33,52,.985,13,3,2020,24,2,-1
27,TEX,49,5.20,.704,60,540,430,4650.0,2057,1550,467,40,40,.981,13,3,2020,11,0,1
28,TOR,48,5.20,.681,60,540,396,4722.0,2148,1575,534,39,47,.982,-11,-3,2020,-39,-1,1


Unnamed: 0,Tm,W,L,W-L%,GB,Season
0,Oakland Athletics,91,70,0.565,--,2000
1,Seattle Mariners,91,71,0.562,0.5,2000
2,Anaheim Angels,82,80,0.506,9.5,2000
3,Texas Rangers,71,91,0.438,20.5,2000
0,San Francisco Giants,97,65,0.599,--,2000
...,...,...,...,...,...,...
0,Chicago Cubs,34,26,0.567,--,2020
1,St. Louis Cardinals,30,28,0.517,3.0,2020
2,Cincinnati Reds,31,29,0.517,3.0,2020
3,Milwaukee Brewers,29,31,0.483,5.0,2020


In [None]:
# Full team name as it appears in the standings 'Tm' column -> the
# abbreviation that the later merge matches against the stat tables' 'Tm'
# column. Relocated/renamed franchises have one entry per name.
TEAM_ABBREVIATIONS = {
    "Anaheim Angels": "ANA",
    "Los Angeles Angels": "LAA",
    "Los Angeles Angels of Anaheim": "LAA",
    "Arizona Diamondbacks": "ARI",
    "Atlanta Braves": "ATL",
    "Baltimore Orioles": "BAL",
    "Boston Red Sox": "BOS",
    "Chicago White Sox": "CHW",
    "Chicago Cubs": "CHC",
    "Cincinnati Reds": "CIN",
    "Cleveland Indians": "CLE",
    "Colorado Rockies": "COL",
    "Detroit Tigers": "DET",
    "Houston Astros": "HOU",
    "Kansas City Royals": "KCR",
    "Los Angeles Dodgers": "LAD",
    "Miami Marlins": "MIA",
    "Florida Marlins": "FLA",
    "Milwaukee Brewers": "MIL",
    "Minnesota Twins": "MIN",
    "New York Yankees": "NYY",
    "New York Mets": "NYM",
    "Oakland Athletics": "OAK",
    "Philadelphia Phillies": "PHI",
    "Pittsburgh Pirates": "PIT",
    "San Diego Padres": "SDP",
    "San Francisco Giants": "SFG",
    "Seattle Mariners": "SEA",
    "St. Louis Cardinals": "STL",
    "Tampa Bay Rays": "TBR",
    "Tampa Bay Devil Rays": "TBD",
    "Texas Rangers": "TEX",
    "Toronto Blue Jays": "TOR",
    "Washington Nationals": "WSN",
    "Montreal Expos": "MON",
}

# One vectorised lookup instead of one boolean scan per team; names that are
# not in the dict end up as NaN, exactly as unmatched rows did before.
season_standings["Tm_Abbrev"] = season_standings["Tm"].map(TEAM_ABBREVIATIONS)

# Rows without an abbreviation are silently dropped by an inner merge on
# 'Tm_Abbrev', so surface them here (warning only -- execution continues).
unmapped_teams = season_standings.loc[season_standings["Tm_Abbrev"].isna(), "Tm"].unique()
if len(unmapped_teams) > 0:
    print("WARNING: no abbreviation defined for:", list(unmapped_teams))

season_standings


In [10]:
# Step 1: batting + pitching on (Tm, Season); columns present in both tables
# are disambiguated with the _Batting / _Pitching suffixes.
batting_pitching = team_batting_stats.merge(
    team_pitching_stats,
    on = ['Tm', 'Season'],
    how = "inner",
    suffixes = ["_Batting", "_Pitching"],
)

# Step 2: add fielding; only the fielding copy of a clashing column is renamed.
bp_fielding = batting_pitching.merge(
    team_fielding_stats,
    on = ['Tm', 'Season'],
    how = "inner",
    suffixes = ["", "_Fielding"],
)

# Step 3: add standings, matching the stat tables' 'Tm' against the
# standings' 'Tm_Abbrev' column; clashing standings columns get _Standings.
all_stats = bp_fielding.merge(
    season_standings,
    left_on = ['Tm', 'Season'],
    right_on = ['Tm_Abbrev', 'Season'],
    how = "inner",
    suffixes = ["", "_Standings"],
)

all_stats

Unnamed: 0,Tm,#Bat,BatAge,R/G,G_Batting,PA,AB,R_Batting,H_Batting,2B,...,Rtot/yr,Rdrs,Rdrs/yr,Rgood,Tm_Standings,W_Standings,L_Standings,W-L%_Standings,GB,Tm_Abbrev
0,ANA,45,27.6,5.33,162,6373,5628,864,1574,309,...,7,,,,Anaheim Angels,82,80,0.506,9.5,ANA
1,ARI,41,30.8,4.89,162,6241,5527,792,1466,282,...,2,,,,Arizona Diamondbacks,85,77,0.525,12.0,ARI
2,ATL,47,30.8,5.00,162,6275,5489,810,1490,274,...,3,,,,Atlanta Braves,95,67,0.586,--,ATL
3,BAL,50,32.1,4.90,162,6238,5549,794,1508,310,...,-2,,,,Baltimore Orioles,74,88,0.457,13.5,BAL
4,BOS,52,29.3,4.89,162,6371,5630,792,1503,316,...,3,,,,Boston Red Sox,85,77,0.525,2.5,BOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625,STL,44,28.8,4.14,58,2011,1752,240,410,73,...,8,33,1,3,St. Louis Cardinals,30,28,0.517,3.0,STL
626,TBR,43,27.1,4.82,60,2261,1975,289,470,105,...,3,24,2,-1,Tampa Bay Rays,40,20,0.667,--,TBR
627,TEX,49,27.4,3.73,60,2147,1936,224,420,80,...,3,11,0,1,Texas Rangers,22,38,0.367,14.0,TEX
628,TOR,49,25.9,5.03,60,2263,2023,302,516,104,...,-3,-39,-1,1,Toronto Blue Jays,32,28,0.533,8.0,TOR


In [12]:
# Write the merged table to disk. `False` must be the Python boolean: the
# lowercase `false` is an undefined name and raises NameError. index=False
# keeps the DataFrame's row index out of the CSV.
all_stats.to_csv("SAL384FinalData.csv", index = False)