# Ballpark Factor

Load the necessary packages

In [145]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings

Define the team names and the team numbers that identify each team in the DakStats schedule URL.

In [146]:
# Each team's short name paired with the numeric id that is appended to the
# schedule URL's `team=` parameter. Kept side by side so the two lists cannot
# drift out of alignment.
_team_ids = [
    ("Bethel", 1629),
    ("Goshen", 1678),
    ("Grace", 1679),
    ("HU", 1688),
    ("IWU", 1694),
    ("Marian", 1717),
    ("MVNU", 1736),
    ("SAU", 1780),
    ("SFU", 1805),
    ("Taylor", 1784),
]
teams = [name for name, _ in _team_ids]
t_nums = [number for _, number in _team_ids]

Scrape the webpage for each team.

In [147]:
# Schedule page URL for each team: the numeric team id is appended as the `team=` value.
urls = ['http://www.dakstats.com/WebSync/Pages/Team/TeamSchedule.aspx?association=10&sg=MBA&sea=NAIMBA_2019&team=' +
        str(num) for num in t_nums]
# Download every schedule page. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
pages = [requests.get(url, timeout=30) for url in urls]
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page and
# failing later with an unrelated indexing error.
for page in pages:
    page.raise_for_status()
# Parse each response body with BeautifulSoup.
# NOTE(review): no parser is named, so BeautifulSoup chooses among the parsers
# that happen to be installed; the table positions relied on later may depend
# on that choice -- consider pinning one once the indices are re-verified.
soups = [BeautifulSoup(page.content) for page in pages]

In [148]:
def _table_to_rows(table):
  """Return one list of stripped <td> texts per <tr> found inside `table`."""
  return [
    [cell.get_text(strip=True) for cell in row.find_all('td')]
    for row in table.find_all('tr')
  ]

# Nested as team_tables[team][table][row][cell], following the order of `soups`.
team_tables = [
  [_table_to_rows(table) for table in soup.find_all('table')]
  for soup in soups
]

The for-loop below prints every table found on Grace College's webpage so that we can see which ones hold the schedule.  It appears that the header is in table 33 and the actual data is in table 35 (counting from 0, as in the printed index).  It seems like a fair assumption that this will be the same for all of the other teams as well.

In [6]:
# Exploration step: print each table parsed from the third team's page next to
# its position, so the table holding the schedule data can be located by eye.
# It only needs to be run once; the output is long.
for position, table in enumerate(team_tables[2]):
  print(position, table)

0 <Element table at 0x143e97d5270>
1 <Element table at 0x143e97d52c0>
2 <Element table at 0x143e97d5310>
3 <Element table at 0x143e97d5360>
4 <Element table at 0x143e97d53b0>
5 <Element table at 0x143e97d5400>
6 <Element table at 0x143e97d5450>
7 <Element table at 0x143e97d54a0>
8 <Element table at 0x143e97d54f0>
9 <Element table at 0x143e97d5540>
10 <Element table at 0x143e97d5590>
11 <Element table at 0x143e97d55e0>
12 <Element table at 0x143e97d5630>
13 <Element table at 0x143e97d5680>
14 <Element table at 0x143e97d56d0>
15 <Element table at 0x143e97d5720>
16 <Element table at 0x143e97d5770>
17 <Element table at 0x143e97d57c0>
18 <Element table at 0x143e97d5810>
19 <Element table at 0x143e97d5860>
20 <Element table at 0x143e97d58b0>
21 <Element table at 0x143e97d5900>
22 <Element table at 0x143e97d5950>
23 <Element table at 0x143e97d59a0>
24 <Element table at 0x143e97d59f0>
25 <Element table at 0x143e97d5a40>
26 <Element table at 0x143e97d5a90>
27 <Element table at 0x143e97d5ae0>
28

In [151]:
# Column names for the five fields kept from every schedule row.
schedule_columns = ['Date', 'Opponent', 'Location', 'Score', 'Outcome']
# One independent copy of the column list per team page.
headers = [list(schedule_columns) for _ in team_tables]
headers[2]

['Date', 'Opponent', 'Location', 'Score', 'Outcome']

In the code below, we use `[:5]` to skip the last column, and we use `[1::2]` because the rows with actual data are the odd rows.

In [152]:
# Position of the table that is read for every team, and how many leading
# cells of each row are kept.
SCHEDULE_TABLE_INDEX = 35
KEPT_COLUMNS = 5

team_rows = []
for tables in team_tables:
    schedule = tables[SCHEDULE_TABLE_INDEX]
    # [1::2] takes every second row, starting from the row at index 1.
    team_rows.append([row[:KEPT_COLUMNS] for row in schedule[1::2]])
team_rows[2][:9]

[['2/27/2019', 'Lourdes (Ohio)', 'N', '3-4', 'L'],
 ['2/27/2019', 'Lourdes (Ohio)', 'N', '4-8', 'L'],
 ['3/2/2019', 'Cornerstone (Mich.)', 'N', '3-4', 'L'],
 ['3/2/2019', 'Trinity Baptist', 'N', '5-1', 'W'],
 ['3/4/2019', 'Michigan-Dearborn', 'N', '13-1', 'W'],
 ['3/5/2019', 'Rochester (Mich.)', 'N', '24-4', 'W'],
 ['3/6/2019', 'Robert Morris (Ill.)', 'N', '10-9', 'W'],
 ['3/8/2019', 'Bethel (Ind.) *', 'N', '13-6', 'W'],
 ['3/9/2019', 'Bethel (Ind.) *', 'N', '14-2', 'W']]

In [153]:
# One DataFrame per team, pairing each team's rows with its column names.
dfc = [
    pd.DataFrame(data=rows, columns=columns)
    for columns, rows in zip(headers, team_rows)
]
dfc[2][:5]

Unnamed: 0,Date,Opponent,Location,Score,Outcome
0,2/27/2019,Lourdes (Ohio),N,3-4,L
1,2/27/2019,Lourdes (Ohio),N,4-8,L
2,3/2/2019,Cornerstone (Mich.),N,3-4,L
3,3/2/2019,Trinity Baptist,N,5-1,W
4,3/4/2019,Michigan-Dearborn,N,13-1,W


Get only the conference games, which are the rows whose opponent name is marked with an asterisk (`*`).

In [159]:
# Keep, for every team, only the rows whose Opponent text contains a literal "*".
conf_df = []
for schedule in dfc:
    has_star = schedule['Opponent'].str.contains("*", regex = False)
    conf_df.append(schedule[has_star])
conf_df[2][:5]

Unnamed: 0,Date,Opponent,Location,Score,Outcome
7,3/8/2019,Bethel (Ind.) *,N,13-6,W
8,3/9/2019,Bethel (Ind.) *,N,14-2,W
9,3/9/2019,Bethel (Ind.) *,N,3-1,W
10,3/14/2019,Taylor (Ind.) *,A,5-15,L
11,3/16/2019,Taylor (Ind.) *,A,2-10,L


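In the code below, `.copy()` makes a shallow copy of the list of dataframes, so that `conf_df` itself is left unchanged; because `.assign()` returns a new dataframe instead of modifying a slice, no warning messages are raised.  The regular expression `\(.*\)` passed to `.str.replace()` deletes any parentheses along with anything inside them.  Then `.str.split()` is used to split the Score column into two columns.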

After that we use `.assign()` to assign the correct data types.

In [160]:
# Start from a shallow copy of the list so that conf_df keeps its original frames.
tidy_conf = conf_df.copy()
for i, df in enumerate(conf_df):
  # Remove any parenthesised text from the score, then split on "-" into the
  # two sides of the score. regex=True is passed explicitly: pandas 2.0+ would
  # otherwise treat the pattern as a literal string and strip nothing.
  split_scores = df['Score'].str.replace(r"\(.*\)", "", regex = True).str.split('-', expand = True)
  # .assign() returns a new frame, so the frames in conf_df are not modified.
  tidy_conf[i] = df.assign(Score = pd.to_numeric(split_scores[0]),
                           Other_score = pd.to_numeric(split_scores[1]),
                           # Raw string: `\*` must reach the regex engine as an escaped
                           # asterisk rather than being an invalid Python string escape.
                           Opponent = df.Opponent.str.replace(r' \*', '', regex= True),
                           Date = pd.to_datetime(df.Date)
                           )
tidy_conf[2][:5]

Unnamed: 0,Date,Opponent,Location,Score,Outcome,Other_score
7,2019-03-08,Bethel (Ind.),N,13,W,6
8,2019-03-09,Bethel (Ind.),N,14,W,2
9,2019-03-09,Bethel (Ind.),N,3,W,1
10,2019-03-14,Taylor (Ind.),A,5,L,15
11,2019-03-16,Taylor (Ind.),A,2,L,10


In [157]:
def _games_with_location(code):
    """Return, for every frame in tidy_conf, the rows whose Location contains `code`."""
    return [
        games[games['Location'].str.contains(code, regex = False)]
        for games in tidy_conf
    ]

conf_df_h = _games_with_location("H")
conf_df_a = _games_with_location("A")

In [158]:
def _score_lists(frames):
    """Return the Score column of each frame as a plain Python list."""
    return [frame['Score'].tolist() for frame in frames]

home_scores = _score_lists(conf_df_h)
away_scores = _score_lists(conf_df_a)
home_scores[2]

[12, 2, 10, 16, 3, 1, 0, 4, 3, 5, 2, 2]