In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import matplotlib.pyplot as plt
import altair as alt
import plotly.express as px

In [12]:
# Read the batting table from the relative path data.csv; the bare name on
# the last line makes the notebook render the resulting frame.
data = pd.read_csv("data.csv")
data

Unnamed: 0,Year,Age,Tm,Lg,G,PA,AB,R,H,2B,...,TB,GDP,HBP,SH,SF,IBB,Pos,Awards,WAR,Unnamed: 30
0,1954,20,MLN,NL,122.0,509.0,468.0,58.0,131.0,27.0,...,209.0,13.0,3.0,6.0,4.0,0.0,*79/H,RoY-4,143.0,
1,1962,22,MLN,NL,141.0,382.0,334.0,54.0,77.0,20.0,...,125.0,10.0,0.0,4.0,3.0,0.0,*37H/45,,-2.8,
2,1998,25,CHW,AL,89.0,261.0,244.0,33.0,68.0,14.0,...,120.0,2.0,0.0,2.0,5.0,1.0,897H/D,,-1.4,
3,1994,25,FLA,NL,101.0,371.0,345.0,41.0,86.0,17.0,...,136.0,5.0,5.0,3.0,2.0,1.0,*6/H,,0.5,
4,2006,25,FLA,NL,111.0,281.0,255.0,39.0,54.0,12.0,...,85.0,2.0,3.0,4.0,1.0,2.0,8H9/7,,-0.2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4378,1973,24,PIT,NL,103.0,354.0,333.0,44.0,108.0,23.0,...,175.0,8.0,0.0,0.0,0.0,0.0,97H,RoY-9,25.1,
4379,2006,25,TBD,AL,52.0,198.0,183.0,10.0,41.0,6.0,...,57.0,2.0,0.0,2.0,3.0,1.0,6/H,,44.5,
4380,2001,26,CHC,NL,49.0,118.0,106.0,11.0,23.0,3.0,...,44.0,3.0,3.0,0.0,1.0,1.0,3H,,-0.2,
4381,1992,25,BOS,AL,124.0,432.0,392.0,46.0,108.0,19.0,...,138.0,6.0,4.0,7.0,4.0,1.0,*879H/D,,-0.7,


In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) for the WAR column.
data["WAR"].describe()

count    4383.000000
mean        8.375405
std        16.094450
min        -6.900000
25%        -0.400000
50%         1.500000
75%        10.500000
max       162.800000
Name: WAR, dtype: float64

This distribution tells us a lot about the WAR statistic. The median is 1.5, while the mean is 8.375. This indicates that there are major outliers on the upper end of the scale, pulling the mean up. This makes a lot of sense, since WAR has a high ceiling but not a correspondingly low floor: players with low WARs will not get as many opportunities with teams.

In [14]:
# Box-and-whisker plot of the WAR column.
# `px` is already imported in the notebook's import cell, so it is not
# re-imported here; keeping every import in that one cell makes the
# notebook's dependencies visible in a single place.
fig = px.box(data, y="WAR", title='Box and Whisker Plot of WAR')
fig.show()

The box and whisker plot of WAR is interesting and indicative of some potential points of issue in our model. As seen in the plot, we have a very large number of outliers.

In [15]:
# Scatter plot of the HR column (x) against WAR (y).
px.scatter(data, x="HR", y="WAR", title="Home Runs vs WAR")

The plot of Home Runs vs. WAR is an interesting visualization because we don't really see any direct correlation between the two features. There are lots of players who have zero home runs and a good WAR, and there are players who have dozens of home runs and a terrible WAR.

In [16]:
# Scatter plot of the AB column (x) against WAR (y).
px.scatter(data, x="AB", y="WAR", title="At Bats vs WAR")

In [17]:
# Scatter plot of the R column (x) against WAR (y).
px.scatter(data, x="R", y="WAR", title="Runs vs WAR")

In [18]:
# Scatter plot of the OPS+ column (x) against WAR (y).
px.scatter(data, x="OPS+", y="WAR", title="OPS+ vs WAR")

In [19]:
# Scatter plot of the TB column (x) against WAR (y).
px.scatter(data, x="TB", y="WAR", title="Total Bases vs WAR")

In [20]:
# Scatter plot of the BA column (x) against WAR (y).
px.scatter(data, x="BA", y="WAR", title="Batting Average vs WAR")

In [21]:
# Pairwise correlation matrix (DataFrame.corr with its default settings)
# between WAR and the selected batting columns.
data[
    [
        "WAR",
        "R", "AB", "G",
        "HR", "TB", "OPS+",
        "BA", "RBI", "H", "SO",
    ]
].corr()

Unnamed: 0,WAR,R,AB,G,HR,TB,OPS+,BA,RBI,H,SO
WAR,1.0,0.430129,0.359966,0.297134,0.337828,0.417056,0.310772,0.249536,0.392324,0.389164,0.230391
R,0.430129,1.0,0.917025,0.848204,0.638797,0.941742,0.488633,0.47304,0.836353,0.936311,0.652569
AB,0.359966,0.917025,1.0,0.925597,0.579094,0.953079,0.369394,0.393774,0.846959,0.97972,0.686193
G,0.297134,0.848204,0.925597,1.0,0.536335,0.870378,0.318793,0.32355,0.779762,0.894691,0.656006
HR,0.337828,0.638797,0.579094,0.536335,1.0,0.747051,0.55501,0.270433,0.808806,0.579357,0.704769
TB,0.417056,0.941742,0.953079,0.870378,0.747051,1.0,0.547044,0.507933,0.928759,0.971055,0.707642
OPS+,0.310772,0.488633,0.369394,0.318793,0.55501,0.547044,1.0,0.807117,0.542013,0.472898,0.315241
BA,0.249536,0.47304,0.393774,0.32355,0.270433,0.507933,0.807117,1.0,0.447494,0.532096,0.130462
RBI,0.392324,0.836353,0.846959,0.779762,0.808806,0.928759,0.542013,0.447494,1.0,0.862929,0.669338
H,0.389164,0.936311,0.97972,0.894691,0.579357,0.971055,0.472898,0.532096,0.862929,1.0,0.62842


In this correlation table, we are only concerned with the WAR row, which gives us some interesting insights. The highest correlations with WAR come from R, TB, and RBI. These are definitely some variables we will experiment with as features for our machine learning models.