In [1]:
import pandas as pd
import os

In [2]:
# Load the two prepared frames from the sibling pickle directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
_pickle_dir = os.path.join('..', 'all_that_pickle')
team_splits = pd.read_pickle(os.path.join(_pickle_dir, 'team_splits_periods.pickle'))
scoring = pd.read_pickle(os.path.join(_pickle_dir, 'scoring.pickle'))

In [3]:
# Keep only the Anaheim Ducks rows, ordered by their (monthly) index.
ducks = team_splits[team_splits["name"] == "Anaheim Ducks"].sort_index()
ducks.head()

Unnamed: 0_level_0,name,L,OL,T,W
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-01,Anaheim Ducks,5.0,2.0,,4.0
2006-02,Anaheim Ducks,5.0,2.0,,5.0
2006-03,Anaheim Ducks,3.0,2.0,,10.0
2006-04,Anaheim Ducks,0.0,2.0,,1.0
2006-10,Anaheim Ducks,0.0,3.0,,9.0


In [4]:
# Yearly totals; "A" is the year-end frequency alias.
# Only the numeric count columns are summed: the frame also carries the text column
# `name`, which newer pandas versions no longer drop automatically in reductions.
ducks[["L", "OL", "T", "W"]].resample("A").sum()

Unnamed: 0_level_0,L,OL,T,W
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006,20.0,14.0,0.0,48.0
2007,27.0,8.0,0.0,47.0
2008,33.0,7.0,0.0,42.0
2009,32.0,11.0,0.0,39.0
2010,30.0,5.0,0.0,47.0
2011,36.0,12.0,0.0,34.0


In [7]:
# Inspect the index of the Ducks frame (the output shows a monthly PeriodIndex named "month").
ducks.index

PeriodIndex(['2006-01', '2006-02', '2006-03', '2006-04', '2006-10', '2006-11',
             '2006-12', '2007-01', '2007-02', '2007-03', '2007-04', '2007-09',
             '2007-10', '2007-11', '2007-12', '2008-01', '2008-02', '2008-03',
             '2008-04', '2008-10', '2008-11', '2008-12', '2009-01', '2009-02',
             '2009-03', '2009-04', '2009-10', '2009-11', '2009-12', '2010-01',
             '2010-02', '2010-03', '2010-04', '2010-10', '2010-11', '2010-12',
             '2011-01', '2011-02', '2011-03', '2011-04', '2011-10', '2011-11',
             '2011-12'],
            dtype='period[M]', name='month', freq='M')

In [8]:
# Months before May are moved one year ahead. The period index is converted to
# timestamps first so that a DateOffset can be added, then mapped value by value.

def _push_early_months(ts):
    """Return ``ts`` moved one year ahead when its month is before May, else unchanged."""
    if ts.month < 5:
        return ts + pd.DateOffset(years=1)
    return ts

ducks.index = ducks.to_timestamp().index.map(_push_early_months)
ducks = ducks.sort_index()
ducks.head()

Unnamed: 0_level_0,name,L,OL,T,W
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-10-01,Anaheim Ducks,0.0,3.0,,9.0
2006-11-01,Anaheim Ducks,2.0,3.0,,10.0
2006-12-01,Anaheim Ducks,5.0,0.0,,9.0
2007-01-01,Anaheim Ducks,5.0,2.0,,4.0
2007-02-01,Anaheim Ducks,5.0,2.0,,5.0


In [10]:
# First ten entries of the shifted, re-sorted index.
ducks.index[:10]

DatetimeIndex(['2006-10-01', '2006-11-01', '2006-12-01', '2007-01-01',
               '2007-02-01', '2007-03-01', '2007-04-01', '2007-09-01',
               '2007-10-01', '2007-11-01'],
              dtype='datetime64[ns]', name='month', freq=None)

In [11]:
# "A-JUN" bins the rows into years that end in June (the repr below shows YearEnd: month=6),
# so a row dated 2006-10 and one dated 2007-04 fall into the same bin (2007-06-30).
resampler = ducks.resample("A-JUN")
resampler

DatetimeIndexResampler [freq=<YearEnd: month=6>, axis=0, closed=right, label=right, convention=start, base=0]

In [12]:
# Totals per June-ending year. The numeric count columns are selected explicitly because
# the frame also holds the text column `name`, which newer pandas versions no longer
# drop automatically when summing.
resampler[["L", "OL", "T", "W"]].sum()

Unnamed: 0_level_0,L,OL,T,W
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-06-30,20.0,14.0,0.0,48.0
2008-06-30,27.0,8.0,0.0,47.0
2009-06-30,33.0,7.0,0.0,42.0
2010-06-30,32.0,11.0,0.0,39.0
2011-06-30,30.0,5.0,0.0,47.0
2012-06-30,36.0,12.0,0.0,34.0


In [13]:
# Running (expanding) totals inside each June-ending year. The resampler is not asked
# for .expanding() directly (resampler.expanding().sum() cannot be used); instead a
# function is applied to every bin: reset the datetime index, keep W and L, and take
# their expanding sum.

def _running_wins_losses(season):
    """Return the cumulative W and L sums for the rows of one resample bin."""
    counts = season.reset_index()[["W", "L"]]
    return counts.expanding().sum()

res = resampler.apply(_running_wins_losses)
res.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,W,L
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2007-06-30,0,9.0,0.0
2007-06-30,1,19.0,2.0
2007-06-30,2,28.0,7.0
2007-06-30,3,32.0,12.0
2007-06-30,4,37.0,17.0
2007-06-30,5,47.0,20.0
2007-06-30,6,48.0,20.0
2008-06-30,0,1.0,1.0


In [17]:
# Replace the (bin, position) index produced by apply with the original monthly timestamps.
# This is a positional assignment: it relies on res having exactly one row per row of ducks,
# in the same order (ducks was sorted by index before resampling).
res.index = ducks.index # we have the same no of rows as the original data & the order is preserved.
res.head()

Unnamed: 0_level_0,W,L
month,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-10-01,9.0,0.0
2006-11-01,19.0,2.0
2006-12-01,28.0,7.0
2007-01-01,32.0,12.0
2007-02-01,37.0,17.0


In [15]:
# Put the data on a regular month-start grid ("MS"); months without a row become NaN.
final = res.asfreq("MS")
final.head(10)

Unnamed: 0_level_0,W,L
month,Unnamed: 1_level_1,Unnamed: 2_level_1
2006-10-01,9.0,0.0
2006-11-01,19.0,2.0
2006-12-01,28.0,7.0
2007-01-01,32.0,12.0
2007-02-01,37.0,17.0
2007-03-01,47.0,20.0
2007-04-01,48.0,20.0
2007-05-01,,
2007-06-01,,
2007-07-01,,


In [23]:
# Requires bokeh. If it is missing, install it into the kernel's environment with: %pip install bokeh

In [26]:
from bokeh.plotting import figure, output_notebook, show
# Configure Bokeh to render plots inline in the notebook when show(...) is called.
output_notebook()

In [25]:
from bokeh.models.sources import ColumnDataSource as cds

In [35]:
# Running wins and losses per season, drawn as two lines on a shared datetime axis.
source = cds(final)
p = figure(x_axis_type="datetime", height=500, width=500)
# `legend_label` is the current keyword for fixed legend text; the older `legend`
# keyword is deprecated in Bokeh and no longer accepted by recent releases.
p.line(source=source, x="month", y="W", color="green", legend_label="Wins")
p.line(source=source, x="month", y="L", color="red", legend_label="Losses")
p.legend.click_policy = "hide"  # clicking a legend entry hides/shows that line
p.title.text = "Anaheim Ducks Performance by Season"
p.title.text_font_size = "20px"
p.title.align = "center"
# Label the axes so the figure can be read on its own.
p.xaxis.axis_label = "Month"
p.yaxis.axis_label = "Games (running total within season)"

In [36]:
# Render the line plot built in the previous cell.
show(p)  # green = wins & red = losses

In [38]:
# Rows for player 'gretzwa01', reduced to the columns needed below, plus a derived
# points-per-game column (Pts divided by GP).
wayne = (
    scoring
    .set_index("playerID")
    .loc['gretzwa01'][['year', 'tmID', 'GP', 'Pts']]
    .assign(pts_per_game=lambda season: season['Pts'] / season['GP'])
)
wayne.head()

Unnamed: 0_level_0,year,tmID,GP,Pts,pts_per_game
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gretzwa01,1980,EDM,80.0,164.0,2.05
gretzwa01,1981,EDM,80.0,212.0,2.65
gretzwa01,1982,EDM,80.0,196.0,2.45
gretzwa01,1983,EDM,74.0,205.0,2.77027
gretzwa01,1984,EDM,80.0,208.0,2.6


In [39]:
# Peek at the team column; it is categorical and still lists every category of the full data set.
wayne["tmID"].head()

playerID
gretzwa01    EDM
gretzwa01    EDM
gretzwa01    EDM
gretzwa01    EDM
gretzwa01    EDM
Name: tmID, dtype: category
Categories (37, object): [ANA, AND, ATL, BOS, ..., VAN, WAS, WIN, WPG]

In [40]:
# Categorical-specific methods are reached through the .cat accessor, the same
# convention as .str for string operations.
# Drop the categories that never occur in these rows. The column is replaced with plain
# column assignment: in recent pandas versions `.loc[:, col] = ...` writes into the
# existing column, which is not a reliable way to swap in a different categorical dtype.
wayne["tmID"] = wayne["tmID"].cat.remove_unused_categories()
wayne["tmID"].head()

playerID
gretzwa01    EDM
gretzwa01    EDM
gretzwa01    EDM
gretzwa01    EDM
gretzwa01    EDM
Name: tmID, dtype: category
Categories (4, object): [EDM, LAK, NYR, STL]

In [41]:
# Every categorical value is backed by an integer code, exposed through .cat.codes.
# Storing the codes as a column makes explicit which number each tmID maps to on the y axis.
wayne["tmCode"] = wayne.tmID.cat.codes
wayne.sample(n=5)

Unnamed: 0_level_0,year,tmID,GP,Pts,pts_per_game,tmCode
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gretzwa01,1984,EDM,80.0,208.0,2.6,0
gretzwa01,1983,EDM,74.0,205.0,2.77027,0
gretzwa01,1985,EDM,80.0,215.0,2.6875,0
gretzwa01,1987,EDM,64.0,149.0,2.328125,0
gretzwa01,1995,STL,18.0,21.0,1.166667,3


In [42]:
# Turn the year into a timestamp so it can be drawn on Bokeh's datetime x axis
# (the original note says Bokeh does not recognise the Period type — TODO confirm the
# stored dtype of `year`).
# Plain column assignment is used because the column changes dtype here, and
# `.loc[:, col] = ...` is treated as an in-place write in recent pandas versions.
wayne["year"] = pd.to_datetime(wayne["year"], format="%Y")

In [43]:
# Value ranges used later for scaling bar heights and for the colour mapping.
gp_min, gp_max = wayne["GP"].min(), wayne["GP"].max()
pts_per_game_min, pts_per_game_max = wayne["pts_per_game"].min(), wayne["pts_per_game"].max()

In [44]:
# Bar geometry: each bar starts at its team's integer code, and its height is the
# season's games played as a fraction of the maximum games played.
wayne["height"] = wayne["GP"] / gp_max
wayne["bottom"] = wayne["tmCode"]
wayne["top"] = wayne["bottom"] + wayne["height"]
wayne.head()

Unnamed: 0_level_0,year,tmID,GP,Pts,pts_per_game,tmCode,height,bottom,top
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
gretzwa01,1980-01-01,EDM,80.0,164.0,2.05,0,0.97561,0,0.97561
gretzwa01,1981-01-01,EDM,80.0,212.0,2.65,0,0.97561,0,0.97561
gretzwa01,1982-01-01,EDM,80.0,196.0,2.45,0,0.97561,0,0.97561
gretzwa01,1983-01-01,EDM,74.0,205.0,2.77027,0,0.902439,0,0.902439
gretzwa01,1984-01-01,EDM,80.0,208.0,2.6,0,0.97561,0,0.97561


In [49]:
# First attempt at the bar chart: one bar per season on a categorical team axis.
team_axis = list(wayne["tmID"].cat.categories)
src = cds(wayne)
p = figure(x_axis_type='datetime', height=500, width=500, y_range=team_axis)
p.vbar(x='year', bottom='bottom', top='top', width=1, source=src)

# Same tick-label size on both axes.
p.yaxis.major_label_text_font_size = '9pt'
p.xaxis.major_label_text_font_size = '9pt'

In [50]:
# Render the first version of the bar chart.
show(p)

In [51]:
# To make the plot above nicer, list the teams in the order they first appear by year.
reordered = wayne.sort_values(by="year")["tmID"].unique()
reordered

[EDM, LAK, STL, NYR]
Categories (4, object): [EDM, LAK, STL, NYR]

In [65]:
# Re-code the teams in chronological order and rebuild the bar geometry.
# Columns are replaced with plain assignment: in recent pandas versions
# `.loc[:, col] = ...` writes into the existing column, which can leave the old
# category order of tmID in place and so make the reorder a silent no-op.
wayne["tmID"] = wayne["tmID"].cat.reorder_categories(reordered)
wayne["tmCode"] = wayne["tmID"].cat.codes
wayne["height"] = wayne["GP"] / gp_max * 0.8  # scale down to add more vertical spacing
wayne["bottom"] = wayne["tmCode"] - (wayne["height"] / 2.0) + 0.5  # centre each bar on the middle of its level
wayne["top"] = wayne["bottom"] + wayne["height"]

In [66]:
# Bar width for the datetime axis: 320 days expressed in milliseconds
# (presumably the unit Bokeh uses for widths on datetime axes — verify against the Bokeh docs).
_MS_PER_DAY = 24 * 60 * 60 * 1000
width = 320 * _MS_PER_DAY

In [None]:
from bokeh.models import ColorBar, LinearColorMapper

# Colour scale for the bars: points per game is mapped linearly onto the Plasma256
# palette, spanning the observed minimum and maximum.
color_mapper = LinearColorMapper(palette="Plasma256", low=pts_per_game_min, high=pts_per_game_max)

In [77]:
from bokeh.models.tools import HoverTool

# Hover tool configuration: (label, "@column") pairs whose values come from the plot's data source.
_tooltip_rows = [
    ("Points per game", "@pts_per_game"),
    ("Games played", "@GP"),
]
hover = HoverTool(tooltips=_tooltip_rows)

In [75]:
# Final bar chart: one bar per season on the team axis, coloured by points per game.
src = cds(wayne)
p = figure(x_axis_type='datetime', height=500, width=500, y_range=list(wayne.tmID.cat.categories))
# Attach the hover tool created earlier; without this call its tooltips never appear.
p.add_tools(hover)
p.vbar(bottom='bottom', top='top', x='year', width=width, source=src,
       color={'field': 'pts_per_game', 'transform': color_mapper})
# Colour bar on the right serves as the legend for the bar colours.
color_bar = ColorBar(color_mapper=color_mapper,
                     label_standoff=8, border_line_color=None,
                     location=(0,0))
p.add_layout(color_bar, 'right')
p.xaxis.major_label_text_font_size = '9pt'
p.yaxis.major_label_text_font_size = '9pt'

In [76]:
# Render the coloured bar chart with its colour bar.
show(p)