In [1]:
import pandas as pd
import numpy as np
import time
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the main dataset

In [2]:
wagering_data = pd.read_excel("Raw Data/Wagering Data.xlsx", sheet_name = "Tabelle1")

In [3]:
display(wagering_data.head(60))
wagering_data.tail(60)

Unnamed: 0,Jurisdiction,Handle,Revenue,Hold,Taxes/Jurisdiction Revenue
0,Arizona,"$5,972,178,331","$478,920,227",8.0%,"$24,305,661"
1,Arkansas,"$235,846,025","$25,667,848",10.9%,"$3,504,223"
2,Colorado,"$9,144,348,841","$599,387,809",6.6%,"$29,134,877"
3,Connecticut,"$1,531,042,830","$130,539,140",8.5%,"$12,855,757"
4,Delaware,"$534,230,610","$80,745,049",15.1%,"$53,577,288"
5,Illinois,"$16,597,597,363","$1,261,861,306",7.6%,"$203,610,968"
6,Indiana,"$10,071,140,268","$830,351,639",8.2%,"$78,883,408"
7,Iowa,"$4,947,057,137","$319,600,659",6.5%,"$23,474,052"
8,Kansas,"$536,800,390","$10,737,110",2.0%,"$1,066,490"
9,Louisiana,"$2,118,561,639","$162,252,420",7.7%,"$30,673,123"


Unnamed: 0,Jurisdiction,Handle,Revenue,Hold,Taxes/Jurisdiction Revenue
1146,2022-08-01 00:00:00,"$128,251,857","$9,460,039",7.4%,"$1,774,343"
1147,2022-09-01 00:00:00,"$207,501,595","$31,894,674",15.4%,"$3,870,435"
1148,October 2022,"$255,528,321","$30,243,842",11.8%,"$4,305,414"
1149,2022-11-01 00:00:00,"$268,638,967","($25,622,399)",-9.5%,"$4,244,196"
1150,Total,"$2,118,561,639","$162,252,420",7.7%,"$30,673,123"
1151,,,,,
1152,Louisiana revenue here.,,,,
1153,,,,,
1154,Arizona sports betting,,,,
1155,,,,,


# Formatting the dataframe

### Seperating the summary from the rest of the data

The first 28 lines are a summary for 2021. We save it seperately in case we want to use it later.

In [4]:
df_summary_21 = wagering_data.iloc[0:29]
df_summary_21.head(3)

Unnamed: 0,Jurisdiction,Handle,Revenue,Hold,Taxes/Jurisdiction Revenue
0,Arizona,"$5,972,178,331","$478,920,227",8.0%,"$24,305,661"
1,Arkansas,"$235,846,025","$25,667,848",10.9%,"$3,504,223"
2,Colorado,"$9,144,348,841","$599,387,809",6.6%,"$29,134,877"


Now we can delete this part from our dataframe.

In [5]:
wagering_data.shape

(1206, 5)

In [6]:
wagering_data2 = wagering_data.iloc[29:]
wagering_data2.head(20)

Unnamed: 0,Jurisdiction,Handle,Revenue,Hold,Taxes/Jurisdiction Revenue
29,New Jersey sports betting,,,,
30,,,,,
31,NJ sports betting is the biggest outside Nevad...,,,,
32,,,,,
33,,Handle,Revenue,Hold,Taxes
34,June 2018,"$16,409,619","$3,458,668",21.1%,"$337,077"
35,July 2018,"$40,682,237","$3,845,880",9.5%,"$377,015"
36,2018-08-01 00:00:00,"$95,634,048","$9,198,272",9.6%,"$1,038,073"
37,2018-09-01 00:00:00,"$183,948,404","$23,775,366",12.9%,"$2,883,517"
38,October 2018,"$260,711,301","$11,686,119",4.5%,"$1,536,282"


### Converting the column names

As we can see, the column names were applied to the summary we seperated, the column "Jurisdiction"
is actually the date column. We change it accordingly and format the names to snakecase.

In [7]:
# Rename Jurisdiction
wagering_data2 = wagering_data2.rename(columns={'Jurisdiction':'date'})

In [8]:
# Snake case
cols = []
for column in wagering_data2.columns:
    cols.append(column.lower())
wagering_data2.columns = cols
cols = []
for column in wagering_data2.columns:
    cols.append(column.replace(' ','_'))
wagering_data2.columns = cols

### Bringing the dataset into one fromat

In [9]:
# Inspecting the first column
wagering_data2.date.value_counts(dropna = False)

NaN                                                                                                                                                                                                                                     150
Total                                                                                                                                                                                                                                    28
2022-09-01 00:00:00                                                                                                                                                                                                                      28
January 2022                                                                                                                                                                                                                             27
2022-08-01 00:00:00                                     

As this data is a copy from a html site, the formatting is nowhere near a standardized dataframe.
First we will get rid of all the empty rows.
We do this by deleting all rows where the first column is empty.
With this step we also get rid of the duplicated column descriptions.

We can do this safely, because if there is data without a date attached we could not use it for the analysis either way.

In [10]:
wagering_data3 = wagering_data2[~wagering_data2['date'].isna()]
wagering_data3.shape

(1027, 5)

In [11]:
display(wagering_data3.head(60))
display(wagering_data3.tail(60))

Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue
29,New Jersey sports betting,,,,
31,NJ sports betting is the biggest outside Nevad...,,,,
34,June 2018,"$16,409,619","$3,458,668",21.1%,"$337,077"
35,July 2018,"$40,682,237","$3,845,880",9.5%,"$377,015"
36,2018-08-01 00:00:00,"$95,634,048","$9,198,272",9.6%,"$1,038,073"
37,2018-09-01 00:00:00,"$183,948,404","$23,775,366",12.9%,"$2,883,517"
38,October 2018,"$260,711,301","$11,686,119",4.5%,"$1,536,282"
39,2018-11-01 00:00:00,"$330,748,563","$21,243,865",6.4%,"$2,730,521"
40,December 2018,"$319,173,548","$20,814,222",6.5%,"$2,695,290"
41,January 2019,"$385,279,662","$18,777,582",4.9%,"$2,532,619"


Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue
1126,2022-09-01 00:00:00,"$130,547,719","$16,676,506",12.8%,"$1,777,391"
1127,October 2022,"$164,170,106","$15,499,673",9.4%,"$1,736,473"
1128,Total,"$1,531,042,830","$130,539,140",8.5%,"$12,855,757"
1130,Connecticut revenue here.,,,,
1132,Louisiana sports betting,,,,
1134,Legal sports betting at commercial casinos beg...,,,,
1137,2021-11-01 00:00:00,"$27,871,038","$5,685,706",20.4%,"$568,571"
1138,December 2021,"$39,517,545","$4,380,700",11.1%,"$438,070"
1139,January 2022,"$89,757,060","($3,703,347)",-4.1%,"$533,259"
1140,February 2022,"$238,413,596","$17,294,691",7.3%,"$2,354,846"


### Creating a state column

There is no column that indicates the state, we can see, that the state is mentioned as the first word in a text in the date column, followed by additional text and then a variing number of rows of actual data.

We can see the following pattern.
The string that contains the state name for the following rows is always in the following format:
"'State name' sports betting."

So, what we will do is create a new column named state and insert the part of the string before sports betting for the following rows until we find the next one that matches the pattern.

In [12]:
# Creating the new column:
wagering_data4 = wagering_data3[['date', 'handle', 'revenue', 'hold', 'taxes/jurisdiction_revenue']]
wagering_data4['state'] = ""
wagering_data4.head(5)

Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue,state
29,New Jersey sports betting,,,,,
31,NJ sports betting is the biggest outside Nevad...,,,,,
34,June 2018,"$16,409,619","$3,458,668",21.1%,"$337,077",
35,July 2018,"$40,682,237","$3,845,880",9.5%,"$377,015",
36,2018-08-01 00:00:00,"$95,634,048","$9,198,272",9.6%,"$1,038,073",


In [13]:
import re

In [14]:
# Testing the regex to see if we find the desired pattern

for row in wagering_data4.index:
        if re.match('.* (sports betting)$', str(wagering_data4.date[row])):
            print(wagering_data4.date[row].replace(" sports betting", ""))



New Jersey
Pennsylvania
Delaware
Mississippi
Nevada
Rhode Island
West Virginia
Arkansas
New York
Iowa
Indiana
Oregon
New Hampshire
Michigan
Montana
Colorado
Washington DC
Illinois
Tennessee
Virginia
Wyoming
South Dakota
Connecticut
Louisiana
Arizona
Maryland
Kansas


In [15]:
# Inserting the state into the according column
current_state = ""
for row in wagering_data4.index:
    if re.match('.* (sports betting)$', str(wagering_data4.date[row])):
        current_state = wagering_data4.date[row].replace(" sports betting", "")
    wagering_data4.state[row] = current_state

In [16]:
display(wagering_data4.head(60))
display(wagering_data4.tail(60))

Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue,state
29,New Jersey sports betting,,,,,New Jersey
31,NJ sports betting is the biggest outside Nevad...,,,,,New Jersey
34,June 2018,"$16,409,619","$3,458,668",21.1%,"$337,077",New Jersey
35,July 2018,"$40,682,237","$3,845,880",9.5%,"$377,015",New Jersey
36,2018-08-01 00:00:00,"$95,634,048","$9,198,272",9.6%,"$1,038,073",New Jersey
37,2018-09-01 00:00:00,"$183,948,404","$23,775,366",12.9%,"$2,883,517",New Jersey
38,October 2018,"$260,711,301","$11,686,119",4.5%,"$1,536,282",New Jersey
39,2018-11-01 00:00:00,"$330,748,563","$21,243,865",6.4%,"$2,730,521",New Jersey
40,December 2018,"$319,173,548","$20,814,222",6.5%,"$2,695,290",New Jersey
41,January 2019,"$385,279,662","$18,777,582",4.9%,"$2,532,619",New Jersey


Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue,state
1126,2022-09-01 00:00:00,"$130,547,719","$16,676,506",12.8%,"$1,777,391",Connecticut
1127,October 2022,"$164,170,106","$15,499,673",9.4%,"$1,736,473",Connecticut
1128,Total,"$1,531,042,830","$130,539,140",8.5%,"$12,855,757",Connecticut
1130,Connecticut revenue here.,,,,,Connecticut
1132,Louisiana sports betting,,,,,Louisiana
1134,Legal sports betting at commercial casinos beg...,,,,,Louisiana
1137,2021-11-01 00:00:00,"$27,871,038","$5,685,706",20.4%,"$568,571",Louisiana
1138,December 2021,"$39,517,545","$4,380,700",11.1%,"$438,070",Louisiana
1139,January 2022,"$89,757,060","($3,703,347)",-4.1%,"$533,259",Louisiana
1140,February 2022,"$238,413,596","$17,294,691",7.3%,"$2,354,846",Louisiana


### Dropping all rows, that contain text instead of our desired data

We can see, that in the text columns, all other columns except the state column are empty, so we delete those where this condition applies.

In [17]:
wagering_data4.isna().sum()

date                            0
handle                        101
revenue                       101
hold                          103
taxes/jurisdiction_revenue    103
state                           0
dtype: int64

In [18]:
display(wagering_data4[wagering_data4.handle.isna() & wagering_data4.revenue.isna()].head(50))
display(wagering_data4[wagering_data4.handle.isna() & wagering_data4.revenue.isna()].tail(50))

Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue,state
29,New Jersey sports betting,,,,,New Jersey
31,NJ sports betting is the biggest outside Nevad...,,,,,New Jersey
90,NJ revenue here.,,,,,New Jersey
92,Pennsylvania sports betting,,,,,Pennsylvania
94,PA sports betting is poised to become one of t...,,,,,Pennsylvania
96,The only hiccup is the high cost of doing busi...,,,,,Pennsylvania
150,PA revenue here.,,,,,Pennsylvania
152,Delaware sports betting,,,,,Delaware
154,Delaware sports betting was the first to go li...,,,,,Delaware
156,Delaware’s monthly reporting schedule ends on ...,,,,,Delaware


Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue,state
711,New Hampshire sports betting launched in late ...,,,,,New Hampshire
713,DraftKings Sportsbook can also open retail loc...,,,,,New Hampshire
753,New Hampshire revenue here.,,,,,New Hampshire
755,Michigan sports betting,,,,,Michigan
757,Michigan sports betting launched retail bettin...,,,,,Michigan
759,Casinos did not reopen until Aug. 5 at 15% max...,,,,,Michigan
761,Michigan launched online sports betting Jan. 2...,,,,,Michigan
799,Michigan revenue here.,,,,,Michigan
801,Montana sports betting,,,,,Montana
803,Montana sports betting launched with retail an...,,,,,Montana


We can see here that all rows where handle and revenue are empty are the text rows, so we go on to delete them.

In [19]:
wagering_data5 = wagering_data4[~ (wagering_data4.handle.isna() & wagering_data4.revenue.isna())]

In [20]:
wagering_data5.shape

(926, 6)

In [21]:
wagering_data5.isna().sum()

date                          0
handle                        0
revenue                       0
hold                          2
taxes/jurisdiction_revenue    2
state                         0
dtype: int64

In [22]:
wagering_data5.head(10)

Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue,state
34,June 2018,"$16,409,619","$3,458,668",21.1%,"$337,077",New Jersey
35,July 2018,"$40,682,237","$3,845,880",9.5%,"$377,015",New Jersey
36,2018-08-01 00:00:00,"$95,634,048","$9,198,272",9.6%,"$1,038,073",New Jersey
37,2018-09-01 00:00:00,"$183,948,404","$23,775,366",12.9%,"$2,883,517",New Jersey
38,October 2018,"$260,711,301","$11,686,119",4.5%,"$1,536,282",New Jersey
39,2018-11-01 00:00:00,"$330,748,563","$21,243,865",6.4%,"$2,730,521",New Jersey
40,December 2018,"$319,173,548","$20,814,222",6.5%,"$2,695,290",New Jersey
41,January 2019,"$385,279,662","$18,777,582",4.9%,"$2,532,619",New Jersey
42,February 2019,"$320,368,087","$12,732,740",4.0%,"$1,817,553",New Jersey
43,March 2019,"$372,451,342","$31,669,387",8.5%,"$4,180,051",New Jersey


As we can see we now have converted our initial file into a useful format

In [23]:
wagering_formatted = wagering_data5.copy()

# Data cleaning

### Cleaning and converting the date column

In [24]:
wagering_formatted['date'].value_counts()

Total                   28
2022-09-01 00:00:00     28
2022-08-01 00:00:00     27
July 2022               27
June 2022               27
May 2022                27
2022-04-01 00:00:00     27
March 2022              27
February 2022           27
January 2022            27
December 2021           27
October 2022            26
2021-11-01 00:00:00     26
October 2021            25
2021-09-01 00:00:00     24
2021-08-01 00:00:00     21
July 2021               21
June 2021               21
May 2021                21
2021-04-01 00:00:00     21
March 2021              21
February 2021           21
January 2021            21
December 2020           20
2020-11-01 00:00:00     20
October 2020            19
2020-09-01 00:00:00     19
July 2020               18
2020-08-01 00:00:00     18
June 2020               16
March 2020              16
May 2020                15
February 2020           13
January 2020            13
December 2019           13
2020-04-01 00:00:00     12
2019-11-01 00:00:00     12
O

As we can see we have still some rows left which are not dates, but summaries of the other data. We will also drop the according rows.

In [25]:
display(wagering_formatted[wagering_formatted['date'] == 'Total'].head(5))

Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue,state
88,Total,"$32,672,114,792","$2,283,153,922",7.0%,"$288,665,486",New Jersey
148,Total,"$18,137,307,269","$1,415,801,172",7.8%,"$346,690,811",Pennsylvania
212,Total,"$534,230,610","$80,745,049",15.1%,"$53,577,288",Delaware
274,Total,"$1,860,162,392","$213,668,015",11.5%,"$25,640,162",Mississippi
341,Total,"$27,723,080,786","$1,585,277,000",5.72%,"$107,006,198",Nevada


Apparently the Total is a summary for the state throughout the time, we don't need that, so we drop the rows.

In [26]:
wagering_formatted = wagering_formatted[~ (wagering_formatted['date'] == 'Total')]
wagering_formatted['date'].value_counts()

2022-09-01 00:00:00     28
2022-08-01 00:00:00     27
July 2022               27
June 2022               27
May 2022                27
2022-04-01 00:00:00     27
March 2022              27
February 2022           27
January 2022            27
December 2021           27
October 2022            26
2021-11-01 00:00:00     26
October 2021            25
2021-09-01 00:00:00     24
2021-08-01 00:00:00     21
July 2021               21
June 2021               21
May 2021                21
2021-04-01 00:00:00     21
March 2021              21
February 2021           21
January 2021            21
December 2020           20
2020-11-01 00:00:00     20
2020-09-01 00:00:00     19
October 2020            19
2020-08-01 00:00:00     18
July 2020               18
March 2020              16
June 2020               16
May 2020                15
February 2020           13
January 2020            13
December 2019           13
2020-04-01 00:00:00     12
2022-11-01 00:00:00     12
2019-11-01 00:00:00     12
O

In [27]:
# Now we have to check for the values Month, GambetDC, April-May 2020 and Commercial Operators
display(wagering_formatted[wagering_formatted['date'].isin(['April-May 2020','Month', 'GambetDC', 'Commercial Operators'])])

Unnamed: 0,date,handle,revenue,hold,taxes/jurisdiction_revenue,state
311,April-May 2020,"$56,263,737","$2,669,000",4.74%,"$180,158",Nevada
619,Month,Handle,Revenue,Hold,Taxes,Indiana
893,GambetDC,Handle,Revenue,Hold,State revenue,Washington DC
925,Commercial Operators,Handle,Revenue,Hold,Taxes,Washington DC


We can see that the last three are meaningless text columns, so we can drop them.

Apparently for 2020 the Nevada revenues for April-May have been summarized, we will split it into three rows and divide the values by two to harmonize it with the rest of the data.

In [28]:
# Dropping the rows
wagering_formatted = wagering_formatted[~ wagering_formatted['date'].isin(['Month', 'GambetDC', 'Commercial Operators'])]

In [29]:
# Creating new rows for April and May 2020
# We insert the values as strings so that we don't run into errors later when converting
# all the other values in the same column to integers.
wagering_formatted = wagering_formatted.append({'date': 'April 2020', 
                                                'handle':'28131868',
                                                'revenue':'1334500',
                                                'hold':'4.74',
                                                'taxes/jurisdiction_revenue':'9079',
                                                'state':'Nevada'
                                               },ignore_index=True)
wagering_formatted = wagering_formatted.append({'date': 'May 2020', 
                                                'handle':'28131868',
                                                'revenue':'1334500',
                                                'hold':'4.74',
                                                'taxes/jurisdiction_revenue':'9079',
                                                'state':'Nevada'
                                               },ignore_index=True)

  wagering_formatted = wagering_formatted.append({'date': 'April 2020',
  wagering_formatted = wagering_formatted.append({'date': 'May 2020',


In [30]:
# Now we can delete the initial summary row
wagering_formatted = wagering_formatted[~ wagering_formatted['date'].isin(['April-May 2020'])]

In [31]:
# Check the values now
wagering_formatted['date'].value_counts()

2022-09-01 00:00:00    28
2022-08-01 00:00:00    27
July 2022              27
June 2022              27
May 2022               27
2022-04-01 00:00:00    27
March 2022             27
February 2022          27
January 2022           27
December 2021          27
October 2022           26
2021-11-01 00:00:00    26
October 2021           25
2021-09-01 00:00:00    24
2021-08-01 00:00:00    21
July 2021              21
June 2021              21
May 2021               21
2021-04-01 00:00:00    21
March 2021             21
February 2021          21
January 2021           21
December 2020          20
2020-11-01 00:00:00    20
October 2020           19
2020-09-01 00:00:00    19
July 2020              18
2020-08-01 00:00:00    18
June 2020              16
May 2020               16
March 2020             16
February 2020          13
January 2020           13
December 2019          13
2020-04-01 00:00:00    12
2019-11-01 00:00:00    12
October 2019           12
2022-11-01 00:00:00    12
2019-09-01 0

Now we can convert the whole column to datetime format and harmonize the format.

In [32]:
wagering_formatted['date'] = pd.to_datetime(wagering_formatted['date'])

For us only month and year are relevant so we just use those.

In [33]:
wagering_formatted['month'] = wagering_formatted['date'].dt.strftime('%m')

In [34]:
wagering_formatted['year'] = wagering_formatted['date'].dt.strftime('%Y')

In [35]:
wagering_formatted = wagering_formatted.drop(['date'], axis= 1)

In [36]:
wagering_formatted.head(5)

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
0,"$16,409,619","$3,458,668",21.1%,"$337,077",New Jersey,6,2018
1,"$40,682,237","$3,845,880",9.5%,"$377,015",New Jersey,7,2018
2,"$95,634,048","$9,198,272",9.6%,"$1,038,073",New Jersey,8,2018
3,"$183,948,404","$23,775,366",12.9%,"$2,883,517",New Jersey,9,2018
4,"$260,711,301","$11,686,119",4.5%,"$1,536,282",New Jersey,10,2018


### Cleaning the handle and revenue columns

In both columns we have dollar values with a $ sign and comma seperators as strings.
There are aso negative values for revenue (in this case possible, more in the descriptive analytics),
either with a - or in parenthesis.
There are alsos ome whitespaces to be ereased.
We use a small cleaning function and apply it to clean both.

In [37]:
def moneyclean(x):
    x = str(x)
    x = x.replace('$','')
    x = x.replace(',','')
    x = x.replace('(','-')
    x = x.replace(')','')
    x = x.replace(' ','')
    if x.replace('-','').isdigit():
        return int(x)
    # We see that we couldn't convert all valuesm so we try to identify the problem
    else:
        print(x)
    return x

In [38]:
wagering_formatted.handle.unique

<bound method Series.unique of 0       $16,409,619
1       $40,682,237
2       $95,634,048
3      $183,948,404
4      $260,711,301
           ...     
892    $160,527,371
893    $189,919,311
894    $186,353,708
895        28131868
896        28131868
Name: handle, Length: 896, dtype: object>

In [39]:
wagering_formatted['handle'] = wagering_formatted['handle'].apply(moneyclean)

–
–
–
–
–
–
–


In [40]:
wagering_formatted['revenue'] = wagering_formatted['revenue'].apply(moneyclean)

–
–
–
–
–
–
–


In [41]:
wagering_formatted['taxes/jurisdiction_revenue'] = wagering_formatted['taxes/jurisdiction_revenue'].apply(moneyclean)

nan
nan
–
—
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
–
31393.43
1022841.29


In [42]:
# We replace the two values that weren't catched by our function manually.
wagering_formatted['taxes/jurisdiction_revenue'][wagering_formatted['taxes/jurisdiction_revenue'] == '31393.43'] = 31393
wagering_formatted['taxes/jurisdiction_revenue'][wagering_formatted['taxes/jurisdiction_revenue'] == '1022841.29'] = 1022841
wagering_formatted[wagering_formatted['taxes/jurisdiction_revenue'] == 1022841]

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
868,486097352,36326542,7.5%,1022841,Arizona,10,2021


In [43]:
wagering_formatted

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
0,16409619,3458668,21.1%,337077,New Jersey,06,2018
1,40682237,3845880,9.5%,377015,New Jersey,07,2018
2,95634048,9198272,9.6%,1038073,New Jersey,08,2018
3,183948404,23775366,12.9%,2883517,New Jersey,09,2018
4,260711301,11686119,4.5%,1536282,New Jersey,10,2018
...,...,...,...,...,...,...,...
892,160527371,1296184,0.8%,129618,Kansas,09,2022
893,189919311,1438196,0.8%,141088,Kansas,10,2022
894,186353708,8002730,4.3%,795784,Kansas,11,2022
895,28131868,1334500,4.74,9079,Nevada,04,2020


The missing values are potentially from covid restrictions in the corresponding states.
Technically the values should be 0, but that would very much influence the whole analysis for the year 2020. So I will make an approximation by inserting the mean value from two month before and after the restrictions.
I deliberately don't take the adjacent month for two reasons:
1. The restrictions could potentially have reached into the adjacent month, making their value also not representative.
2. Pent up demand can often be seen in a short Period before and after restrictions, resulting in misleading values.

In [44]:
# To ease our work from now, since we are done with dropping rows, we sort by state and date and reset the index.
wagering_formatted.sort_values(['state', 'year', 'month'], ascending=[True, True, True], inplace = True)
wagering_formatted.reset_index(inplace = True, drop = True)

In [45]:
wagering_formatted

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
0,291212868,31238322,10.7%,31393,Arizona,09,2021
1,486097352,36326542,7.5%,1022841,Arizona,10,2021
2,466725687,51416143,11.0%,3177148,Arizona,11,2021
3,499213733,39822928,8.0%,1723902,Arizona,12,2021
4,563694591,41890200,7.4%,1952971,Arizona,01,2022
...,...,...,...,...,...,...,...
891,7322852,673269,9.2%,38036,Wyoming,06,2022
892,7246352,680740,9.4%,38365,Wyoming,07,2022
893,10990904,1237815,11.3%,85672,Wyoming,08,2022
894,13575173,1932519,14.2%,115500,Wyoming,09,2022


In [46]:
# We inspect the rows with the dashes:
wagering_formatted[wagering_formatted['handle']== '–']

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
22,–,–,—,–,Arkansas,4,2020
148,–,–,,–,Illinois,4,2020
149,–,–,,–,Illinois,5,2020
287,–,–,—,–,Michigan,4,2020
288,–,–,—,–,Michigan,5,2020
289,–,–,—,–,Michigan,6,2020
290,–,–,—,–,Michigan,7,2020


In [47]:
wagering_formatted.iloc[20:28]

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
20,4294366,411589,9.6%,67494,Arkansas,2,2020
21,1588403,222373,14.0%,38310,Arkansas,3,2020
22,–,–,—,–,Arkansas,4,2020
23,509,-115000,-22.6%,-15,Arkansas,5,2020
24,1189,10,0.8%,1,Arkansas,6,2020
25,179315,-19633,-10.9%,-2552,Arkansas,7,2020
26,1008992,69625,6.9%,9051,Arkansas,8,2020
27,3958748,397095,10.0%,51622,Arkansas,9,2020


In [48]:
# Our desired values lie in the rows with the indexes 20 and 24 for Arkansas
for i in ['handle','revenue','taxes/jurisdiction_revenue']:
    wagering_formatted.iloc[22][i] = (wagering_formatted.iloc[20][i]+wagering_formatted.iloc[24][i])/2

In [49]:
wagering_formatted.iloc[22]

handle                        2147777.5
revenue                        205799.5
hold                                  —
taxes/jurisdiction_revenue      33747.5
state                          Arkansas
month                                04
year                               2020
Name: 22, dtype: object

In [50]:
# For Illinois we have to replace two month
wagering_formatted.iloc[145:153]

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
145,13545163,2463694,18.2%,1841226,Delaware,10,2022
146,10172819,1530355,15.0,1119178,Delaware,11,2022
147,997739,72028,7.2%,12242,Illinois,3,2020
148,–,–,,–,Illinois,4,2020
149,–,–,,–,Illinois,5,2020
150,8281803,368651,4.5%,59446,Illinois,6,2020
151,52524820,3623985,6.9%,585623,Illinois,7,2020
152,140065649,7233360,5.2%,1176754,Illinois,8,2020


In [51]:
# Our desired values lie in the rows with the indexes 146 and 151 for Illinois
for i in ['handle','revenue','taxes/jurisdiction_revenue']:
    wagering_formatted.iloc[148][i] = (wagering_formatted.iloc[146][i]+wagering_formatted.iloc[151][i])/2
    wagering_formatted.iloc[149][i] = (wagering_formatted.iloc[146][i]+wagering_formatted.iloc[151][i])/2

In [52]:
# For Michigan we do the same for 4 month
wagering_formatted.iloc[284:293]
# Our desired values lie in the rows with the indexes 285 and 292

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
284,39663586,5210947,13.1%,781642,Maryland,10,2022
285,219071374,30620539,14.0%,704728,Maryland,11,2022
286,593956,105548,17.8%,8866,Michigan,3,2020
287,–,–,—,–,Michigan,4,2020
288,–,–,—,–,Michigan,5,2020
289,–,–,—,–,Michigan,6,2020
290,–,–,—,–,Michigan,7,2020
291,15744256,1977052,12.6%,166072,Michigan,8,2020
292,33503929,4376407,13.1%,367618,Michigan,9,2020


In [53]:
for i in ['handle','revenue','taxes/jurisdiction_revenue']:
    wagering_formatted.iloc[287][i] = (wagering_formatted.iloc[285][i]+wagering_formatted.iloc[292][i])/2
    wagering_formatted.iloc[288][i] = (wagering_formatted.iloc[285][i]+wagering_formatted.iloc[292][i])/2
    wagering_formatted.iloc[289][i] = (wagering_formatted.iloc[285][i]+wagering_formatted.iloc[292][i])/2
    wagering_formatted.iloc[290][i] = (wagering_formatted.iloc[285][i]+wagering_formatted.iloc[292][i])/2

In [54]:
wagering_formatted[wagering_formatted['handle']== '–']
# We got rid of the missing values.

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year


There is still data missing for the 'taxes/jurisdiction_revenue' column.
I will have to investigate further to know what is the reason here.

In [55]:
wagering_formatted[wagering_formatted['taxes/jurisdiction_revenue']== '–']['state'].value_counts()

Oregon           37
Montana          32
Washington DC    30
Name: state, dtype: int64

Apparently the values affect just 3 states.

Oregon :
Oregon apparently does not disclose the amount of money withheld from sports betting.
But according to several press articles like
https://www.wweek.com/news/2022/11/30/as-sports-betting-takes-hold-nationally-oregon-lawmakers-ponder-what-comes-next-here/
the amount withheld is 51% of revenue, so we calculate the value according.

In [56]:
def oregon_tax(row):
    if row['state'] == 'Oregon':
        return row['revenue']*0.51
    return row['taxes/jurisdiction_revenue']

In [57]:
wagering_formatted['taxes/jurisdiction_revenue'] = wagering_formatted.apply(oregon_tax, axis = 1)

In [58]:
wagering_formatted

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
0,291212868,31238322,10.7%,31393,Arizona,09,2021
1,486097352,36326542,7.5%,1022841,Arizona,10,2021
2,466725687,51416143,11.0%,3177148,Arizona,11,2021
3,499213733,39822928,8.0%,1723902,Arizona,12,2021
4,563694591,41890200,7.4%,1952971,Arizona,01,2022
...,...,...,...,...,...,...,...
891,7322852,673269,9.2%,38036,Wyoming,06,2022
892,7246352,680740,9.4%,38365,Wyoming,07,2022
893,10990904,1237815,11.3%,85672,Wyoming,08,2022
894,13575173,1932519,14.2%,115500,Wyoming,09,2022


In [59]:
wagering_formatted[wagering_formatted['state']== 'Montana']['taxes/jurisdiction_revenue'].value_counts()

–    32
Name: taxes/jurisdiction_revenue, dtype: int64

https://leg.mt.gov/bills/2019/billpdf/SB0330.pdf
All values are missing, we can calculate them, since according to the law above there is a 8.5% tax rate for the sports betting.

In [60]:
def montana_tax(row):
    if row['state'] == 'Montana':
        return row['revenue']*0.085
    return row['taxes/jurisdiction_revenue']

In [61]:
wagering_formatted['taxes/jurisdiction_revenue'] = wagering_formatted.apply(montana_tax, axis = 1)

Finally we have a look at Washington DC

https://news.bloombergtax.com/daily-tax-report/washington-d-c-s-sports-betting-hobbled-by-unique-problems

For Washington DC we find a 10% tax rate on revenue.

In [62]:
def dc_tax(row):
    if row['state'] == 'Washington DC':
        return row['revenue']*0.1
    return row['taxes/jurisdiction_revenue']

In [63]:
wagering_formatted['taxes/jurisdiction_revenue'] = wagering_formatted.apply(dc_tax, axis = 1)

In [64]:
# Seeking the remaining nonb numerical values
# Np.isreal checks if the argument is a real number, we want all rows where this does not apply
wagering_formatted[~wagering_formatted["taxes/jurisdiction_revenue"].apply(np.isreal)]

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
340,1575859,-62231,-3.9%,,Mississippi,6,2020
425,78152387,-483000,-0.62%,,Nevada,6,2020
550,12872999,-179593,-1.4%,—,New York,2,2020


In [65]:
# As we can see we have negative revenues here, so the tax rate will be 0
for i in [340,425,550]:
    wagering_formatted.iloc[i][["taxes/jurisdiction_revenue"]] = 0


### Cleaning the hold column

Apparently some values have been left as strings, we try to convert everything to numbers.

In [66]:
wagering_formatted.dtypes

handle                        object
revenue                       object
hold                          object
taxes/jurisdiction_revenue    object
state                         object
month                         object
year                          object
dtype: object

In [67]:
def to_int(x):
    return int(x)
def to_float(x):
    return round(float(x),2)

In [68]:
wagering_formatted['handle'] = wagering_formatted['handle'].apply(to_int)

In [69]:
wagering_formatted['revenue'] = wagering_formatted['revenue'].apply(to_int)

In [70]:
wagering_formatted['taxes/jurisdiction_revenue'] = wagering_formatted['taxes/jurisdiction_revenue'].apply(to_int)

In [71]:
wagering_formatted.dtypes

handle                         int64
revenue                        int64
hold                          object
taxes/jurisdiction_revenue     int64
state                         object
month                         object
year                          object
dtype: object

The hold is the ratio of how much of the handle is left as revenue for the company.
The easiest way is just to recalculate the column instead of cleaning it.

In [72]:
wagering_formatted.head(10)

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
0,291212868,31238322,10.7%,31393,Arizona,9,2021
1,486097352,36326542,7.5%,1022841,Arizona,10,2021
2,466725687,51416143,11.0%,3177148,Arizona,11,2021
3,499213733,39822928,8.0%,1723902,Arizona,12,2021
4,563694591,41890200,7.4%,1952971,Arizona,1,2022
5,491665554,25629835,5.2%,670686,Arizona,2,2022
6,690979294,37231646,5.4%,1864457,Arizona,3,2022
7,512877848,29249275,5.7%,1647450,Arizona,4,2022
8,461450688,55162888,12.0%,4125125,Arizona,5,2022
9,318774198,15369245,4.8%,766831,Arizona,6,2022


In [73]:
wagering_formatted['hold'] = wagering_formatted['revenue']/wagering_formatted['handle']

In [74]:
wagering_formatted.head(10)

Unnamed: 0,handle,revenue,hold,taxes/jurisdiction_revenue,state,month,year
0,291212868,31238322,0.10727,31393,Arizona,9,2021
1,486097352,36326542,0.074731,1022841,Arizona,10,2021
2,466725687,51416143,0.110164,3177148,Arizona,11,2021
3,499213733,39822928,0.079771,1723902,Arizona,12,2021
4,563694591,41890200,0.074314,1952971,Arizona,1,2022
5,491665554,25629835,0.052129,670686,Arizona,2,2022
6,690979294,37231646,0.053882,1864457,Arizona,3,2022
7,512877848,29249275,0.05703,1647450,Arizona,4,2022
8,461450688,55162888,0.119542,4125125,Arizona,5,2022
9,318774198,15369245,0.048214,766831,Arizona,6,2022


In [75]:
wagering_formatted.dtypes

handle                          int64
revenue                         int64
hold                          float64
taxes/jurisdiction_revenue      int64
state                          object
month                          object
year                           object
dtype: object

In [76]:
# For better visibility we round the hold number to two digits.
wagering_formatted['hold'] = wagering_formatted['hold'].apply(to_float)

# Saving

Our main dataframe is now clean. We save it for later use.

In [77]:
# As csv
wagering_formatted.to_csv('Clean Data/wager_clean.csv', index=False)