In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the cleaned TAB betting data; the path is relative to the notebook's directory.
df = pd.read_csv('../dataset/TAB_Betting_Data_cleaned.csv')

# Quick inspection: first rows, overall shape and missing values per column.
print(df.head())

print(df.shape)

print(df.isnull().sum())

# Shape after removing fully duplicated rows. 'Unnamed: 0' appears to be the row
# index that was written out with the CSV (TODO confirm); being different on
# every row it would hide every real duplicate, so it is left out of the check.
# errors='ignore' keeps the cell working if the column is absent.
print(df.drop(columns='Unnamed: 0', errors='ignore').drop_duplicates().shape)

# Parse the date column so later cells can use the .dt accessor on it.
df['DATE_DIM'] = pd.to_datetime(df['DATE_DIM'])

print(df.shape)

   Unnamed: 0    DATE_DIM DAY_OF_WEEK  BET_ACCOUNT_NUM_HASH   AGE AGE_BAND  \
0           0  2021-01-01         Fri                 13154  67.0      65+   
1           1  2021-01-01         Fri                 18379  54.0    45-54   
2           2  2021-01-01         Fri                559232  63.0    55-64   
3           3  2021-01-01         Fri                698904  69.0      65+   
4           4  2021-01-01         Fri                762921  67.0      65+   

  GENDER  TENURE_IN_DAYS RESIDENTIAL_STATE  FOB_RACING_TURNOVER  \
0      M           11846                WA                 37.0   
1      M            1884                WA                 40.0   
2      M            2866                WA                  0.0   
3      M            2100                WA                  0.0   
4      M            4766                WA                  0.0   

   FOB_SPORT_TURNOVER  PARI_RACING_TURNOVER  PARI_SPORT_TURNOVER  \
0                 0.0                1081.0                 

In [4]:
# EDA: keep the first record seen for each account hash and only the two
# demographic columns, giving one row per customer.
df_customer = df.loc[~df['BET_ACCOUNT_NUM_HASH'].duplicated(), ['GENDER_FULL', 'AGE_BAND']]

In [5]:
# (rows, columns) of the per-customer frame built above
df_customer.shape

(144869, 2)

In [6]:
# Histogram of the GENDER_FULL column of the per-customer frame
fig = px.histogram(data_frame=df_customer, x='GENDER_FULL')
fig.show()

In [5]:
# Histogram of the AGE_BAND column of the per-customer frame
fig = px.histogram(data_frame=df_customer, x='AGE_BAND')
fig.show()

In [8]:
# Total turnover per date, plus the three dates with the highest totals.
df_total_turnover = df.groupby('DATE_DIM')['TOTAL_TURNOVER'].sum().reset_index()

# `tgl` is the marker label used in the chart: "<truncated total> (<day> <month name>)".
# It is built with .assign on the sorted/sliced frame instead of assigning a new
# column onto the result of .head(), which avoids chained-assignment warnings,
# and it is vectorized rather than formatted row by row in a Python loop.
# astype(int) truncates toward zero, matching the original "%d" formatting.
top_dates = (
    df_total_turnover
    .sort_values(by='TOTAL_TURNOVER', ascending=False)
    .head(3)
    .assign(
        tgl=lambda top: (
            top['TOTAL_TURNOVER'].astype(int).astype(str)
            + ' ('
            + top['DATE_DIM'].dt.strftime('%d %B')
            + ')'
        )
    )
)
top_dates

Unnamed: 0,DATE_DIM,TOTAL_TURNOVER,tgl
305,2021-11-02,12790451.92,12790451 (02 November)
669,2022-11-01,12011107.99,12011107 (01 November)
687,2022-11-19,8589238.69,8589238 (19 November)


In [13]:
# Total turnover per date, with the three top dates overlaid as labelled red markers.
# The date columns are wrapped in np.array before being handed to plotly.
fig = go.Figure(
    data=[
        # Full series of totals per date.
        go.Scatter(
            x=np.array(df_total_turnover['DATE_DIM']),
            y=df_total_turnover['TOTAL_TURNOVER'],
            marker_color='black',
            text="totals",
        ),
        # Top dates: marker plus the pre-formatted `tgl` label.
        go.Scatter(
            x=np.array(top_dates['DATE_DIM']),
            y=top_dates['TOTAL_TURNOVER'],
            mode='markers+text',
            marker=dict(color='red', size=6),
            text=top_dates["tgl"],
            textposition='top left',
            textfont=dict(color='#233a77'),
        ),
    ],
    layout=dict(
        title='Total Turnover of customer from 2021-01-01 to 2022-12-31',
        xaxis=dict(title="Time"),
        yaxis=dict(title="Total turnovers"),
        showlegend=False,
    ),
)
fig.show()

In [3]:
# Revenue from each betting method: a small Dash app whose dropdown switches the
# monthly turnover line chart between the four turnover columns.
from dash import Dash, dcc, html, Input, Output

list_cate_turnover = ['FOB_RACING_TURNOVER', 'FOB_SPORT_TURNOVER', 'PARI_RACING_TURNOVER', 'PARI_SPORT_TURNOVER']

# Monthly totals per turnover column; after reset_index, DATE_DIM holds the
# monthly period each row was grouped under.
df_turnover_by_month = (
    df.groupby(df['DATE_DIM'].dt.to_period('M'))[list_cate_turnover]
    .sum()
    .reset_index()
)

# One dropdown entry per turnover column; label and value are both the column name.
options = [{'label': col, 'value': col} for col in list_cate_turnover]

app = Dash(__name__)
app.layout = html.Div([
    html.H4('Turnover betting method analysis'),
    dcc.Graph(id="time-series-chart"),
    html.P("Select betting:"),
    dcc.Dropdown(
        id="betting",
        options=options,
        value="FOB_RACING_TURNOVER",
        clearable=False,
    ),
])


@app.callback(
    Output("time-series-chart", "figure"),
    Input("betting", "value"))
def display_time_series(betting):
    """Build the monthly turnover line chart for the selected column.

    Args:
        betting: Name of the turnover column chosen in the "betting" dropdown.

    Returns:
        A Plotly figure plotting that column of ``df_turnover_by_month``
        against the month, with the x axis titled "Date".
    """
    # The month values are converted to strings before being passed to plotly.
    fig = px.line(
        df_turnover_by_month,
        x=df_turnover_by_month['DATE_DIM'].astype(dtype=str),
        y=df_turnover_by_month[betting],
    ).update_layout(xaxis_title="Date")
    return fig


# `run_server` is deprecated in favour of `run` and is rejected by recent Dash
# releases, so `run` is used here.
app.run(debug=True)

In [23]:
# Total turnover per date for each of the four betting-method columns, as a line chart.
df_turnover = (
    df.groupby('DATE_DIM')[
        ['FOB_RACING_TURNOVER', 'FOB_SPORT_TURNOVER', 'PARI_RACING_TURNOVER', 'PARI_SPORT_TURNOVER']
    ]
    .sum()
    .reset_index()
)

fig = px.line(
    df_turnover,
    x="DATE_DIM",
    y=df_turnover.columns,
    title='Total turnover value of each betting method',
    hover_data={"DATE_DIM": "|%B %d, %Y"},
)
# x axis: tick spacing "M1", labels formatted as "<abbreviated month>\n<year>", period label mode.
fig.update_xaxes(ticklabelmode="period", tickformat="%b\n%Y", dtick="M1")
fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [22]:
# Number of rows with a positive turnover, per date and per betting-method column
# (rows are presumably one account per day — TODO confirm against the data source).
# The positive-value mask is computed once for the whole frame and then summed per
# date, instead of running a Python lambda on every date group via .apply.
count_by_betting = (
    df[['FOB_RACING_TURNOVER', 'FOB_SPORT_TURNOVER', 'PARI_RACING_TURNOVER', 'PARI_SPORT_TURNOVER']]
    .gt(0)
    .groupby(df['DATE_DIM'])
    .sum()
    .reset_index()
)

fig = px.line(count_by_betting, x="DATE_DIM", y=count_by_betting.columns,
              hover_data={"DATE_DIM": "|%B %d, %Y"},
              title='Number of betting method')
# x axis: tick spacing "M1", labels formatted as "<abbreviated month>\n<year>", period label mode.
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y",
    ticklabelmode="period")
fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

