In [1]:
import os
import json
import pandas as pd
from datetime import datetime
import pandasql as ps

In [2]:
pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [3]:
CURRENT_WORKING_DIRECTORY = os.getcwd()
INPUTS = CURRENT_WORKING_DIRECTORY + "\\inputs"
SEPARATOR=";"
PLAYERS = "Players.csv"
SELFBANS = "Selfbans.csv"
TRANSACTIONS = "Transactions.csv"

In [4]:
CONFIG_FILE = "config.json"
with open(CURRENT_WORKING_DIRECTORY + "\\" + CONFIG_FILE, encoding='utf-8') as f:
    CONFIG = json.load(f)

In [5]:
PLAYERS_DATES = CONFIG["PLAYERS"]["DATES"]
PLAYERS_SEPARATOR = CONFIG["PLAYERS"]["SEPARATOR"]
PLAYERS_DTYPE = CONFIG["PLAYERS"]["DTYPE"]

In [6]:
SELFBANS_DATES = CONFIG["SELFBANS"]["DATES"]
SELFBANS_SEPARATOR = CONFIG["SELFBANS"]["SEPARATOR"]
SELFBANS_DTYPE = CONFIG["SELFBANS"]["DTYPE"] 

In [7]:
TRANSACTIONS_DATES = CONFIG["TRANSACTIONS"]["DATES"]
TRANSACTIONS_SEPARATOR = CONFIG["TRANSACTIONS"]["SEPARATOR"]
TRANSACTIONS_DTYPE = CONFIG["TRANSACTIONS"]["DTYPE"] 

In [8]:
def myparser(x):
    """
    """
#     try:
#         datetime.utcfromtimestamp(int(x))
#     except:
#         return pd.NaT
#    return dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return datetime.utcfromtimestamp(int(x))

## PLAYERS

In [9]:
players = pd.read_csv(
    INPUTS + "\\" + PLAYERS,
    delimiter=PLAYERS_SEPARATOR,
    parse_dates=True,
    infer_datetime_format=True
#     parse_dates=PLAYERS_DATES,
#     dtype=PLAYERS_DTYPE,
#     date_parser=myparser
)

In [10]:
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   AccountId          1000 non-null   object
 1   PlayerNr           1000 non-null   int64 
 2   Country            1000 non-null   object
 3   DateOfBirth        1000 non-null   object
 4   Source             1000 non-null   object
 5   Selfban_start      24 non-null     object
 6   Selfban_end        24 non-null     object
 7   First_genre        1000 non-null   object
 8   Registration_date  1000 non-null   object
 9   Currency           759 non-null    object
dtypes: int64(1), object(9)
memory usage: 78.2+ KB


In [11]:
pd.set_option('display.max_rows', 20) # or 1000

In [12]:
players

Unnamed: 0,AccountId,PlayerNr,Country,DateOfBirth,Source,Selfban_start,Selfban_end,First_genre,Registration_date,Currency
0,3s85cau8v9,3887581556,PL,1967-12-13,newspaper,,,Games,2017-07-06,
1,77tetovrax,7847705450,PL,1975-06-03,newspaper,,,Scratchard,2013-09-07,
2,9if26dd8is,7005121609,SE,1975-06-26,tv,,,Sportsbook,2019-02-14,SEK
3,hae7koqlzm,8437991648,PL,1964-10-07,facebook,,,Bingo,2020-05-03,
4,ye9qkt7b35,3609218616,AT,1980-12-28,newspaper,,,Bingo,2019-05-30,EUR
...,...,...,...,...,...,...,...,...,...,...
995,ibi9p03lja,6482640039,ES,1977-06-30,newspaper,,,Scratchard,2019-04-09,EUR
996,qn24k3kt2m,7360398683,RO,1943-04-29,direct,,,Scratchard,2020-12-01,
997,z39mcqia8s,6877956071,DE,2000-10-17,direct,,,Lottery,2016-02-06,EUR
998,gznrfd648a,5076578596,SE,1983-01-25,direct,,,Scratchard,2016-02-15,SEK


In [13]:
# players["AccountId"].unique()

In [14]:
players["AccountId"].nunique()

1000

In [15]:
# players["PlayerNr"].unique()

In [16]:
players["PlayerNr"].nunique()

1000

In [17]:
players["Source"].unique()

array(['newspaper', 'tv', 'facebook', 'web', 'direct'], dtype=object)

In [18]:
players["Source"].nunique()

5

In [19]:
players["First_genre"].unique()

array(['Games', 'Scratchard', 'Sportsbook', 'Bingo', 'Lottery'],
      dtype=object)

In [20]:
players["First_genre"].nunique()

5

In [21]:
players["Currency"].unique()

array([nan, 'SEK', 'EUR', 'NZD'], dtype=object)

In [22]:
players["Currency"].nunique()

3

## SELFBANS

In [23]:
selfbans = pd.read_csv(
    INPUTS + "\\" + SELFBANS,
    delimiter=SELFBANS_SEPARATOR,
    parse_dates=True,
    infer_datetime_format=True
#     dtype=SELFBANS_DTYPE,
#     parse_dates=SELFBANS_DATES,
#     date_parser=myparser
)

In [24]:
selfbans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   AccountId      95 non-null     object
 1   Selfban_start  95 non-null     object
 2   Selfban_end    95 non-null     object
dtypes: object(3)
memory usage: 2.4+ KB


In [25]:
selfbans.head()

Unnamed: 0,AccountId,Selfban_start,Selfban_end
0,eq0mordsce,2014-10-01,2020-07-12
1,g6e5wzy89r,2021-08-03,2021-09-02
2,fkl9appqov,2021-07-26,2021-08-28
3,z13kx2qaaa,2019-09-14,2021-02-01
4,aqnzj0cak4,2013-09-22,2016-12-02


In [26]:
# selfbans["AccountId"].unique()

In [27]:
selfbans["AccountId"].nunique()

81

In [28]:
# selfbans["Selfban_start"].unique()

In [29]:
selfbans["Selfban_start"].nunique()

90

In [30]:
# selfbans["Selfban_end"].unique()

In [31]:
selfbans["Selfban_end"].nunique()

89

## TRANSACTIONS

In [32]:
transactions = pd.read_csv(
    INPUTS + "\\" + TRANSACTIONS,
    delimiter=TRANSACTIONS_SEPARATOR,
    parse_dates=True,
    infer_datetime_format=True
#     dtype=TRANSACTIONS_DTYPE,
#     parse_dates=TRANSACTIONS_DATES,
#     date_parser=myparser
)

In [33]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49250 entries, 0 to 49249
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PlayerNr        49250 non-null  int64  
 1   Trx_Type        49250 non-null  object 
 2   Trx_time        49250 non-null  object 
 3   Balance_before  49250 non-null  float64
 4   Balance_after   49250 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.9+ MB


In [34]:
transactions.head(3)

Unnamed: 0,PlayerNr,Trx_Type,Trx_time,Balance_before,Balance_after
0,3887581556,bonus,2017-07-24 01:24:24,0,796
1,3887581556,stake,2017-09-07 17:11:17,796,548
2,3887581556,withdrawal,2017-09-21 01:57:25,548,367


In [35]:
# transactions["PlayerNr"].unique()

In [36]:
transactions["PlayerNr"].nunique()

994

In [37]:
transactions["Trx_Type"].unique()

array(['bonus', 'stake', 'withdrawal', 'win', 'deposit'], dtype=object)

In [38]:
transactions["Trx_Type"].nunique()

5

In [39]:
transactions["Trx_time"].unique()

array(['2017-07-24 01:24:24', '2017-09-07 17:11:17',
       '2017-09-21 01:57:25', ..., '2021-11-02 21:37:09',
       '2021-12-07 21:12:39', '2022-01-26 14:38:41'], dtype=object)

In [40]:
transactions["Trx_time"].nunique()

49242

## Questions:
#### 1. What was the average deposit in Spain in 2021?

In [41]:
q3 = """
WITH players_spain AS (
SELECT DISTINCT
    p.PlayerNr,
    p.country
FROM players as p
WHERe Country ="ES"

), balance AS (
    SELECT 
            PlayerNr,
            SUM(Balance_after- Balance_before) as sum_deposite
            FROM transactions
            WHERE Trx_Type="deposit"
            GROUP BY PlayerNr
            
)

SElECT
--ps.country,
AVG(b.sum_deposite) as average_deposit_spain
FROM players_spain as ps
INNER JOIN balance as b
ON ps.PlayerNr = b.PlayerNr
"""

In [42]:
q3_result = ps.sqldf(q3, locals())

In [43]:
q3_result

Unnamed: 0,average_deposit_spain
0,876


#### 2. Show Top 10 depositor and their deposits.

#### 3. How many active players there are?

#### 4. How many players currently have self-ban active?

#### 5. How many self-bans were activated in 2021?

#### 6. How many players had active self-ban on 1st of May 2021?