# 1. Exploratory data analysis

## Goals
- Adopt a data-driven mindset
- Train my data-analysis skills
- Develop a modelling strategy


## Basic Analysis
### Dataset description
- **target variable** : "moderator" column (whether a moderator played the round)
- **lines and columns** : 55 575 rows x 10 columns
- **data types** : 5 continuous values (float), 2 integer counts ("gamers", "skins"), 1 boolean, 1 datetime and 1 ID-like integer column
- **missing values** : 10 rows have a missing value in the "time" column, mostly rounds with no player in the room (7 of the 10).

Column "ID" will likely be ignored for the rest of the analysis.

### In-depth analysis

## Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## Dataset

In [14]:
# Load the raw per-round data; the "time" column is parsed as datetimes
# so that missing timestamps show up as NaT.
casino_df = pd.read_csv(
    'onlineCasino.csv',
    parse_dates=['time'],
)
casino_df

Unnamed: 0,ID,gamers,skins,money,ticks,peopleWin,peopleLost,outpay,time,moderator
0,2091103,144,174,283.57000,14.30,125.459984,0.250000,408.780000,2021-08-25 16:43:00,False
1,2091104,134,182,279.30000,1.14,5.810003,177.040000,108.070010,2021-08-25 16:44:00,False
2,2091105,139,179,282.87000,3.91,125.010000,5.960000,401.919920,2021-08-25 16:44:00,False
3,2091106,139,169,271.44000,1.15,6.900001,181.850000,96.490005,2021-08-25 16:45:00,False
4,2091107,142,177,304.88000,1.00,0.000000,304.880000,0.000000,2021-08-25 16:45:00,False
...,...,...,...,...,...,...,...,...,...,...
55570,2146673,114,146,246.68002,1.74,46.410004,36.400000,226.020020,2021-09-14 22:01:00,False
55571,2146674,106,131,190.87997,2.35,49.629974,14.489999,156.730010,2021-09-14 22:02:00,False
55572,2146675,110,138,209.81992,1.40,23.020000,76.109985,219.819980,2021-09-14 22:02:00,False
55573,2146676,101,128,165.06999,5.76,56.540000,1.790000,202.730030,2021-09-14 22:03:00,False


In [15]:
# Count how many columns share each dtype.
column_dtypes = casino_df.dtypes
column_dtypes.value_counts()

float64           5
int64             3
datetime64[ns]    1
bool              1
dtype: int64

## Missing values

In [24]:
# Number of missing entries in each column.
missing_mask = casino_df.isna()
missing_mask.sum()

ID             0
gamers         0
skins          0
money          0
ticks          0
peopleWin      0
peopleLost     0
outpay         0
time          10
moderator      0
dtype: int64

In [22]:
# Inspect the rounds whose timestamp is missing.
time_is_missing = casino_df['time'].isna()
casino_df.loc[time_is_missing]

Unnamed: 0,ID,gamers,skins,money,ticks,peopleWin,peopleLost,outpay,time,moderator
5522,2096625,0,0,0.0,2.85,0.0,0.0,0.0,NaT,False
5864,2096967,0,0,0.0,23.89,0.0,0.0,0.0,NaT,False
6762,2097865,0,0,0.0,1.17,0.0,0.0,0.0,NaT,False
7155,2098258,0,0,0.0,5.62,0.0,0.0,0.0,NaT,False
8005,2099108,0,0,0.0,1.08,0.0,0.0,0.0,NaT,False
8006,2099109,0,0,0.0,10.19,0.0,0.0,0.0,NaT,False
8186,2099289,0,0,0.0,26.78,0.0,0.0,0.0,NaT,False
12920,2104023,38,46,70.47001,8.08,25.929998,0.0,0.0,NaT,False
18782,2109885,26,31,55.059994,1.1,0.9,39.98,0.0,NaT,False
45427,2136530,3,6,7.5,1.57,1.71,0.0,0.0,NaT,False


## Numerical analysis

In [4]:
# Summary statistics (count, mean, std, quartiles, extremes) of the columns
# that describe() includes by default.
summary_stats = casino_df.describe()
summary_stats

Unnamed: 0,ID,gamers,skins,money,ticks,peopleWin,peopleLost,outpay
count,55575.0,55575.0,55575.0,55575.0,55575.0,55575.0,55575.0,55575.0
mean,2118890.0,123.028754,158.333495,285.020029,12.031466,66.642813,79.308366,272.35041
std,16043.26,26.145191,30.969729,195.503233,240.496913,116.713177,186.749071,221.625618
min,2091103.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,2104996.0,102.0,135.0,215.475005,1.3,19.780007,3.15,158.399975
50%,2118890.0,121.0,155.0,255.01003,1.95,58.359997,30.67,266.61005
75%,2132784.0,137.0,175.0,308.40994,3.89,97.999977,111.685007,361.93507
max,2146677.0,491.0,552.0,5696.327,23522.65,21681.592,5696.327,25943.201


In [None]:
def plot_violon(x, hue, data=None):
    """Draw a violin plot of one column, split by a grouping column.

    The previous body was the bare expression ``plt.fig``, which is not an
    attribute of ``matplotlib.pyplot``; calling the function raised
    ``AttributeError`` and never produced a figure. This version builds the
    same kind of plot as the violin cells in this notebook.

    Parameters
    ----------
    x : str
        Name of the column whose distribution is plotted on the x axis.
    hue : str
        Name of the column used to colour/split the violins
        (e.g. ``'moderator'``).
    data : pandas.DataFrame, optional
        Frame to plot. Defaults to the notebook-level ``casino_df``.

    Returns
    -------
    The figure returned by ``px.violin``; leave the call as the last
    expression of a cell to display it.
    """
    # Resolve the default lazily so the function can be defined before the
    # dataset is loaded.
    frame = casino_df if data is None else data
    return px.violin(
        frame,
        x=x,
        color=hue,
        box=True,       # overlay a box plot inside each violin
        points='all',   # also draw every individual observation
    )

In [13]:
# Distribution of "money", one violin per value of "moderator",
# with an inner box plot and every observation drawn as a point.
money_violin = px.violin(
    data_frame=casino_df,
    x='money',
    color='moderator',
    box=True,
    points='all',
)
money_violin

In [5]:
# Distribution of "skins", one violin per value of "moderator",
# with an inner box plot and every observation drawn as a point.
skins_violin = px.violin(
    data_frame=casino_df,
    x='skins',
    color='moderator',
    box=True,
    points='all',
)
skins_violin

## Feature engineering

In [27]:
# Flag rounds where the amount won by players exceeds the amount lost.
# assign() returns a new frame, so casino_df itself is left untouched.
players_won_more = casino_df['peopleWin'].gt(casino_df['peopleLost'])
outcome_df = casino_df.assign(main_outcome=players_won_more)
outcome_df

Unnamed: 0,ID,gamers,skins,money,ticks,peopleWin,peopleLost,outpay,time,moderator,main_outcome
0,2091103,144,174,283.57000,14.30,125.459984,0.250000,408.780000,2021-08-25 16:43:00,False,True
1,2091104,134,182,279.30000,1.14,5.810003,177.040000,108.070010,2021-08-25 16:44:00,False,False
2,2091105,139,179,282.87000,3.91,125.010000,5.960000,401.919920,2021-08-25 16:44:00,False,True
3,2091106,139,169,271.44000,1.15,6.900001,181.850000,96.490005,2021-08-25 16:45:00,False,False
4,2091107,142,177,304.88000,1.00,0.000000,304.880000,0.000000,2021-08-25 16:45:00,False,False
...,...,...,...,...,...,...,...,...,...,...,...
55570,2146673,114,146,246.68002,1.74,46.410004,36.400000,226.020020,2021-09-14 22:01:00,False,True
55571,2146674,106,131,190.87997,2.35,49.629974,14.489999,156.730010,2021-09-14 22:02:00,False,True
55572,2146675,110,138,209.81992,1.40,23.020000,76.109985,219.819980,2021-09-14 22:02:00,False,False
55573,2146676,101,128,165.06999,5.76,56.540000,1.790000,202.730030,2021-09-14 22:03:00,False,True
