In [1]:
# Import the libraries this notebook needs

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd

# Uso de API's
# -----------------------------------------------------------------------
import requests

# Importar librerías para procesamiento de texto
# -----------------------------------------------------------------------
import re

# Warning handling
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")  # installs a global "ignore" filter for all warning categories


# Configuration
# -----------------------------------------------------------------------
# Remove the column cap so every column of a DataFrame is shown when it is displayed.
pd.options.display.max_columns = None

In [2]:
# Load the assessment CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Tableau_Assessment_Data.xlsx_Tableau_Data.csv')
df.head(n=5)

Unnamed: 0,Date,UserID,Brand,New_Registrations,Firsttime_depositors,Depositing_Player,Deposit_Amount,Deposits,Withdrawal_Amount,Withdrawals,Real_money_bets,Bonus_money_bets,Gamerounds,Real_money_win,Bonus_money_win,RowType
0,2/9/2020,116406,Casino C,1,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,Registrations
1,27/6/2020,96659,Casino C,1,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,Registrations
2,10/6/2020,91043,Casino C,1,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,Registrations
3,18/11/2019,25290,Casino C,1,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,Registrations
4,16/5/2020,79115,Casino C,1,0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,Registrations


In [3]:
# General-purpose exploration report for any DataFrame
def exploracion_df(df):
    """Print an exploratory report of ``df`` to standard output.

    The report contains, in order: shape, column names, ``df.info()``,
    percentage of null values (overall and per column), unique values per
    column, number of duplicated values per column, and the ``describe()``
    summaries for numeric and object-typed columns.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to describe. It is only read, never modified.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    separador = '____________________________________________________________\n'

    print(' Filas y Columnas del DATAFRAME \n')
    print(f"El número de filas que tenemos es de {df.shape[0]}.\nEl número de columnas es de {df.shape[1]}\n")
    print(separador)

    print(' Nombre de todas las Columnas del DATAFRAME: \n')
    print(df.columns)
    print(separador)

    print('INFORMACIÓN GENERAL DEL DATAFRAME \n')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    df.info()
    print(separador)

    print('Ver los NULOS del DataFrame \n')
    # Fraction of null cells per column, expressed as a percentage.
    porcentaje_nulos = df.isnull().mean() * 100
    # Every column has the same number of rows, so the mean of the per-column
    # percentages is the percentage of null cells in the whole frame.
    print(f'Nulos de todo el data: --> {porcentaje_nulos.mean()} \n')
    for columna, cantidad_valores_nulos in porcentaje_nulos.items():
        print(f'La columna {columna}: {cantidad_valores_nulos}')
    print(separador)

    print('Valores ÚNICOS por columna:\n')
    for columna in df.columns:
        cantidad_valores_unicos = df[columna].unique()
        print(f'La columna {columna}: {len(cantidad_valores_unicos)}')
        print(f'La columna {columna}: {cantidad_valores_unicos}')

    print(separador)

    print('Valores DUPLICADOS por columna es de:\n')
    for columna in df.columns:
        cantidad_duplicados = df[columna].duplicated().sum()
        print(f'La columna {columna}: {cantidad_duplicados}')
    print(separador)

    print('--> RESUMEN ESTADÍSTICO \n')

    # Select the numeric columns explicitly: describe() with no arguments falls
    # back to the object columns when there are no numeric ones, which used to
    # make the "no numeric variables" message unreachable.
    columnas_numericas = df.select_dtypes(include='number')
    if columnas_numericas.shape[1] > 0:
        print('<<< Variables Numéricas >>> \n')
        print(f'{columnas_numericas.describe().T} \n')
    else:
        print('No hay variables numéricas en el DataFrame.')

    try:
        categorical_summary = df.describe(include='object').T
    except ValueError:
        # describe() raises ValueError when no column matches the requested dtype.
        print('No hay variables categóricas en el DataFrame.')
    else:
        if not categorical_summary.empty:
            print('<<< Variables Categóricas >>> \n')
            print(f'{categorical_summary} \n')

In [4]:
# Run the exploration report on the loaded DataFrame; the report is printed to stdout.
exploracion_df(df)

 Filas y Columnas del DATAFRAME 

El número de filas que tenemos es de 343797.
El número de columnas es de 16

____________________________________________________________

 Nombre de todas las Columnas del DATAFRAME: 

Index(['Date', 'UserID', 'Brand', 'New_Registrations', 'Firsttime_depositors',
       'Depositing_Player', 'Deposit_Amount', 'Deposits', 'Withdrawal_Amount',
       'Withdrawals', 'Real_money_bets', 'Bonus_money_bets', 'Gamerounds',
       'Real_money_win', 'Bonus_money_win', 'RowType'],
      dtype='object')
____________________________________________________________

INFORMACIÓN GENERAL DEL DATAFRAME 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343797 entries, 0 to 343796
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Date                  343797 non-null  object 
 1   UserID                343797 non-null  int64  
 2   Brand                 343797 non-null  object 


In [5]:
# Count the rows whose UserID already appeared in an earlier row.
df.duplicated(subset=['UserID']).sum()

339379

In [6]:
# Keep every row that has at least one exact copy (all columns equal);
# keep=False flags all members of each duplicate group, not just the repeats.
exact_duplicates = df.loc[df.duplicated(keep=False)]

# Order those rows by UserID (this only sorts; no additional filtering happens here).
exact_duplicates_sorted = exact_duplicates.sort_values(by='UserID')

# Preview the first 20 rows of the sorted result.
exact_duplicates_sorted.head(n=20)

Unnamed: 0,Date,UserID,Brand,New_Registrations,Firsttime_depositors,Depositing_Player,Deposit_Amount,Deposits,Withdrawal_Amount,Withdrawals,Real_money_bets,Bonus_money_bets,Gamerounds,Real_money_win,Bonus_money_win,RowType
64056,3/8/2018,4556964,Casino A,0,0,1,20.0,2,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
228604,3/8/2018,4556964,Casino A,0,0,1,20.0,2,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
122782,18/9/2018,4560694,Casino A,0,0,1,10.0,1,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
228542,18/9/2018,4560694,Casino A,0,0,1,10.0,1,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
222567,1/8/2018,4567688,Casino A,0,0,1,35.0,2,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
222568,4/8/2018,4567688,Casino A,0,0,1,13.0,1,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
222569,21/6/2018,4567688,Casino A,0,0,1,11.0,1,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
222570,3/8/2018,4567688,Casino A,0,0,1,16.0,1,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
338934,1/8/2018,4567688,Casino A,0,0,1,35.0,2,0.0,0,0.0,0.0,0,0.0,0.0,Deposit
338931,3/8/2018,4567688,Casino A,0,0,1,16.0,1,0.0,0,0.0,0.0,0,0.0,0.0,Deposit


In [7]:
# Number of distinct UserID values among the exactly duplicated rows.
exact_duplicates_sorted['UserID'].nunique(dropna=False)

399

In [8]:
# Total number of rows that take part in an exact duplicate group.
exact_duplicates_sorted['UserID'].size

6698

------

### `A/B test`

In [16]:
# Load the A/B test assessment data and display it.
df_AB = pd.read_csv(filepath_or_buffer='Statistics_Assessment_Data.xlsx_Statistics_Assessment_Data.csv')
df_AB

Unnamed: 0,UserID,Brand,Test Group,Converted
0,7573340,Casino A,B,1
1,5465339,Casino A,A,0
2,7516520,Casino A,B,0
3,7616973,Casino A,A,1
4,7341455,Casino A,B,0
...,...,...,...,...
346,7597629,Casino A,A,1
347,6059769,Casino A,B,1
348,7369482,Casino A,B,0
349,7467535,Casino A,B,1


In [21]:

# Per-group aggregates of the 'Converted' column: its sum and its row count,
# followed by the derived rate, the non-converted count and both percentages.
# NOTE(review): reading 'sum' as "number of converted users" assumes Converted
# is a 0/1 flag, as the displayed sample suggests; confirm against the data.
conversion_summary = (
    df_AB.groupby('Test Group')['Converted']
    .agg(['sum', 'count'])
    .assign(
        conversion_rate=lambda tabla: tabla['sum'] / tabla['count'],
        no_converted=lambda tabla: tabla['count'] - tabla['sum'],
        converted_percentage=lambda tabla: (tabla['sum'] / tabla['count']) * 100,
        no_converted_percentage=lambda tabla: (tabla['no_converted'] / tabla['count']) * 100,
    )
)

# Split the users by outcome: rows with Converted == 1 and rows with Converted == 0.
converted = df_AB.loc[df_AB['Converted'].eq(1)]
non_converted = df_AB.loc[df_AB['Converted'].eq(0)]

# Show the summary table first, then both user subsets.
print("Conversion Summary by Test Group:")
print(conversion_summary)

print("\nConverted Users:")
print(converted)

print("\nNon-Converted Users:")
print(non_converted)


Conversion Summary by Test Group:
            sum  count  conversion_rate  no_converted  converted_percentage  \
Test Group                                                                    
A            82    174         0.471264            92             47.126437   
B            63    177         0.355932           114             35.593220   

            no_converted_percentage  
Test Group                           
A                         52.873563  
B                         64.406780  

Converted Users:
      UserID     Brand Test Group  Converted
0    7573340  Casino A          B          1
3    7616973  Casino A          A          1
6    7339822  Casino A          B          1
9    6478997  Casino A          B          1
13   6614724  Casino A          B          1
..       ...       ...        ...        ...
343  7639431  Casino A          B          1
344  7628662  Casino A          A          1
346  7597629  Casino A          A          1
347  6059769  Casino A       