# Exploratory Data Analysis

Este notebook realiza un análisis exploratorio sobre los datos de Mercado Pago con el objetivo de preparar un dataset para entrenamiento de un modelo de Machine Learning.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Configuración visual
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Cargar los datos
input_path = Path("../data/input")
prints = pd.read_json(input_path / "prints.json", lines=True)
taps = pd.read_json(input_path / "taps.json", lines=True)
pays = pd.read_csv(input_path / "pays.csv")

prints.head()


Unnamed: 0,day,event_data,user_id
0,2020-11-01,"{'position': 0, 'value_prop': 'cellphone_recha...",98702
1,2020-11-01,"{'position': 1, 'value_prop': 'prepaid'}",98702
2,2020-11-01,"{'position': 0, 'value_prop': 'prepaid'}",63252
3,2020-11-01,"{'position': 0, 'value_prop': 'cellphone_recha...",24728
4,2020-11-01,"{'position': 1, 'value_prop': 'link_cobro'}",24728


## Estructura y tipos de datos

In [3]:
print("PRINTS:")
print(prints.info())
print("\nTAPS:")
print(taps.info())
print("\nPAYS:")
print(pays.info())


PRINTS:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508617 entries, 0 to 508616
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   day         508617 non-null  object
 1   event_data  508617 non-null  object
 2   user_id     508617 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 11.6+ MB
None

TAPS:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50859 entries, 0 to 50858
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   day         50859 non-null  object
 1   event_data  50859 non-null  object
 2   user_id     50859 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.2+ MB
None

PAYS:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756483 entries, 0 to 756482
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   pay_date    756483 non-null  object 
 1   tot

## Estadísticas básicas

In [4]:
print(prints.describe(include='all'))
print(taps.describe(include='all'))
print(pays.describe())


               day                                         event_data  \
count       508617                                             508617   
unique          30                                                 28   
top     2020-11-07  {'position': 0, 'value_prop': 'cellphone_recha...   
freq         19810                                              29293   
mean           NaN                                                NaN   
std            NaN                                                NaN   
min            NaN                                                NaN   
25%            NaN                                                NaN   
50%            NaN                                                NaN   
75%            NaN                                                NaN   
max            NaN                                                NaN   

              user_id  
count   508617.000000  
unique            NaN  
top               NaN  
freq              NaN  
mea

## Validación de duplicados y nulos

In [7]:
try:
    print("Duplicados:")
    print("PRINTS:", prints.duplicated().sum())
    print("TAPS:", taps.duplicated().sum())
    print("PAYS:", pays.duplicated().sum())

    print("\nValores nulos:")
    print("PRINTS:\n", prints.isnull().sum())
    print("TAPS:\n", taps.isnull().sum())
    print("PAYS:\n", pays.isnull().sum())

except Exception as e:
    print("❌ Error al analizar duplicados o nulos:", e)


Duplicados:
❌ Error al analizar duplicados o nulos: unhashable type: 'dict'


## Distribución de clics y pagos por value_prop

In [6]:
taps['value_prop_id'].value_counts().plot(kind='bar', title='Clicks por Value Prop')
plt.show()

pays['value_prop_id'].value_counts().plot(kind='bar', title='Pagos por Value Prop')
plt.show()


KeyError: 'value_prop_id'

## Comportamiento temporal

In [None]:
prints['timestamp'] = pd.to_datetime(prints['timestamp'])
taps['timestamp'] = pd.to_datetime(taps['timestamp'])
pays['timestamp'] = pd.to_datetime(pays['timestamp'])

prints['timestamp'].dt.date.value_counts().sort_index().plot(title='Eventos de Prints por Día')
plt.show()
