In [None]:
import polars as pl

In [None]:
"""
Schema of merged.csv (field: description -- data type)

serialtarjeta:     card serial number that identifies the passenger -- hash
idsam:             identifier of the public-transport bus -- hash
fechahoraevento:   timestamp of the start of the trip -- timestamp
producto:          product type -- string
montoevento:       amount of the event deducted from the card -- integer
consecutivoevento: sequence number of the event -- integer
identidad:         identity -- integer
tipoevento:        event type -- integer
latitude:          geographic latitude -- floating point
longitude:         geographic longitude -- floating point
idrutaestacion:    line or route used -- string
tipotransporte:    type of transport -- integer
"""

# Read only the first million rows of the semicolon-separated file so the
# exploratory cells stay fast; date-like columns are parsed when possible.
df = pl.read_csv(
    "merged.csv",
    separator=";",
    n_rows=1_000_000,
    try_parse_dates=True,
)
# Preview the first 10 rows.
df.head(10)

In [None]:
# Display summary statistics for `df` (the sample of up to 1,000,000 rows
# read from merged.csv).
df.describe()

---

In [None]:
# Read every row of the file, but keep only the card serial and the event
# timestamp so the full dataset stays manageable in memory.
df2 = pl.read_csv(
    "merged.csv",
    columns=["serialtarjeta", "fechahoraevento"],
    separator=";",
    try_parse_dates=True,
)
# Summary statistics for those two columns.
df2.describe()

In [None]:
# Derive the weekday and hour of each event from its timestamp.
# `with_columns` replaces the deprecated `DataFrame.with_column` (removed in
# later polars releases); passing a list adds both columns in a single call.
df2 = df2.with_columns(
    [
        pl.col("fechahoraevento").dt.weekday().alias("weekday"),
        pl.col("fechahoraevento").dt.hour().alias("hour"),
    ]
)

# Count events for every (weekday, hour) combination.
# A group-by does not guarantee the order of its output rows, so sort
# explicitly to keep the displayed table stable between runs.
# NOTE(review): newer polars releases rename `groupby` to `group_by`; switch
# the spelling when the environment's polars version is upgraded.
grouped_df = (
    df2.groupby(["weekday", "hour"])
    .agg([pl.count("fechahoraevento").alias("dia_hora_evento_count")])
    .sort(["weekday", "hour"])
)
grouped_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Category orders used when plotting: weekdays Monday..Sunday, hours 0..23.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
hour_order = [*range(24)]

# Weekday number -> name lookup (1 = Monday ... 7 = Sunday), built from the
# ordered list of names.
weekday_map = dict(enumerate(weekday_order, start=1))

# Convert the polars aggregate (`weekday`, `hour`, `dia_hora_evento_count`)
# to pandas and attach a readable weekday label to every row.
grouped_df_pd = grouped_df.to_pandas()
grouped_df_pd["weekday_name"] = grouped_df_pd["weekday"].map(weekday_map)

In [None]:
import plotly.express as px
import plotly.io as pio

# Grouped bar chart: event count per weekday, one bar per hour of the day.
#
# `hour` is numeric, and Plotly Express maps a numeric colour column to a
# continuous colour scale drawn as a single trace. In that mode
# `barmode="group"` and the `hour` entry of `category_orders` have no effect,
# so the bars are not placed side by side. Turning the hour into a string
# label makes it a discrete category: every hour becomes its own trace and
# the bars are grouped in hour order.
hour_labels = [str(h) for h in hour_order]
# Go through the nullable integer dtype first so an hour column that pandas
# stored as float (e.g. because of a null group) still yields "7", not "7.0".
plot_df = grouped_df_pd.assign(hour=grouped_df_pd["hour"].astype("Int64").astype(str))

fig = px.bar(
    data_frame=plot_df,
    x="weekday_name",
    y="dia_hora_evento_count",
    color="hour",
    category_orders={"weekday_name": weekday_order, "hour": hour_labels},
    barmode="group",
)
fig.show()
# Also save the interactive chart as a standalone HTML file.
fig.write_html("dia_hora_evento_count.html")

In [None]:
# Histogram of the `serialtarjeta` column.
# Plotly Express is given a pandas frame, as in the weekday/hour chart:
# older Plotly Express releases do not accept polars DataFrames directly.
# Only the plotted column is converted, to avoid copying the rest of `df2`.
# NOTE(review): if `serialtarjeta` is read as a string hash the x axis is
# categorical, in which case `nbins` may not apply -- confirm the dtype.
serial_pd = df2.select("serialtarjeta").to_pandas()
fig = px.histogram(serial_pd, x="serialtarjeta", nbins=100, title="Distribution of serialtarjeta")

# Show the plot
fig.show()