In [1]:
import polars as pl
from io import StringIO
import requests
import s3fs


def download_files(table_name: str) -> pl.DataFrame:
    url = f"https://bit.ift.org.mx/descargas/datos/tabs/{table_name}.csv"
    try:
        response = requests.get(url)
        response.raise_for_status()
        
    except requests.exceptions.RequestException as e:
        print(f"Error inesperado: {e}")
    
    response.encoding = 'utf-8'
    csv_string = StringIO(response.text)
    
    return pl.read_csv(csv_string,infer_schema_length=10000)
    
def tweak_df(df: pl.DataFrame, columns_transformations: list, schema: dict) -> pl.DataFrame:
  # aplicación de funciones para el df
  # acepta lista de expresiones
  return df.with_columns(columns_transformations).cast(schema)

def dict_to_df(dict):
    return pl.from_dict(dict)

def upload_df(df: pl.DataFrame, bucket_name: str, file_name: str):
    fs = s3fs.S3FileSystem()
    destination = f"s3://{bucket_name}/{file_name}.parquet"
    try:
        with fs.open(destination, mode = 'wb') as f:
            df.write_parquet(f)
    except Exception as e:
        print(f"Error inesperado: {e}")

In [2]:
diccionario_datos = {
    "TD_LINEAS_INTMOVIL_ITE_VA":{
        "schema":{
            "FECHA": pl.Date,
            "ANIO": pl.Int16,
            "MES": pl.Int8,
            # "K_GRUPO": pl.Categorical,
            #"GRUPO": pl.Categorical,
            #"K_EMPRESA": pl.Categorical,
            #"EMPRESA": pl.Categorical,
            #"CONCESIONARIO": pl.Categorical,
            "L_PREPAGO_E": pl.Int32,
            "L_POSPAGO_E": pl.Int32,
            "L_POSPAGOC_E": pl.Int32,
            "L_POSPAGOL_E": pl.Int32,
            "L_NO_ESPECIFICADO_E": pl.Int64,
            "L_TOTAL_E": pl.Int32,
            #"FOLIO": pl.Categorical
          },
         "tweak_columns":[
            #pl.col( "L_PREPAGO_E" ).str.replace_all(",","").str.to_integer(base=10),
            #pl.col( "L_POSPAGO_E" ).str.replace_all(",","").str.to_integer(base=10),
            #pl.col( "L_POSPAGOC_E" ).str.replace_all(",","").str.to_integer(base=10),
            #pl.col( "L_POSPAGOL_E" ).str.replace_all(",","").str.to_integer(base=10),
            #pl.col( "L_NO_ESPECIFICADO_E" ).str.replace_all(",","").str.to_integer(base=10),
            #pl.col( "L_TOTAL_E" ).str.replace_all(",","").str.to_integer(base=10),
            pl.col("FECHA").str.to_date("%d/%m/%Y"),
            pl.col("FOLIO").cast(pl.Int32)
          ]
    },
    "TD_TRAF_INTMOVIL_ITE_VA":{
        "schema":{
            "ANIO":pl.Int16,
            "MES":pl.Int8,
            "FECHA":pl.Date,
            #"FOLIO":pl.Categorical,
            ## "K_GRUPO":pl.Categorical,
            #"GRUPO":pl.Categorical,
            #"K_EMPRESA":pl.Categorical,
            #"EMPRESA":pl.Categorical,
            #"CONCESIONARIO":pl.Categorical,
            "TRAF_TB_2G_E":pl.Float32,
            "TRAF_TB_3G_E":pl.Float32,
            "TRAF_TB_4G_E":pl.Float32,
            "TRAF_TB_NO_ESPECIFICADO_E":pl.Float32,
            "TOTAL_TB_E":pl.Float32
        },
        "tweak_columns":[
            #pl.col('TRAF_TB_2G_E').cast(pl.Float32),
            #pl.col('TRAF_TB_3G_E').str.replace_all(",","").cast(pl.Float32),
            #pl.col('TRAF_TB_4G_E').str.replace_all(",","").cast(pl.Float32),
            #pl.col('TRAF_TB_NO_ESPECIFICADO_E').str.replace_all(",","").cast(pl.Float32),
            #pl.col('TOTAL_TB_E').str.replace_all(",","").cast(pl.Float32),
            pl.col('FOLIO').cast(pl.Int32),
            pl.col("FECHA").str.to_date("%d/%m/%Y")
            #pl.coalesce(pl.col('FECHA').str.to_date(format="%d%b%Y",strict=False),pl.col('FECHA').str.to_date(format="%d-%b-%y",strict=False))
            ]
    },
    "TD_IHH_INTMOVIL_ITE_VA":{
        "schema":{
            "ANIO":pl.Int16,
            "MES":pl.Int8,
            "IHH_INTMOVIL_E":pl.Int16
        },
        "tweak_columns": [
            pl.col('IHH_INTMOVIL_E').str.replace_all(",",""),
            pl.col("FECHA").str.to_date("%d/%m/%Y")
        ]
    },
    "TD_MARKET_SHARE_INTMOVIL_ITE_VA":{
        "schema": {
            "ANIO":pl.Int16,
            "MES":pl.Int8,
            # "K_GRUPO":pl.Categorical,
            #"GRUPO":pl.Categorical
        },
        "tweak_columns":[
            pl.col('MARKET_SHARE').str.replace_all("%","").cast(pl.Float32),
            pl.col('FECHA').str.to_date(format="%d/%m/%Y")
        ]

    }
}

In [10]:
diccionario_datos.keys()

dict_keys(['TD_LINEAS_INTMOVIL_ITE_VA', 'TD_TRAF_INTMOVIL_ITE_VA', 'TD_IHH_INTMOVIL_ITE_VA', 'TD_MARKET_SHARE_INTMOVIL_ITE_VA'])

In [3]:
def tablas(name_tabla):
    name = name_tabla
    tabla= download_files(name)
    tabla_tw = tweak_df(tabla,diccionario_datos[name]['tweak_columns'],diccionario_datos[name]['schema'])
    return tabla_tw

In [4]:
lineas = tablas('TD_LINEAS_INTMOVIL_ITE_VA')
traf = tablas('TD_TRAF_INTMOVIL_ITE_VA')
ihh = tablas('TD_IHH_INTMOVIL_ITE_VA')
market_share = tablas('TD_MARKET_SHARE_INTMOVIL_ITE_VA')

In [22]:
print(f'unique values of lineas: {len(lineas["K_GRUPO"].value_counts())}')
print(f'unique values of traf: {len(traf["K_GRUPO"].value_counts())}')
print(f'unique values of participacion_mercado: {len(market_share["K_GRUPO"].value_counts())}')
print(f'unique values of ihh: {len(market_share["K_GRUPO"].value_counts())}')

unique values of lineas: 61
unique values of traf: 30
unique values of participacion_mercado: 61
unique values of ihh: 61


Ok vamos a utilizar la variable K_GRUPO como la variable llave

In [23]:
market_share

ANIO,MES,FECHA,K_GRUPO,GRUPO,MARKET_SHARE,Unnamed: 6_level_0
i16,i8,date,str,str,f32,str
2010,6,2010-06-15,"""G005""","""IUSACELL-UNEF�N""",5.01,
2010,6,2010-06-15,"""G003""","""TELEF�NICA""",6.28,
2010,6,2010-06-15,"""G006""","""AM�RICA M�VIL""",88.709999,
2010,9,2010-09-15,"""G003""","""TELEF�NICA""",6.47,
2010,9,2010-09-15,"""G006""","""AM�RICA M�VIL""",86.550003,
…,…,…,…,…,…,…
2023,12,2023-12-15,"""C709""",""" FREEDOM """,1.72,
2023,12,2023-12-15,"""G003""",""" TELEF�NICA """,7.65,
2023,12,2023-12-15,"""C804""",""" GRUPO WALMART """,9.46,
2023,12,2023-12-15,"""G007""",""" AT&T """,14.61,


In [12]:
sample_market_share = market_share.select('FECHA','K_GRUPO','MARKET_SHARE')

In [5]:
import hvplot.polars

In [None]:
df_polars.hvplot.scatter(x='bill_length_mm', y='bill_depth_mm', by='species')

In [None]:
first_plot = market_share.hvplot()

In [6]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta

# Seed the random number generator for reproducibility
np.random.seed(1)

# Generate the date range
start_date = datetime(2000, 1, 1)
date_range = [start_date + timedelta(days=i) for i in range(1000)]

# Generate the random data and create a DataFrame
data = np.random.randn(1000, 4).cumsum(axis=0)

# Create the Polars DataFrame
df = pl.DataFrame({
    'date': date_range,
    'A': data[:, 0],
    'B': data[:, 1],
    'C': data[:, 2],
    'D': data[:, 3],
})

# Display the first two rows
df.head(2)


date,A,B,C,D
datetime[μs],f64,f64,f64,f64
2000-01-01 00:00:00,1.624345,-0.611756,-0.528172,-1.072969
2000-01-02 00:00:00,2.489753,-2.913295,1.21664,-1.834176


In [7]:
first_plot = df.hvplot()

In [8]:
first_plot

In [17]:
plot1 = sample_market_share.select('FECHA','MARKET_SHARE').hvplot.line()


In [18]:
plot1

In [16]:
sample_market_share

FECHA,K_GRUPO,MARKET_SHARE
date,str,f32
2010-06-15,"""G005""",5.01
2010-06-15,"""G003""",6.28
2010-06-15,"""G006""",88.709999
2010-09-15,"""G003""",6.47
2010-09-15,"""G006""",86.550003
…,…,…
2023-12-15,"""C709""",1.72
2023-12-15,"""G003""",7.65
2023-12-15,"""C804""",9.46
2023-12-15,"""G007""",14.61


In [27]:
plot1 = sample_market_share.hvplot()
plot1

In [28]:
aggregated_sample_market_share = sample_market_share.select('FECHA','K_GRUPO','MARKET_SHARE').group_by('FECHA','K_GRUPO').agg(pl.col('MARKET_SHARE').sum())

In [34]:
plot2 = aggregated_sample_market_share.hvplot()
plot2

In [38]:
plot1 = aggregated_sample_market_share.hvplot.line(x ='FECHA',y='MARKET_SHARE',by='K_GRUPO')

In [39]:
plot1

In [41]:
plot2 = aggregated_sample_market_share.hvplot.bar(x='K_GRUPO',y='MARKET_SHARE')
plot2