In [1]:
# Import necessary libraries
import pandas as pd
import zipfile
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import statsmodels.api as sm
from matplotlib.colors import LinearSegmentedColormap

In [8]:
# The data folder sits one level above the notebook's working directory
path_files = r'../data/B2C Sales - Data Wrangling and EDA'

# Keep only the ZIP archives, ordered by file name in descending order
zip_files = sorted(
    (entry for entry in os.listdir(path_files) if entry.endswith('.zip')),
    reverse=True,
)

print("Zip files: ")
print(zip_files)


Zip files: 
['VENTAS_ANON_20240615.zip', 'VENTAS_ANON_20240608.zip', 'VENTAS_ANON_20240601.zip', 'VENTAS_ANON_20240525.zip', 'VENTAS_ANON_20240518.zip', 'VENTAS_ANON_20240511.zip', 'VENTAS_ANON_20240504.zip', 'VENTAS_ANON_20240427.zip', 'VENTAS_ANON_20240420.zip', 'VENTAS_ANON_20240413.zip', 'VENTAS_ANON_20240406.zip', 'VENTAS_ANON_20240330.zip', 'VENTAS_ANON_20240323.zip', 'VENTAS_ANON_20240316.zip', 'VENTAS_ANON_20240309.zip']


In [3]:
def extract_csv_from_zips(prefix, path_files, zip_files, verbose=True):
    """
    Load every CSV whose file name starts with `prefix` from a list of ZIP files.

    Parameters
    ----------
    prefix : str
        Leading part of the CSV file name to match (e.g. "AR_VTA").
    path_files : str
        Directory that contains the ZIP archives.
    zip_files : list of str
        Archive file names inside `path_files`; they are read in the given order.
    verbose : bool, default True
        When True, print the total row count after a successful load.

    Returns
    -------
    pandas.DataFrame
        All matching CSVs concatenated with a fresh index, plus an
        `archivo_origen` column holding the name of the CSV each row came from.
        When nothing matches, a message is printed and an empty DataFrame
        is returned.
    """
    dataframes = []

    for zip_file in zip_files:
        zip_path = os.path.join(path_files, zip_file)
        with zipfile.ZipFile(zip_path, 'r') as z:
            for member in z.namelist():
                # Match on the bare file name: a CSV stored inside a folder is
                # listed as "folder/AR_VTA_x.csv" and a prefix test on the full
                # member path would silently skip it.
                filename = os.path.basename(member)
                if not (filename.startswith(prefix) and filename.endswith('.csv')):
                    continue
                with z.open(member) as f:
                    # Source files are semicolon-separated
                    df = pd.read_csv(f, delimiter=';')
                # Keep track of which CSV each row was read from
                df['archivo_origen'] = filename
                dataframes.append(df)

    if not dataframes:
        print(f"No csv files found starting with '{prefix}'.")
        return pd.DataFrame()

    df_final = pd.concat(dataframes, ignore_index=True)
    if verbose:
        print(f"{prefix} successfully processed. Total rows: {len(df_final)}")
    return df_final

In [4]:
# Build one combined frame per file family (AR_VTA, AR_PRD, AR_PDV);
# each call prints its own total row count.
df_VTA_000 = extract_csv_from_zips(
    prefix="AR_VTA", path_files=path_files, zip_files=zip_files
)
df_PRD_000 = extract_csv_from_zips(
    prefix="AR_PRD", path_files=path_files, zip_files=zip_files
)
df_PDV_000 = extract_csv_from_zips(
    prefix="AR_PDV", path_files=path_files, zip_files=zip_files
)

AR_VTA successfully processed. Total rows: 3615562
AR_PRD successfully processed. Total rows: 32420
AR_PDV successfully processed. Total rows: 27297


In [5]:
# Preview the first rows of the combined AR_VTA frame
df_VTA_000.head()

Unnamed: 0,Semana_Inicio_Semana,Codigo_Unico_PDV,PDV_Comparables,Codigo_Barras_SKU,Categoria,Cantidad_Contenido_SKU,Cantidad_de_Venta,Precio_por_Unidade,archivo_origen
0,20240518,ID0460,1,ID0091,ID0004,400.0,8,2170992,AR_VTA_20240615.csv
1,20240518,ID0371,1,ID0566,ID0002,230.0,9,1350000,AR_VTA_20240615.csv
2,20240518,ID0166,0,ID0566,ID0002,230.0,26,3957543,AR_VTA_20240615.csv
3,20240511,ID0446,1,ID0260,ID0001,110.0,1,100000,AR_VTA_20240615.csv
4,20240525,ID0797,1,ID0093,ID0001,160.0,3,567000,AR_VTA_20240615.csv


In [6]:
# Preview the first rows of the combined AR_PRD frame
df_PRD_000.head()

Unnamed: 0,PRD_CODIGO,CATEGORIA_SKU,PROVEEDOR_SKU,MARCA_SKU,NOMBRE_SKU,CODIGO_BARRAS_SKU,PROD_CANT_CONTENIDO,archivo_origen
0,ID1138,ID0002,ID0002,ID0002,ID1128,ID1138,500.0,AR_PRD_20240615.csv
1,ID1139,ID0002,ID0002,ID0002,ID1129,ID1139,100.0,AR_PRD_20240615.csv
2,ID1140,ID0007,ID0056,ID0103,ID2110,ID1140,400.0,AR_PRD_20240615.csv
3,ID1279,ID0004,ID0007,ID0010,ID0774,ID0781,450.0,AR_PRD_20240615.csv
4,ID1746,ID0001,ID0002,ID0002,ID1221,ID1232,6.0,AR_PRD_20240615.csv


In [7]:
# Preview the first rows of the combined AR_PDV frame
df_PDV_000.head()

Unnamed: 0,CODIGO_PDV,CODIGO_UNICO_PDV,NOMBRE_PDV,DIRECCION_PDV,COMPARABLES_HOY,AREA_P,RUC,ZONA_MODELO_B,CLUSTER_PDV,ESTADO_PDV,LOCALIDAD_PDV,ZONA_P,archivo_origen
0,ID0778,ID0778,ID0776,ID0770,0,Interior,30711790000.0,CUYO,,MENDOZA,General Alvear,MENDOZA INTERIOR,AR_PDV_20240615.csv
1,ID0747,ID0747,ID0745,ID0739,0,Interior,30709060000.0,CUYO,,MENDOZA,General Alvear,MENDOZA INTERIOR,AR_PDV_20240615.csv
2,ID0026,ID0026,ID0026,ID0026,0,Interior,27208540000.0,LITORAL,,SANTA FE,Rosario,ROSARIO,AR_PDV_20240615.csv
3,ID0027,ID0027,ID0027,ID0027,0,Metropolitana,30712030000.0,SUB SUR,,BUENOS AIRES,La Plata,LA PLATA,AR_PDV_20240615.csv
4,ID0029,ID0029,ID0029,ID0029,0,Interior,23269080000.0,CUYO,Autoservicio Chico,MENDOZA,MAIPU,MENDOZA INTERIOR,AR_PDV_20240615.csv
