# Libraries

In [1]:
from datetime import datetime
import pandas as pd
import os
pd.io.formats.excel.ExcelFormatter.header_style = None

In [2]:
# Ignore rows with no data
def get_data(data, column):
    df = data.copy()
    not_null = df[df[column].notnull()].index.min()
    df.columns = df.loc[not_null].tolist()
    df = df[not_null + 1:].copy()
    df.reset_index(drop=True, inplace=True)
    return df

In [3]:
def filter_clients(df):
    data = pd.DataFrame()
    clients = ['LOCATEL',  'OLIMPICA', 'COOPIDROGAS', 'UNIDROGAS', 'INTERNACIONAL', 'CRUZ VERDE', 'AXA',
               'ETICOS', 'MULTIDROGAS', 'CENCOSUD', 'ALKOSTO', 'ROMA', 'JULIO DELGADO', 'BRECCIA', 'EVE',
               'FARMATODO', 'EXITO', 'DPJD', 'PASTEUR', 'COPSERVIR', 'CAFAM', 'COLOMBIANA DE COMERCIO']
    for client in clients:
        aux = df[df['Cliente'].str.contains(client)]
        data = pd.concat([data, aux])
        data.reset_index(drop=True, inplace=True)
    return data

In [4]:
def clean_ean(df):
    df = df.sort_values(['Fecha', 'Cliente'], ascending=False)
    try:
        df['EAN'] = df['EAN'].map(int)
    except:
        df['EAN'].fillna(0, inplace=True)
        df['EAN'] = df['EAN'].map(int)
    df.reset_index(drop=True, inplace=True)
    return df

In [5]:
def get_unique_sku(df):
    final = pd.DataFrame()
    for row in df[['Cliente', 'EAN']].drop_duplicates().itertuples():
        aux = df.loc[(df['Cliente']==row.Cliente)&(df['EAN']==row.EAN)]
        date = aux['Fecha'].max()
        if aux[aux['Fecha'] == date].shape[0] > 1:
            aux = aux[(aux['Fecha'] == date)&(~aux['Producto'].str.contains('NF'))]
        final = pd.concat([final, aux[aux['Fecha'] == date]])
        final.reset_index(drop=True, inplace=True)
    return final

In [6]:
def export_file(df, path, filename):
    df.to_excel(path + filename, sheet_name='Precios', index=False)

path ='../../../02PriceUpdate/COL/1Data/2Catalogue/1Current/'
[f for f in os.listdir(path) if 'LISTA DE PRECIOS' in f]

files = ['LISTA DE PRECIOS 010321 + DURACELL.xlsx',
         'LISTA DE PRECIOS 020221 + DURACELL.xlsx',
         'LISTA DE PRECIOS 09122020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 11062020 + DURACELL-1.xlsx',
         'LISTA DE PRECIOS 120121 + DURACELL.xlsx',
         'LISTA DE PRECIOS 14042020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 14072020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 160421 + DURACELL.xlsx',
         'LISTA DE PRECIOS 18112020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 21042020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 21092020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 22062020 + DURACELL-1.xlsx',
         'LISTA DE PRECIOS 22122020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 240521 + DURACELL.xlsx',
         'LISTA DE PRECIOS 26032020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 27082020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 27102020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 28052020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 28072020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 30092020 + DURACELL.xlsx',
         'LISTA DE PRECIOS 30112020 + DURACELL.xlsx']

In [11]:
path_prices = r'C:\Users\jshernandezm\OneDrive - genommalabinternacional\Colombia\Price Update\Data'
files = [f for f in os.listdir(path_prices) if 'LISTA DE PRECIOS' in f]

# Import Data

In [13]:
%%time
df = pd.DataFrame()
for file in files:
    try:
        aux = pd.read_excel(path_prices + '\\' + file, sheet_name='Lista de Precios')
    except:
        aux = pd.read_excel(path_prices + '\\' + file, sheet_name='Hoja1')
    
    if 'Unnamed: 7' in aux.columns:
        aux = get_data(aux, 'Unnamed: 7')
    
    date = file.split(' + ')[0].split(' ')[3]
    if len(date) == 6:
        date = date[:4] + '2021'
        
    aux.rename({'ean':'EAN', 'Ean':'EAN'}, axis=1, inplace=True)
    aux['Fecha'] = datetime.strptime(date, '%d%m%Y')
    df = pd.concat([df, aux[['Fecha', 'Cliente', 'EAN', 'Producto', 'LISTA']]])

Wall time: 5min 4s


# Data Cleaning

- Quitar duplicados por Descripción de productos
    - NUEVA IMAGEN <- Productos a quedarse
- Agregar Columna ID

In [14]:
%%time
data = filter_clients(df)
data = clean_ean(data)
final = get_unique_sku(data)

Wall time: 1min 22s


## Delete repetead products

In [15]:
final.drop_duplicates(inplace=True)

In [16]:
final['ID'] = final['Cliente'] + final['EAN'].map(int).map(str)
repeated_skus  = final['ID'].value_counts()[final['ID'].value_counts() > 1].index

In [26]:
aux = pd.DataFrame()
for sku in repeated_skus:
    sku_to_save = final[(final['ID']==sku)&(final['Producto'].str.contains('NUEVA IMAGEN'))]
    aux = pd.concat([aux, sku_to_save])

In [33]:
final = final[~final['ID'].isin(repeated_skus)]

final = pd.concat([final, aux])

## Add some products

In [48]:
constant_skus = {'ID':
                 ['UNIDROGAS S.A.650240004605',
                  'UNIDROGAS S.A.650240009556',
                  'UNIDROGAS S.A.650240010583',
                  'UNIDROGAS S.A.650240009549',
                  'UNIDROGAS S.A.15797'],
                 'Fecha':['X', 'X', 'X', 'X', 'X'],
                 'Cliente':['UNIDROGAS S.A.', 
                            'UNIDROGAS S.A.',
                            'UNIDROGAS S.A.',
                            'UNIDROGAS S.A.',
                            'UNIDROGAS S.A.'],
                 'EAN':[650240004605,
                        650240009556,
                        650240010583,
                        650240009549,
                        15797],
                 'Producto':['X',
                        'ASEPXIA MAQUILLAJE POLVO COMPACTO BEIGE MATE',
                        'ASEPXIA MAQUILLAJE POLVO CLARO MATE',
                        'ASEPXIA MAQUILLAJE POLVO COMPACTO NATURAL',
                        'X'],
                 'LISTA':[13830,
                          18086,
                          18086, 
                          18086,
                          37897]}

In [50]:
final = pd.concat([final, pd.DataFrame(constant_skus)])

# Export

In [56]:
cols = ['ID', 'Fecha', 'Cliente', 'EAN', 'Producto', 'LISTA']

In [58]:
path_export = r'C:\Users\jshernandezm\OneDrive - genommalabinternacional\Colombia\Price Update\Data\Output'
filename = 'Catalogo_Precios-{}.xlsx'
filename = filename.format(datetime.today().strftime('%Y%m%d-%H%M'))
final[cols].to_excel(
    path_export + '\\' + filename,
    index=False, 
    sheet_name='Precios'
)

total_files = [f for f in os.listdir(path) if 'LISTA DE PRECIOS' in f]

def get_date(filename):
    date = filename.split(' + ')[0].split(' ')[3]
    if len(date) == 6:
        date = date[:4] + '2021'
    return datetime.strptime(date, '%d%m%Y')

df_files = pd.DataFrame(total_files, columns=['Names'])

df_files['Dates'] = df_files['Names'].map(get_date)
df_files['Months'] = df_files['Dates'].apply(lambda x: int(str(x.year) + str(x.month).zfill(2)))

df_files.sort_values(by='Dates', inplace=True)

df_files.reset_index(drop=True, inplace=True)

%%time
path ='../../../02PriceUpdate/COL/1Data/2Catalogue/1Current/'
previous_files = []
df = pd.DataFrame()
for month in df_files['Months'].unique():
    files = df_files.loc[df_files['Months'] <= month, 'Names'].tolist()
    miss_files = [e for e in files if e not in previous_files]
    for file in miss_files:
        try:
            aux = pd.read_excel(path + file, sheet_name='Lista de Precios')
        except:
            aux = pd.read_excel(path + file, sheet_name='Hoja1')

        if 'Unnamed: 7' in aux.columns:
            aux = get_data(aux, 'Unnamed: 7')

        date = file.split(' + ')[0].split(' ')[3]
        if len(date) == 6:
            date = date[:4] + '2021'

        aux.rename({'ean':'EAN', 'Ean':'EAN'}, axis=1, inplace=True)
        aux['Fecha'] = datetime.strptime(date, '%d%m%Y')
        df = pd.concat([df, aux[['Fecha', 'Cliente', 'EAN', 'Producto', 'LISTA']]])
        
    data = filter_clients(df)
    data = clean_ean(data)
    final = get_unique_sku(data)

    path_export = '../../../02PriceUpdate/COL/1Data/2Catalogue/1Current/Precios/'
    filename = 'Catalogo_Precios-' + str(month) + '.xlsx'
    export_file(final, path_export, filename)
        
    previous_files = files