In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np





In [2]:
# Load the raw sales export; every later cell derives from this frame.
data = pd.read_csv(
    filepath_or_buffer='./data/Consumidor_Venta_Producto_UPC_Recom_2018_2020.csv',
)
# Maps each original CSV column header to the English column name used by the
# rest of the notebook (applied through DataFrame.rename in preprocess).
english_cols = {
    'FACTURA_ID': 'INVOICE_ID',
    'FACTURA_POSICION_ID': 'INVOICE_POSITION_ID',
    'CUSTOMER_ID': 'CUSTOMER_ID',
    'FACTURA_CLASE_DOCUMENTO_ID': 'INVOICE_DOCUMENT_CLASS_ID',
    'ANO_MES_FACTURA': 'INVOICE_MONTH_YEAR',
    'ANO_FACTURA': 'ORDER_YEAR',
    'MES_FACTURA': 'ORDER_MONTH',
    'FECHA_FACTURA': 'INVOICE_DATE',
    'IMP_VENTA_NETO_EUR': 'NET_SALES_EUR',
    'CANAL_VENTA_ID': 'SALES_CHANNEL_ID',
    'CANAL_VENTA_DESC': 'SALES_CHANNEL_DESC',
    'TEMPORADA_COMERCIAL_ID': 'SEASON_ID',
    'TEMPORADA_COMERCIAL_DESC': 'SEASON_DESC',
    'PRODUCTO_ID': 'PRODUCT_ID',
    'TALLA': 'SIZE',
    'MATERIAL_ID': 'MATERIAL_ID',
    'NUMERO_DEUDOR_PAIS_ID': 'SALE_COUNTRY_ID',
    'NUMERO_DEUDOR_PAIS_DESC': 'SALE_COUNTRY_DESC',
    'VENTA_DEVOLUCION': 'SALES_FILTER',
    'JERARQUIA_PROD_ID': 'PRODUCT_HIERARCHY_ID',
    'GRUPO_ARTICULO_PRODUCTO_ID': 'PRODUCT_GROUP_ID',
    'GRUPO_ARTICULO': 'PRODUCT_GROUP',
    'CONCEPTO': 'CONCEPT',
    'LINEA': 'LINE',
    'GENERO_PRODUCTO': 'PRODUCT_GENDER',
    'CATEGORIA': 'PRODUCT_CATEGORY',
    'TIPOLOGIA': 'PRODUCT_TYPE',
    'COLOR': 'COLOR_INTERN',
    'CONSUMER_COLOR': 'COLOR_EXTERN',
    'CREMALLERA': 'ZIPPER',
    'CORDONES': 'LACES',
    'OUTSOLE_SUELA_TIPO': 'SOLE_TYPE',
    'OUTSOLE_SUELA_SUBTIPO': 'SOLE_SUBTYPE',
    'PLANTILLA_EXTRAIBLE': 'REMOVABLE_SOLE',
    'CONTACTO_SN': 'CONTACT',
    'EDAD_SN': 'AGE_AVAILABLE',
    'GENERO_CONTACTO': 'GENDER_AVAILABLE',
    'EDAD_COMPRA': 'AGE_AT_PURCHASE',
    'EDAD_RANGO_COMPRA': 'AGE_RANGE',
    'PAIS_CONTACTO': 'COUNTRY_CONTACT_ID',
    'PAIS_CONTACTO_DESC': 'COUNTRY_CONTACT_DESC',
    'CIUDAD_CONTACTO': 'CITY_CONTACT',
    'IDIOMA_CONTACTO': 'LANGUAGE_CONTACT',
}


In [3]:
# English-named columns removed from the preprocessed frame before analysis.
dropcols = [
    'INVOICE_MONTH_YEAR',
    'ORDER_YEAR',
    'ORDER_MONTH',
    'SALES_CHANNEL_ID',
    'SALES_CHANNEL_DESC',
    'AGE_AVAILABLE',
    'MATERIAL_ID',
    'COUNTRY_CONTACT_DESC',
]

In [4]:
# Country code -> country description lookup, built from the distinct
# (code, description) pairs present in the raw data. If a code appears with
# more than one description, the last pair encountered wins.
country_dict = dict(
    data[['NUMERO_DEUDOR_PAIS_ID', 'NUMERO_DEUDOR_PAIS_DESC']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)
country_dict


{'DE': 'GERMANY',
 'RO': 'RUMANÍA',
 'ES': 'SPAIN',
 'GB': 'UNITED KINGDOM OF GREAT BRITAIN AND NORTHERN IRELAND',
 'IT': 'ITALY',
 'US': 'UNITED STATES',
 'AU': 'AUSTRALIA',
 'PT': 'PORTUGAL',
 'BG': 'BULGARIA',
 'BE': 'BÉLGICA',
 'GR': 'GREECE',
 'CH': 'SWITZERLAND',
 'PL': 'POLAND',
 'TR': 'TURKEY',
 'JP': 'JAPAN',
 'FR': 'FRANCE',
 'IE': 'IRELAND',
 'CA': 'CANADA',
 'AT': 'AUSTRIA',
 'MX': 'MEXICO',
 'LT': 'LITUANIA',
 'RU': 'RUSSIAN FEDERATION',
 'NL': 'HOLLAND',
 'DK': 'DENMARK',
 'HR': 'CROACIA',
 'IL': 'ISRAEL',
 'CY': 'CHIPRE',
 'SI': 'ESLOVENIA',
 'HU': 'HUNGRÍA',
 'IS': 'ISLANDIA',
 'FI': 'FINLANDIA',
 'LU': 'LUXEMBURGO',
 'LV': 'LETONIA',
 'MT': 'MALTA',
 'CZ': 'REPÚBLICA CHECA',
 'EE': 'ESTONIA',
 'BA': 'BOSNIA-HERZEGOVINA',
 'SE': 'SWEDEN',
 'TH': 'THAILAND',
 'SK': 'ESLOVAQUIA',
 'NO': 'NORUEGA',
 'SG': 'SINGAPUR',
 'ZA': 'SOUTH AFRICA',
 'HK': 'HONG KONG',
 'KR': 'KOREA REPUBLIC OF',
 'NZ': 'NUEVA ZELANDA',
 'CN': 'CHINA',
 'TW': 'TAIWAN',
 'AD': 'ANDORRA',
 'MC': 'MÓNA

In [5]:
def preprocess(d, column_map=None):
    """Return a cleaned copy of the raw sales frame with English column names.

    The input frame is not modified; all work happens on the renamed copy.

    Steps applied after renaming:
      * REMOVABLE_SOLE becomes a boolean (True only for 'Extraible').
      * INVOICE_DOCUMENT_CLASS_ID codes are replaced by readable labels.
      * INVOICE_DATE is parsed with ``pd.to_datetime``.
      * AGE_AT_PURCHASE values equal to 0 are replaced by NaN.
      * ZIPPER and LACES become booleans.

    Parameters
    ----------
    d : pandas.DataFrame
        Raw frame using the original column headers.
    column_map : dict, optional
        Mapping from original to English column names. When omitted, the
        notebook-level ``english_cols`` mapping is used, so existing
        ``preprocess(data)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The renamed and cleaned copy.

    Raises
    ------
    KeyError
        If INVOICE_DOCUMENT_CLASS_ID contains a value (including a missing
        value) that has no label in the code table below.
    """
    if column_map is None:
        # Fall back to the notebook-level mapping to stay backward-compatible.
        column_map = english_cols
    d_copy = d.rename(columns=column_map)

    # NOTE(review): only one spelling is matched here, whereas ZIPPER and
    # LACES below accept two each — confirm no other variant occurs in the data.
    d_copy['REMOVABLE_SOLE'] = d_copy['REMOVABLE_SOLE'].eq('Extraible')

    invoice_id_dict = {'ZTON': 'Sale', 'ZDVN': 'Return', 'ZDAN': 'Cancelled Return'}
    doc_class = d_copy['INVOICE_DOCUMENT_CLASS_ID']
    unmapped = doc_class[~doc_class.isin(list(invoice_id_dict))]
    if not unmapped.empty:
        # Keep failing loudly on unexpected codes, but report all of them at once.
        raise KeyError(
            'Unknown INVOICE_DOCUMENT_CLASS_ID value(s): {}'.format(unmapped.unique().tolist())
        )
    d_copy['INVOICE_DOCUMENT_CLASS_ID'] = doc_class.map(invoice_id_dict)

    d_copy['INVOICE_DATE'] = pd.to_datetime(d_copy['INVOICE_DATE'])

    # An age of 0 is replaced by NaN rather than kept as a numeric value.
    d_copy['AGE_AT_PURCHASE'] = d_copy['AGE_AT_PURCHASE'].replace(0, np.nan)

    # Anything other than the listed labels (including missing values) is False.
    d_copy['ZIPPER'] = d_copy['ZIPPER'].isin(['SI', 'YES'])
    d_copy['LACES'] = d_copy['LACES'].isin(['With laces', 'Con cordones'])

    return d_copy



In [None]:
# Translate/clean the raw frame, then discard the columns listed in dropcols.
cleandata = preprocess(data).drop(columns=dropcols)



In [None]:
# Show the column labels of cleandata.
cleandata.columns


In [None]:
# List the distinct SEASON_DESC values present in cleandata.
cleandata['SEASON_DESC'].unique()

In [9]:
# Show the column labels of cleandata.
cleandata.columns


Index(['INVOICE_ID', 'INVOICE_POSITION_ID', 'CUSTOMER_ID',
       'INVOICE_DOCUMENT_CLASS_ID', 'INVOICE_DATE', 'NET_SALES_EUR',
       'SEASON_ID', 'SEASON_DESC', 'PRODUCT_ID', 'SIZE', 'SALE_COUNTRY_ID',
       'SALE_COUNTRY_DESC', 'SALES_FILTER', 'PRODUCT_HIERARCHY_ID',
       'PRODUCT_GROUP_ID', 'PRODUCT_GROUP', 'CONCEPT', 'LINE',
       'PRODUCT_GENDER', 'PRODUCT_CATEGORY', 'PRODUCT_TYPE', 'COLOR_INTERN',
       'COLOR_EXTERN', 'ZIPPER', 'LACES', 'SOLE_TYPE', 'SOLE_SUBTYPE',
       'REMOVABLE_SOLE', 'CONTACT', 'GENDER_AVAILABLE', 'AGE_AT_PURCHASE',
       'AGE_RANGE', 'COUNTRY_CONTACT_ID', 'CITY_CONTACT', 'LANGUAGE_CONTACT'],
      dtype='object')

In [8]:
# List the distinct SEASON_DESC values present in cleandata.
cleandata['SEASON_DESC'].unique()

array(['89 - F/W 2019', '85 - F/W 2017', '88 - S/S 2019', '90 - S/S 2020',
       '86 - S/S 2018', '87 - F/W 2018', '91 - F/W 2020'], dtype=object)