# **Pandas**

### **Librerías**

In [1]:
import pandas as pd
import numpy as np

### **Datos**

In [2]:
# Data para el producto A

df_a = pd.DataFrame({
    'Month':pd.date_range(
        start = '01-01-2012',
        end = '31-12-2022',
        freq = 'MS'
    ),
    'Quotes':np.random.randint(
        low = 1_000_000,
        high = 2_500_000,
        size = 132
    ),
    'Numbers':np.random.randint(
        low = 300_000,
        high = 500_000,
        size = 132
    ),
    'Amounts':np.random.randint(
        low = 750_000,
        high = 1_250_000,
        size = 132
    )
})

df_a['Product'] = 'A'

# Data para el producto B

df_b = pd.DataFrame({
    'Month':pd.date_range(
        start = '01-01-2012',
        end = '31-12-2022',
        freq = 'MS'
    ),
    'Quotes':np.random.randint(
        low = 100_000,
        high = 800_000,
        size = 132
    ),
    'Numbers':np.random.randint(
        low = 10_000,
        high = 95_000,
        size = 132
    ),
    'Amounts':np.random.randint(
        low = 450_000,
        high = 750_000,
        size = 132
    )
})

df_b['Product'] = 'B'

In [3]:
# Combinamos los datos
df = pd.concat([df_a, df_b], axis=0)
df.sort_values(by='Month', inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Month,Quotes,Numbers,Amounts,Product
0,2012-01-01,1297877,427302,893590,A
1,2012-01-01,557071,44383,648487,B
2,2012-02-01,2493819,358599,1089789,A
3,2012-02-01,150071,31385,738013,B
4,2012-03-01,1307998,359458,807437,A


In [4]:
# Columnas calculadas 
df['Average_Sale'] = df['Amounts'] / df['Numbers']
df['Product_Conversion'] = df['Numbers'] / df['Quotes']

In [5]:
df.head()

Unnamed: 0,Month,Quotes,Numbers,Amounts,Product,Average_Sale,Product_Conversion
0,2012-01-01,1297877,427302,893590,A,2.091238,0.329232
1,2012-01-01,557071,44383,648487,B,14.611157,0.079672
2,2012-02-01,2493819,358599,1089789,A,3.039019,0.143795
3,2012-02-01,150071,31385,738013,B,23.514832,0.209134
4,2012-03-01,1307998,359458,807437,A,2.246262,0.274815


### **1. Ajuste del formato**

In [9]:
# Ajustamos los formatos de los números
df.head().style.format({
    # Ajustamos el formato de fecha
    # 'Month': '{:%Y-%m}',
    # 'Month': '{:%B-%Y}',
    'Month': '{:%b-%Y}',

    # Ajustamos el formato de los números
    'Quotes': '{:,.0f}',
    'Numbers': '{:,.0f}',
    'Amounts': '${:,.0f}',
    # 'Average_Sale': '{:,.2f} ($)',
    'Average_Sale': '${:,.2f}',
    'Product_Conversion': '{:.2%}',
})
# .hide()

Unnamed: 0,Month,Quotes,Numbers,Amounts,Product,Average_Sale,Product_Conversion
0,Jan-2012,1297877,427302,"$893,590",A,$2.09,32.92%
1,Jan-2012,557071,44383,"$648,487",B,$14.61,7.97%
2,Feb-2012,2493819,358599,"$1,089,789",A,$3.04,14.38%
3,Feb-2012,150071,31385,"$738,013",B,$23.51,20.91%
4,Mar-2012,1307998,359458,"$807,437",A,$2.25,27.48%


### **2. Ajuste del formato condicional**

##### **Filtrar por una categoría**

In [53]:
def highlight_product(row_data, product, color='yellow'):
    """
    Resalta las filas del DataFrame basándose en el valor de la columna 'Product'.

    Args:
        row_data (pd.Series): Una fila del DataFrame.
        product (any): La columna con la cual se resaltarán las filas.
        color (str, optional): El color para resaltar las filas. Por defecto, 'yellow'.

    Returns:
        List[str]: Lista de strings con estilo CSS para resaltar la fila.
    """
    
    # Crea una nueva Serie de pandas llamada 'row' con valores booleanos inicializados en False
    row = pd.Series(data=False, index=row_data.index)

    # Establece el valor en True para la columna 'Product' si el valor en esa fila es igual al producto proporcionado
    row['Product'] = row_data.loc['Product'] == product

    # Retorna una lista de strings con el formato 'background-color: {color}' si al menos un valor en 'row' es True
    return [f'background-color: {color}' if row.any() else '' for value in row]


In [54]:
# Aplicamos la condición sobre las categorías
df.head().style.apply(
    highlight_product, product='A', color='#18314f', axis=1
).format({
    # Ajustamos el formato de fecha
    'Month': '{:%b-%Y}',

    # Ajustamos el formato de los números
    'Quotes': '{:,.0f}',
    'Numbers': '{:,.0f}',
    'Amounts': '${:,.0f}',
    'Average_Sale': '${:,.2f}',
    'Product_Conversion': '{:.2%}',
}).hide()

Month,Quotes,Numbers,Amounts,Product,Average_Sale,Product_Conversion
Jan-2012,1297877,427302,"$893,590",A,$2.09,32.92%
Jan-2012,557071,44383,"$648,487",B,$14.61,7.97%
Feb-2012,2493819,358599,"$1,089,789",A,$3.04,14.38%
Feb-2012,150071,31385,"$738,013",B,$23.51,20.91%
Mar-2012,1307998,359458,"$807,437",A,$2.25,27.48%


##### **Filtrar por valores numéricos**

In [28]:
def highlight_average_sale(s, sale_threshold=5, color='yellow'):
    row = pd.Series(data=False, index=s.index)
    row['Product'] = s.loc['Average_Sale'] > sale_threshold

    return [f'background-color: {color}' if row.any() else '' for value in row]

In [33]:
# Aplicamos la condición sobre valores numéricos
df.head().style.apply(
    highlight_average_sale, sale_threshold=3, axis=1, color='#18314f'
).format({
    # Ajustamos el formato de fecha
    'Month': '{:%b-%Y}',

    # Ajustamos el formato de los números
    'Quotes': '{:,.0f}',
    'Numbers': '{:,.0f}',
    'Amounts': '${:,.0f}',
    'Average_Sale': '${:,.2f}',
    'Product_Conversion': '{:.2%}',
}).hide()

Month,Quotes,Numbers,Amounts,Product,Average_Sale,Product_Conversion
Jan-2012,1297877,427302,"$893,590",A,$2.09,32.92%
Jan-2012,557071,44383,"$648,487",B,$14.61,7.97%
Feb-2012,2493819,358599,"$1,089,789",A,$3.04,14.38%
Feb-2012,150071,31385,"$738,013",B,$23.51,20.91%
Mar-2012,1307998,359458,"$807,437",A,$2.25,27.48%


In [4]:
marketing = pd.read_csv('./data/DirectMarketing.csv')

In [5]:
marketing.head()

Unnamed: 0,Age,Gender,OwnHome,Married,Location,Salary,Children,History,Catalogs,AmountSpent
0,Old,Female,Own,Single,Far,47500,0,High,6,755
1,Middle,Male,Rent,Single,Close,63600,0,High,6,1318
2,Young,Female,Rent,Single,Close,13500,0,Low,18,296
3,Middle,Male,Own,Married,Close,85600,1,High,18,2436
4,Middle,Female,Own,Single,Close,68400,0,High,12,1304


In [14]:
groceries = pd.read_csv('./data/Groceries_dataset.csv')

In [15]:
groceries.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


### 2. Cambiar los tipos de datos

In [16]:
groceries.dtypes

Member_number       int64
Date               object
itemDescription    object
dtype: object

In [12]:
groceries['Date'] = groceries['Date'].astype('datetime64')

In [13]:
groceries.dtypes

Member_number               int64
Date               datetime64[ns]
itemDescription            object
dtype: object

### 3. Cambiar el formato a fecha

In [17]:
groceries['Date'] = pd.to_datetime(groceries['Date'])

In [19]:
groceries.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,2015-07-21,tropical fruit
1,2552,2015-05-01,whole milk
2,2300,2015-09-19,pip fruit
3,1187,2015-12-12,other vegetables
4,3037,2015-01-02,whole milk
