In [1]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('..')))

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()  # Carga las variables del archivo .env

config = {
    'host': os.getenv('HOST'),
    'user': os.getenv('USER'),
    'password': os.getenv('PASSWORD'),
    'database': os.getenv('DATABASE'),
    'port': os.getenv('PORT')
}

In [3]:
import mysql.connector

connection = mysql.connector.connect(**config)


In [4]:
cursor = connection.cursor()


<center><h1><b>Generación Tablas</b></h1</center>

In [None]:
# Ejecuta DROP y luego la generación de la tabla para reproducibilidad
cursor.execute("DROP TABLE IF EXISTS caso_5.weekly_sales;")
cursor.execute("""
CREATE TABLE caso_5.weekly_sales (
    week_date VARCHAR(7),
    region VARCHAR(13),
    platform VARCHAR(7),
    segment VARCHAR(4),
    customer_type VARCHAR(8),
    transactions INT,
    sales INT
);
""")
connection.commit()

In [None]:
from utils.funciones_utiles import load_text_as_tuples, verifica_tabla

cursor.executemany(
    '''INSERT INTO caso_5.weekly_sales VALUES (%s, %s, %s, %s, %s, %s, %s)''', 
    load_text_as_tuples('datos_tablas/data_weekly_sales.txt'))

connection.commit()


In [8]:
verifica_tabla('weekly_sales', connection)

Unnamed: 0,week_date,region,platform,segment,customer_type,transactions,sales
0,31/8/20,ASIA,Retail,C3,New,120631,3656163
1,31/8/20,ASIA,Retail,F1,New,31574,996575
2,31/8/20,USA,Retail,,Guest,529151,16509610
3,31/8/20,EUROPE,Retail,C1,New,4517,141942
4,31/8/20,AFRICA,Retail,C2,New,58046,1758388


---

## **REVISION**

## Revisando nulos

In [17]:
query = '''
SELECT 
    SUM(CASE WHEN week_date IS NULL OR LOWER(week_date) = 'null' THEN 1 ELSE 0 END) AS week_date_nulls,
    SUM(CASE WHEN region IS NULL OR LOWER(region) = 'null' THEN 1 ELSE 0 END) AS region_nulls,
    SUM(CASE WHEN platform IS NULL OR LOWER(platform) = 'null' THEN 1 ELSE 0 END) AS platform_nulls,
    SUM(CASE WHEN segment IS NULL OR LOWER(segment) = 'null' THEN 1 ELSE 0 END) AS segment_nulls,
    SUM(CASE WHEN customer_type IS NULL OR LOWER(customer_type) = 'null' THEN 1 ELSE 0 END) AS customer_type_nulls,
    SUM(CASE WHEN transactions IS NULL OR transactions = 'null' THEN 1 ELSE 0 END) AS transactions_nulls,
    SUM(CASE WHEN sales IS NULL OR sales = 'null' THEN 1 ELSE 0 END) AS sales_nulls
FROM caso_5.weekly_sales;


'''


pd.read_sql_query(query, connection)

  pd.read_sql_query(query, connection)


Unnamed: 0,week_date_nulls,region_nulls,platform_nulls,segment_nulls,customer_type_nulls,transactions_nulls,sales_nulls
0,0.0,0.0,0.0,3024.0,0.0,0.0,1.0


🎇 **Insight:**

Dentro de todas las columnas existen dos que poseen datos nulos, `segment` y `sales_nulls` los cuales consisten en NULL o null como str. Ahora dentro de todas estas columnas podrian existir mas datos invalidos los que deberán ser revisados.

In [21]:
query = '''
SELECT 
    column_name,
    data_type
FROM information_schema.columns
WHERE table_schema = 'caso_5'
    AND table_name = 'weekly_sales';

'''


pd.read_sql_query(query, connection)

  pd.read_sql_query(query, connection)


Unnamed: 0,COLUMN_NAME,DATA_TYPE
0,week_date,varchar
1,region,varchar
2,platform,varchar
3,segment,varchar
4,customer_type,varchar
5,transactions,int
6,sales,int


🎇 **Insight:**

Las columnas parecen estar bien a excepción de week_date que el tipo de dato es VARCHAR y deberia ser DATE.

In [None]:
query = '''
    SELECT DISTINCT 
        region 
    FROM weekly_sales
    UNION 

    SELECT '----platform_unicos----'

    UNION 

    SELECT DISTINCT 
        platform
    FROM weekly_sales
    UNION 

    SELECT '----segment_unicos----'

    UNION 

    SELECT DISTINCT 
        segment
    FROM weekly_sales

    UNION 

    SELECT '----customer_type_unicos----'

    UNION 

    SELECT DISTINCT 
        customer_type
    FROM weekly_sales;

'''


pd.read_sql_query(query, connection)

  pd.read_sql_query(query, connection)


Unnamed: 0,region
0,ASIA
1,USA
2,EUROPE
3,AFRICA
4,CANADA
5,OCEANIA
6,SOUTH AMERICA
7,----platform_unicos----
8,Retail
9,Shopify


🎇 **Insight:**

Las features categoricas sin problemas son `platform` y `customer_type`.

Por otro lado las features con problemas son 

`region`:

- Todos los registros en mayus
- Contiene registros de paises y deberian ser todos región ( ej USA/CANADA deberia ser AMERICA o NORTH AMERICA)


`segment_unicos`: 

- Contiene dato nullo representado por la string 'null'


In [41]:
query = '''
    SELECT 
        MIN(transactions) AS minimo_transaction,
        MAX(transactions) AS maximo_transaction, 
        MIN(sales) AS minimo_sales,
        MAX(sales) AS maximo_sales
    FROM
        weekly_sales
'''


pd.read_sql_query(query, connection)

  pd.read_sql_query(query, connection)


Unnamed: 0,minimo_transaction,maximo_transaction,minimo_sales,maximo_sales
0,1,2578158,0,69763805


<center><h1><b>Preguntas</b></h1</center>
