# E-Commerce project

Project that analyzes fictional data about... XXXXXXXXXXXXXXXX

# Extraction and Transformation

## Creating the requiered data

In [1]:
!pip install faker

import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Inicialización
fake = Faker()
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Fechas límite comunes
start_date_2022 = datetime(2022, 1, 1).date()
end_date_2024 = datetime(2024, 12, 31).date()

# -------------------------------------------
# 1. PRODUCTS TABLE
# -------------------------------------------
categories = ['Home', 'Electronics', 'Clothing', 'Books', 'Toys', 'Sports']
subcategories = {
    'Home': ['Furniture', 'Kitchen', 'Decor'],
    'Electronics': ['Mobile', 'Laptop', 'Audio'],
    'Clothing': ['Men', 'Women', 'Kids'],
    'Books': ['Fiction', 'Non-Fiction', 'Comics'],
    'Toys': ['Puzzles', 'Action Figures', 'Dolls'],
    'Sports': ['Fitness', 'Outdoor', 'Team Sports']
}

products = []
for i in range(1, 101):
    cat = random.choice(categories)
    subcat = random.choice(subcategories[cat])
    cost = round(random.uniform(5, 200), 2)
    base_price = round(cost * random.uniform(1.2, 2.0), 2)
    stock = random.randint(10, 500)
    products.append([i, f"{fake.word().capitalize()} {subcat}", cat, subcat, cost, base_price, stock])

df_products = pd.DataFrame(products, columns=[
    'product_id', 'product_name', 'category', 'subcategory', 'cost', 'base_price', 'stock'
])
df_products.to_csv("products.csv", index=False)

# -------------------------------------------
# 2. CUSTOMERS TABLE
# -------------------------------------------
customers = []
for i in range(1, 2001):
    birth = fake.date_of_birth(minimum_age=18, maximum_age=70)
    signup = fake.date_between(start_date=start_date_2022, end_date=end_date_2024)
    income_level = random.choices(['low', 'medium', 'high'], weights=[0.3, 0.5, 0.2])[0]
    customers.append([
        i, fake.first_name(), fake.last_name(), random.choice(['M', 'F']),
        birth, signup, fake.city(), fake.country(), income_level
    ])

df_customers = pd.DataFrame(customers, columns=[
    'customer_id', 'first_name', 'last_name', 'gender',
    'birth_date', 'signup_date', 'city', 'country', 'income_level'
])
df_customers.to_csv("customers.csv", index=False)

# -------------------------------------------
# 3. STORES TABLE
# -------------------------------------------
store_types = ['physical', 'online']
regions = ['North', 'South', 'East', 'West', 'Central']

stores = []
for i in range(1, 21):
    stores.append([
        i, f"Store {i}", random.choice(store_types),
        random.choice(regions), fake.city()
    ])

df_stores = pd.DataFrame(stores, columns=[
    'store_id', 'store_name', 'store_type', 'region', 'city'
])
df_stores.to_csv("stores.csv", index=False)

# -------------------------------------------
# 4. EMPLOYEES TABLE
# -------------------------------------------
positions = ['Sales Rep', 'Manager', 'Cashier']

employees = []
for i in range(1, 151):
    employees.append([
        i, fake.name(), fake.date_between(start_date=datetime(2018,1,1).date(), end_date=end_date_2024),
        random.choice(positions), random.randint(1, 20)
    ])

df_employees = pd.DataFrame(employees, columns=[
    'employee_id', 'full_name', 'hire_date', 'position', 'store_id'
])
df_employees.to_csv("employees.csv", index=False)

# -------------------------------------------
# 5. CAMPAIGNS TABLE
# -------------------------------------------
channels = ['Email', 'Social Media', 'TV', 'Radio', 'In-store']

campaigns = []
for i in range(1, 51):
    start = fake.date_between(start_date=datetime(2022,1,1).date(), end_date=end_date_2024 - timedelta(days=90))
    end = start + timedelta(days=random.randint(10, 90))
    campaigns.append([
        i, f"Campaign {i}", random.choice(channels),
        start, end, round(random.uniform(1000, 10000), 2)
    ])

df_campaigns = pd.DataFrame(campaigns, columns=[
    'campaign_id', 'campaign_name', 'channel', 'start_date', 'end_date', 'budget'
])
df_campaigns.to_csv("campaigns.csv", index=False)

# -------------------------------------------
# 6. SALES TABLE
# -------------------------------------------
sales = []
for i in range(1, 20001):
    sale_date = fake.date_between(start_date=start_date_2022, end_date=end_date_2024)
    quantity = random.randint(1, 5)
    product = df_products.sample(1).iloc[0]
    unit_price = round(product['base_price'] * random.uniform(0.9, 1.1), 2)
    discount = round(random.choice([0, 0.05, 0.1, 0.15]), 2)
    sales.append([
        i,
        random.randint(1, 2000),  # customer_id
        product['product_id'],
        random.randint(1, 20),    # store_id
        random.randint(1, 150),   # employee_id
        random.randint(1, 50),    # campaign_id
        sale_date,
        quantity,
        unit_price,
        discount,
        random.choice(['credit_card', 'cash', 'paypal', 'bank_transfer'])
    ])

df_sales = pd.DataFrame(sales, columns=[
    'sale_id', 'customer_id', 'product_id', 'store_id',
    'employee_id', 'campaign_id', 'sale_date',
    'quantity', 'unit_price', 'discount', 'payment_method'
])
df_sales.to_csv("sales.csv", index=False)

# -------------------------------------------
# 7. RETURNS TABLE
# -------------------------------------------
returns = []
returned_sales = df_sales.sample(frac=0.075, random_state=42)

for idx, row in returned_sales.iterrows():
    reason = random.choice(['defective', 'wrong size', 'changed mind', 'late delivery'])
    refund = round(row['unit_price'] * row['quantity'] * (1 - row['discount']), 2)
    return_date = pd.to_datetime(row['sale_date']) + timedelta(days=random.randint(1, 30))
    returns.append([
        idx + 1, row['sale_id'], return_date.date(), reason, refund
    ])

df_returns = pd.DataFrame(returns, columns=[
    'return_id', 'sale_id', 'return_date', 'return_reason', 'refund_amount'
])
df_returns.to_csv("returns.csv", index=False)

# -------------------------------------------
# 8. DATES TABLE
# -------------------------------------------
date_range = pd.date_range(start='2022-01-01', end='2024-12-31')
df_dates = pd.DataFrame({
    'date': date_range,
    'year': date_range.year,
    'month': date_range.month,
    'quarter': date_range.quarter,
    'week': date_range.isocalendar().week,
    'weekday': date_range.day_name(),
    'is_weekend': date_range.dayofweek >= 5
})
df_dates.to_csv("dates.csv", index=False)

print("✅ All CSVs generated and saved successfully!")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
✅ All CSVs generated and saved successfully!


## Extracting and transforming all tables (SQL)

### Campaigns

In [2]:
campaigns = _dntk.execute_sql(
  'SELECT *\n,sum(budget) over(PARTITION BY channel) as channel_total_budget -- Adds column with total budget per channel\n,count(campaign_id) over(PARTITION BY channel) as channel_campaigns_count -- Adds column with total number of campaigns per channel\n,sum(budget) over() as total_budget -- calculates total budget for all campaigns\n,round(budget / sum(budget) over() * 100,2) as global_budget_perc -- calculates % of every campaign\'s budget vs total budget\n,dense_rank() over(PARTITION BY channel ORDER BY budget desc) as ranking_per_channel -- adds ranking of budget per channel\n,end_date - start_date as duration_days -- calcultaes duration in days for every campaign\nFROM df_campaigns',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)
campaigns

Unnamed: 0,campaign_id,campaign_name,channel,start_date,end_date,budget,channel_total_budget,channel_campaigns_count,total_budget,global_budget_perc,ranking_per_channel,duration_days
0,38,Campaign 38,Social Media,2024-01-07,2024-03-06,9818.14,59857.21,12,270859.29,3.62,1,59
1,22,Campaign 22,Social Media,2022-07-19,2022-09-14,7819.6,59857.21,12,270859.29,2.89,2,57
2,30,Campaign 30,Social Media,2022-11-24,2023-02-21,7255.64,59857.21,12,270859.29,2.68,3,89
3,12,Campaign 12,Social Media,2023-07-11,2023-10-02,6050.51,59857.21,12,270859.29,2.23,4,83
4,41,Campaign 41,Social Media,2023-08-12,2023-09-14,5505.86,59857.21,12,270859.29,2.03,5,33
5,10,Campaign 10,Social Media,2023-04-01,2023-04-27,5328.3,59857.21,12,270859.29,1.97,6,26
6,50,Campaign 50,Social Media,2023-08-04,2023-09-11,4076.08,59857.21,12,270859.29,1.5,7,38
7,6,Campaign 6,Social Media,2023-09-14,2023-10-12,4025.51,59857.21,12,270859.29,1.49,8,28
8,13,Campaign 13,Social Media,2024-01-04,2024-03-21,3180.84,59857.21,12,270859.29,1.17,9,77
9,28,Campaign 28,Social Media,2024-08-19,2024-11-05,2834.21,59857.21,12,270859.29,1.05,10,78


In [3]:
campaigns = _dntk.execute_sql(
'''
SELECT *
    ,SUM(budget) OVER(PARTITION BY channel) AS channel_total_budget -- Adds column with total budget per channel
    ,COUNT(campaign_id) OVER(PARTITION BY channel) AS channel_campaigns_count -- Adds column with total number of campaigns per channel
    ,SUM(budget) OVER() AS total_budget -- Calculates total budget for all campaigns
    ,ROUND(budget / SUM(budget) OVER() * 100, 2) AS global_budget_perc -- Calculates % of every campaign's budget vs total budget
    ,DENSE_RANK() OVER(PARTITION BY channel ORDER BY budget DESC) AS ranking_per_channel -- Adds ranking of budget per channel
    ,end_date - start_date AS duration_days -- Calculates duration in days for every campaign
FROM df_campaigns
''',
'SQL_DEEPNOTE_DATAFRAME_SQL',
return_variable_type='dataframe'
)


### Sales

In [4]:
sales = _dntk.execute_sql(
  'with s as \n\n(SELECT *\n,round((quantity * unit_price) * (1 - discount),2) as total_revenue -- calculates total revenue\n,round(unit_price * (1 - discount),2) as unit_price_after_discount -- calculates unit price minus discount\n,count(payment_method) over(PARTITION BY payment_method) as method_usage_count -- calculates number of times every payment method is used\n,count(customer_id) over(PARTITION BY customer_id) as customer_purchase_count -- calculates number of purchases made by every customer\n,first(sale_date) over(PARTITION by customer_id order by sale_date asc) as date_first_purchase -- brings date of first purchase of every customer\n,row_number() over(PARTITION BY customer_id ORDER BY sale_date asc) as purchase_secuence_customer -- ranks number of purchases pero ascending date\nFROM df_sales)\n\nselect *\n,datediff(\'day\',date_first_purchase, sale_date) as days_since_first_purchase -- calculates number of days of every new purchase since the first purchase, for every customer \n,round(sum(total_revenue) over(PARTITION by customer_id order by sale_date asc),2) as cumulative_revenue_per_customer -- calculates cumulative total spent per customer\nfrom s',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)
sales

Unnamed: 0,sale_id,customer_id,product_id,store_id,employee_id,campaign_id,sale_date,quantity,unit_price,discount,payment_method,total_revenue,unit_price_after_discount,method_usage_count,customer_purchase_count,date_first_purchase,purchase_secuence_customer,days_since_first_purchase,cumulative_revenue_per_customer
0,6921,11,3,1,29,14,2022-05-01,5,102.64,0.00,bank_transfer,513.20,102.64,5139,10,2022-05-01,1,0,513.20
1,14641,11,27,16,95,25,2023-03-17,5,208.18,0.00,cash,1040.90,208.18,4997,10,2022-05-01,2,320,1554.10
2,19307,11,18,1,121,17,2023-04-12,1,312.46,0.10,bank_transfer,281.21,281.21,5139,10,2022-05-01,3,346,1835.31
3,14805,11,62,4,79,4,2023-09-19,3,328.55,0.10,paypal,887.09,295.70,4894,10,2022-05-01,4,506,3718.72
4,13658,11,76,20,43,29,2023-09-19,4,262.19,0.05,credit_card,996.32,249.08,4970,10,2022-05-01,5,506,3718.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,8020,1988,35,9,43,22,2024-09-10,2,67.53,0.15,credit_card,114.80,57.40,4970,17,2022-09-07,13,734,6442.30
19996,12447,1988,36,1,42,48,2024-09-30,3,293.25,0.00,credit_card,879.75,293.25,4970,17,2022-09-07,14,754,7322.05
19997,19263,1988,52,1,56,38,2024-10-11,4,315.52,0.05,bank_transfer,1198.98,299.74,5139,17,2022-09-07,15,765,8521.03
19998,3707,1988,3,10,144,44,2024-11-23,2,101.27,0.15,bank_transfer,172.16,86.08,5139,17,2022-09-07,16,808,8693.19


### Customers

In [5]:
customers = _dntk.execute_sql(
  'with customers as (\nSELECT * \n,round((today() - birth_date) / 365,1) as customer_age -- calculates customer\'s age based on todays date \n,CAST(FLOOR(EXTRACT(YEAR FROM birth_date) / 10) * 10 AS INTEGER) || \'s\' AS customer_decade -- brings customer\'s decade of birth\n,EXTRACT(year from signup_date) as sign_up_year -- brings signup year of every customer\n,datediff(\'year\', signup_date, today()) as years_since_signup -- calculates years since signup\n,datediff(\'year\', birth_date, signup_date ) as age_at_signup -- calculates customer\'s age at signup\n,round((count(customer_id) over(PARTITION BY income_level) / count(customer_id) over()) * 100,1) as perc_income_level -- calculates % of number customers of every income level against total number of customers\n,count(customer_id) over(PARTITION BY country) as customers_per_country -- calculates number of clients in each country\n,round(count(customer_id) over(PARTITION BY country) / count(customer_id) over() * 100,1) as perc_customers_country_vs_total -- calculates % of customers per country against total number customers\n,count(customer_id) over(PARTITION BY income_level, country) as customers_per_income_per_country\n,round(count(customer_id) over(PARTITION BY income_level, country) /  count(customer_id) over(PARTITION BY country) * 100,1) as perc_customers_per_income_per_country\n,rank() OVER(PARTITION BY country ORDER BY signup_date asc) as rank_customer_signup -- ranks customers per signup date\n,case \n    when round((today() - birth_date) / 365,1) >= 18 and round((today() - birth_date) / 365,1) <= 27 then \'Gen Z\'\n    when round((today() - birth_date) / 365,1) > 27 and round((today() - birth_date) / 365,1) <= 42 then \'Millenial\'\n    when round((today() - birth_date) / 365,1) > 42 and round((today() - birth_date) / 365,1) <= 60 then \'Gen X\'\n    when round((today() - birth_date) / 365,1) > 60 then \'Boomer\'\nend as customers_cohort -- groups by customer\'s age in generational categories \n,if(datediff(\'year\', signup_date, today()) < 2,\'No\', \'Yes\') as is_loyal -- defines loyal customers for those over 1 year since signup\n\nFROM df_customers\n)\n\nselect *\n,count(customer_id) over(PARTITION BY customers_cohort) as customers_per_cohort -- counts customers per generational cohort\n,round(count(customer_id) over(PARTITION BY customers_cohort) / count(customer_id) over() * 100,1) as perc_customers_per_cohort -- calculates % of customers per cohort vs total customers\nfrom customers',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)
customers

Unnamed: 0,customer_id,first_name,last_name,gender,birth_date,signup_date,city,country,income_level,customer_age,...,perc_income_level,customers_per_country,perc_customers_country_vs_total,customers_per_income_per_country,perc_customers_per_income_per_country,rank_customer_signup,customers_cohort,is_loyal,customers_per_cohort,perc_customers_per_cohort
0,945,Jamie,Gallegos,F,2005-04-06,2022-10-29,Millerside,New Caledonia,high,20.0,...,20.1,13,0.7,6,46.2,5,Gen Z,Yes,345,17.3
1,1176,Victor,Wood,F,2000-08-27,2022-10-24,Lisaland,Solomon Islands,high,24.6,...,20.1,9,0.5,4,44.4,4,Gen Z,Yes,345,17.3
2,281,Patricia,Ellison,F,2006-01-21,2024-12-02,East Rachel,Puerto Rico,medium,19.2,...,50.8,14,0.7,5,35.7,13,Gen Z,No,345,17.3
3,513,Desiree,Hernandez,F,2001-08-21,2023-08-14,Walkershire,Saint Martin,medium,23.6,...,50.8,18,0.9,9,50.0,10,Gen Z,Yes,345,17.3
4,697,Joe,Park,F,1999-11-27,2024-08-05,Garrettfurt,Iraq,high,25.4,...,20.1,8,0.4,5,62.5,8,Gen Z,No,345,17.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,58,Stephanie,Barnes,F,1994-03-11,2024-07-23,South Jeffrey,Jordan,low,31.1,...,29.2,11,0.5,3,27.3,10,Millenial,No,562,28.1
1996,1783,Micheal,Stephens,M,1991-10-21,2024-09-09,Wellsview,Jordan,high,33.5,...,20.1,11,0.5,3,27.3,11,Millenial,No,562,28.1
1997,802,Stephanie,Mathis,M,1983-11-13,2024-01-26,South Stevenland,Brazil,low,41.4,...,29.2,6,0.3,3,50.0,3,Millenial,No,562,28.1
1998,1676,Patrick,Holmes,M,1983-07-01,2022-06-21,South Andrew,Costa Rica,medium,41.8,...,50.8,6,0.3,4,66.7,1,Millenial,Yes,562,28.1


### Returns

In [6]:
returns = _dntk.execute_sql(
  'with returns as (\nSELECT *\n,count(return_id) over(PARTITION BY return_reason) as count_per_return_reason -- calculates number of returns per ever return reason\n,round(count(return_id) over(PARTITION BY return_reason) / count(return_id) over() * 100,1) as perc_per_return_reason -- calculates the percantage of retruns per return reason\n,round(sum(refund_amount) over(),0) as total_refunded -- calculates the total refunded for the whole table\n,round(refund_amount / sum(refund_amount) over() * 100,2) as perc_return_vs_total  -- calculates the % refunded of each register vs the total refunded\nfrom df_returns\n)\nselect *\n,dense_rank() over(order by count_per_return_reason desc) as ranking_return_reason\nFROM returns',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)
returns

Unnamed: 0,return_id,sale_id,return_date,return_reason,refund_amount,count_per_return_reason,perc_per_return_reason,total_refunded,perc_return_vs_total,ranking_return_reason
0,18863,18863,2022-11-01,late delivery,984.06,395,26.3,642503.0,0.15,1
1,15832,15832,2022-08-25,late delivery,142.75,395,26.3,642503.0,0.02,1
2,9060,9060,2022-07-17,late delivery,315.72,395,26.3,642503.0,0.05,1
3,16293,16293,2024-04-19,late delivery,573.80,395,26.3,642503.0,0.09,1
4,2302,2302,2022-06-04,late delivery,1325.48,395,26.3,642503.0,0.21,1
...,...,...,...,...,...,...,...,...,...,...
1495,737,737,2023-05-05,defective,739.88,355,23.7,642503.0,0.12,4
1496,8137,8137,2022-08-24,defective,1035.43,355,23.7,642503.0,0.16,4
1497,3699,3699,2022-09-15,defective,439.65,355,23.7,642503.0,0.07,4
1498,12385,12385,2024-06-23,defective,226.30,355,23.7,642503.0,0.04,4


### Products

In [7]:
products = _dntk.execute_sql(
  'with products as (\nSELECT * \n,base_price - cost as product_margin -- calculates margin of every product\n,round((base_price - cost) / cost * 100,2) as perc_cost_margin -- calculates % of margin over cost\n,case\n    when stock < 100 then \'Low\'\n    when stock >= 100 and stock < 200 then \'Ok\'\n    else \'High\' end as stock_status -- determines if stock is low, ok or high\n,sum(base_price - cost) over(PARTITION BY category) as total_potential_margin_per_category -- calculates potential margin per every categpry (before actual sales)\n,round(avg(base_price - cost) over(PARTITION BY category),1) as avg_potential_margin_per_category -- calculates average potential margin per every categpry (before actual sales)\nFROM df_products\n)\nselect *\n,if(product_margin >= avg_potential_margin_per_category,\'Above Average\',\'below Average\') as product_margin_category -- classifies every product margin against avg product margin per category\n,dense_rank() over(order by product_margin desc) as ranking_products_per_margin\n,dense_rank() over(partition by category order by product_margin desc) as ranking_products_per_margin_within_category\nfrom products',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)
products

Unnamed: 0,product_id,product_name,category,subcategory,cost,base_price,stock,product_margin,perc_cost_margin,stock_status,total_potential_margin_per_category,avg_potential_margin_per_category,product_margin_category,ranking_products_per_margin,ranking_products_per_margin_within_category
0,90,Blood Women,Clothing,Women,173.55,342.50,152,168.95,97.35,Ok,988.62,54.9,Above Average,3,1
1,83,Drive Kids,Clothing,Kids,177.41,270.57,327,93.16,52.51,High,988.62,54.9,Above Average,19,2
2,75,Training Kids,Clothing,Kids,100.26,192.93,443,92.67,92.43,High,988.62,54.9,Above Average,20,3
3,69,Old Men,Clothing,Men,176.46,263.92,90,87.46,49.56,Low,988.62,54.9,Above Average,25,4
4,91,Husband Kids,Clothing,Kids,104.33,181.91,441,77.58,74.36,High,988.62,54.9,Above Average,31,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,Brother Audio,Electronics,Audio,24.99,44.80,289,19.81,79.27,High,1182.55,59.1,below Average,80,16
96,81,Answer Laptop,Electronics,Laptop,29.85,43.18,470,13.33,44.66,High,1182.55,59.1,below Average,88,17
97,47,Clearly Mobile,Electronics,Mobile,23.90,36.92,226,13.02,54.48,High,1182.55,59.1,below Average,90,18
98,85,Figure Laptop,Electronics,Laptop,9.83,17.59,180,7.76,78.94,Ok,1182.55,59.1,below Average,95,19


### Employees

In [16]:
employees = _dntk.execute_sql(
  'with employees as (\nSELECT * \n,count(employee_id) over(PARTITION BY store_id) as employees_per_store -- calculates number of employees per store\n,datediff(\'year\', hire_date, today()) as years_since_hire -- Calculates years since hire date\n,dense_rank() OVER(order by datediff(\'year\', hire_date, today()) desc) as ranking_years_since_hire -- ranks employees by seniority level\n,round(count(employee_id) OVER(PARTITION BY position) / count(employee_id) over(),2) as perc_position_over_total -- percentage every postion over total \n,count(employee_id) over(partition by store_id, position) as employees_per_position_store\nFROM df_employees\n)\n\nselect *\n,case \n    when years_since_hire <= 2 then \'Junior\'\n    when years_since_hire > 2 and years_since_hire <= 4 then \'Mid\'\n    when years_since_hire > 4 then \'Senior\' end as seniority\n\nFROM employees',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)
employees

Unnamed: 0,employee_id,full_name,hire_date,position,store_id,employees_per_store,years_since_hire,ranking_years_since_hire,perc_position_over_total,employees_per_position_store,seniority
0,144,Curtis Thomas,2023-11-21,Manager,4,7,2,6,0.31,2,Junior
1,94,Nancy Bradford,2023-07-12,Manager,4,7,2,6,0.31,2,Junior
2,19,Paul Smith,2022-09-03,Sales Rep,8,9,3,5,0.30,2,Mid
3,9,Dr. Kylie Hamilton,2018-03-18,Sales Rep,8,9,7,1,0.30,2,Senior
4,10,Rachel Sullivan,2023-05-05,Cashier,14,6,2,6,0.39,3,Junior
...,...,...,...,...,...,...,...,...,...,...,...
145,111,Brian Daniel,2020-07-24,Cashier,9,6,5,3,0.39,3,Senior
146,138,Wanda Walker,2021-02-21,Sales Rep,20,10,4,4,0.30,4,Mid
147,50,Roy Fowler,2024-02-23,Sales Rep,20,10,1,7,0.30,4,Junior
148,21,Mrs. Veronica Hughes MD,2018-07-27,Sales Rep,20,10,7,1,0.30,4,Senior


In [14]:
_dntk.execute_sql(
  'select position, store_id, count()\nfrom employees\ngroup by position, store_id\n',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)

Unnamed: 0,position,store_id,count_star()
0,Sales Rep,8,2
1,Cashier,14,3
2,Manager,18,4
3,Sales Rep,19,2
4,Sales Rep,4,3
5,Manager,9,1
6,Cashier,2,4
7,Cashier,10,4
8,Cashier,16,5
9,Manager,17,1


In [10]:
dates = _dntk.execute_sql(
  'SELECT * \nFROM df_dates',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)
dates

Unnamed: 0,date,year,month,quarter,week,weekday,is_weekend
0,2022-01-01,2022,1,1,52,Saturday,True
1,2022-01-02,2022,1,1,52,Sunday,True
2,2022-01-03,2022,1,1,1,Monday,False
3,2022-01-04,2022,1,1,1,Tuesday,False
4,2022-01-05,2022,1,1,1,Wednesday,False
...,...,...,...,...,...,...,...
1091,2024-12-27,2024,12,4,52,Friday,False
1092,2024-12-28,2024,12,4,52,Saturday,True
1093,2024-12-29,2024,12,4,52,Sunday,True
1094,2024-12-30,2024,12,4,1,Monday,False


In [11]:
stores = _dntk.execute_sql(
  'SELECT * \nFROM df_stores',
  'SQL_DEEPNOTE_DATAFRAME_SQL',
  audit_sql_comment='',
  sql_cache_mode='cache_disabled',
  return_variable_type='dataframe'
)
stores

Unnamed: 0,store_id,store_name,store_type,region,city
0,1,Store 1,physical,North,South Brandon
1,2,Store 2,physical,North,Lake Nicholas
2,3,Store 3,online,South,North Curtisstad
3,4,Store 4,online,West,Vazqueztown
4,5,Store 5,online,West,New Michealshire
5,6,Store 6,online,West,Port Keithshire
6,7,Store 7,physical,East,North Jennifer
7,8,Store 8,physical,South,Braunshire
8,9,Store 9,physical,East,South Katelyn
9,10,Store 10,online,North,Wangshire


In [12]:
# customers['customer_decade'].value_counts()
customers.nunique()

customer_id                              2000
first_name                                462
last_name                                 691
gender                                      2
birth_date                               1903
signup_date                               906
city                                     1881
country                                   243
income_level                                3
customer_age                              522
customer_decade                             6
sign_up_year                                3
years_since_signup                          3
age_at_signup                              56
perc_income_level                           3
customers_per_country                      19
perc_customers_country_vs_total            11
customers_per_income_per_country           10
perc_customers_per_income_per_country      58
rank_customer_signup                       27
customers_cohort                            4
is_loyal                          

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d3ae251d-9bdf-4aa8-8817-46bba0993e76' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>