In [2]:
import sqlite3
import pandas as pd
import os

In [21]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_corporate_customers = """
   SELECT CustomerID, CustomerName, Segment FROM customers WHERE Segment = 'Corporate'
"""

cur.execute(select_corporate_customers)
rows = cur.fetchall()

corporate_df = pd.DataFrame(rows, columns=["CustomerID", "CustomerName", "Segment"])

cur.close()
conn.close()



In [22]:
corporate_df

Unnamed: 0,CustomerID,CustomerName,Segment
0,DV-13045,Darrin Van Huff,Corporate
1,KB-16585,Ken Black,Corporate
2,GH-14485,Gene Hale,Corporate
3,LC-16930,Linda Cazamias,Corporate
4,RA-19885,Ruben Ausman,Corporate
...,...,...,...
231,TC-21145,Theresa Coyne,Corporate
232,CM-12715,Craig Molinari,Corporate
233,FW-14395,Fred Wasserman,Corporate
234,HE-14800,Harold Engle,Corporate


## What is the category generating the maximum sales revenue? 

In [23]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_max_sales = """

SELECT p.Category, Sum(o_i.Sales) AS Profit
FROM products p
JOIN order_items o_i ON p.ProductID = o_i.ProductID
GROUP BY p.Category
ORDER BY Profit DESC
LIMIT 1

"""

cur.execute(select_max_sales)
rows = cur.fetchall()

max_sales_df = pd.DataFrame(rows, columns=["Category", "Sales"])

cur.close()
conn.close()

max_sales_df

Unnamed: 0,Category,Sales
0,Technology,836154.033


In [24]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_max_sales = """

SELECT p.Category
FROM products p
JOIN order_items o_i ON p.ProductID = o_i.ProductID
ORDER BY o_i.Profit DESC
LIMIT 1

"""

cur.execute(select_max_sales)
rows = cur.fetchall()

max_sales_df = pd.DataFrame(rows, columns=["Category"])

cur.close()
conn.close()

max_sales_df

Unnamed: 0,Category
0,Technology


### Profit in the technology category

In [25]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_technology_profit = """

SELECT p.Category, sum(o_i.Profit)
FROM products p
JOIN order_items o_i ON p.ProductID = o_i.ProductID
WHERE p.Category = 'Technology'

"""

cur.execute(select_technology_profit)
rows = cur.fetchall()

technology_profit_df = pd.DataFrame(rows, columns=["Category", "Profit"])

cur.close()
conn.close()

technology_profit_df

Unnamed: 0,Category,Profit
0,Technology,145454.9481


### Are they making a loss in any categories?

In [26]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_technology_subcategory = """

SELECT p.SubCategory, Max(o_i.Sales), o_i.Profit
FROM products p
JOIN order_items o_i ON p.ProductID = o_i.ProductID
WHERE p.Category = 'Technology'
GROUP BY p.SubCategory
"""

cur.execute(select_technology_subcategory)
rows = cur.fetchall()

technology_subcategory_df = pd.DataFrame(rows, columns=["SubCategory", "Sales", "Profit",])

cur.close()
conn.close()

technology_subcategory_df

Unnamed: 0,SubCategory,Sales,Profit
0,Accessories,3347.37,636.0003
1,Copiers,17499.95,8399.976
2,Machines,22638.48,-1811.0784
3,Phones,4548.81,1228.1787


## What are 5 states generating the maximum and minimum sales revenue?

In [27]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_state_max_sales = """

SELECT a.State, sum(o_i.Sales) AS Sales
FROM addresses a
JOIN customer_orders co ON a.CustomerID = co.CustomerID
JOIN order_items o_i ON co.OrderID = o_i.OrderID
GROUP BY a.State
ORDER BY o_i.Sales DESC
LIMIT 5


"""

cur.execute(select_state_max_sales)
rows = cur.fetchall()

state_max_sales_df = pd.DataFrame(rows, columns=["State","Sales"])

cur.close()
conn.close()

state_max_sales_df

Unnamed: 0,State,Sales
0,Washington,862298.7
1,New York,1859365.0
2,Colorado,254681.0
3,West Virginia,11405.48
4,South Carolina,86100.77


In [28]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_state_min_sales = """

SELECT a.State, o_i.Sales
FROM addresses a
JOIN customer_orders co ON a.CustomerID = co.CustomerID
JOIN order_items o_i ON co.OrderID = o_i.OrderID
GROUP BY a.State
ORDER BY o_i.Sales ASC
LIMIT 5


"""

cur.execute(select_state_min_sales)
rows = cur.fetchall()

state_min_sales_df = pd.DataFrame(rows, columns=["State","Sales"])

cur.close()
conn.close()

state_min_sales_df

Unnamed: 0,State,Sales
0,Indiana,8.56
1,Idaho,8.82
2,District of Columbia,10.824
3,California,14.62
4,Florida,14.62


## What are the 3 products in each product segment with the highest sales? 

In [29]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_top3_product = """

WITH ranked_product AS (
    SELECT 
        p.ProductName, 
        p.Category, 
        sum(o_i.Sales),
        RANK() OVER (PARTITION BY p.Category ORDER BY sum(o_i.Sales) DESC) AS sales_rank
    FROM products p
    JOIN order_items o_i ON o_i.ProductID = p.ProductID
    GROUP BY p.ProductName  
)
SELECT *
FROM ranked_product
WHERE sales_rank <= 3


"""

cur.execute(select_top3_product)
rows = cur.fetchall()

top3_product_df = pd.DataFrame(rows, columns=["ProductName", "Category", "Sales", "sales_rank"])

cur.close()
conn.close()

top3_product_df

Unnamed: 0,ProductName,Category,Sales,sales_rank
0,HON 5400 Series Task Chairs for Big and Tall,Furniture,21870.576,1
1,"Riverside Palais Royal Lawyers Bookcase, Royal...",Furniture,15610.9656,2
2,Bretford Rectangular Conference Table Tops,Furniture,12995.2915,3
3,Fellowes PB500 Electric Punch Plastic Comb Bin...,Office Supplies,27453.384,1
4,GBC DocuBind TL300 Electric Binding System,Office Supplies,19823.479,2
5,GBC Ibimaster 500 Manual ProClick Binding System,Office Supplies,19024.5,3
6,Canon imageCLASS 2200 Advanced Copier,Technology,61599.824,1
7,Cisco TelePresence System EX90 Videoconferenci...,Technology,22638.48,2
8,Hewlett Packard LaserJet 3310 Copier,Technology,18839.686,3


### Are they the 3 most profitable products as well?

In [30]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_top3_profit = """

WITH ranked_product AS (
    SELECT 
        p.ProductName, 
        p.Category, 
        sum(o_i.Profit),
        RANK() OVER (PARTITION BY p.Category ORDER BY o_i.Profit DESC) AS profit_rank
    FROM products p
    JOIN order_items o_i ON o_i.ProductID = p.ProductID 
    GROUP BY p.ProductName     
)
SELECT *
FROM ranked_product
WHERE profit_rank <= 3


"""

cur.execute(select_top3_profit)
rows = cur.fetchall()

top3_profit_df = pd.DataFrame(rows, columns=["ProductName", "Category", "Profit", "Profit_rank"])


top3_profit_df

Unnamed: 0,ProductName,Category,Profit,Profit_rank
0,Office Star - Professional Matrix Back Chair w...,Furniture,1305.6456,1
1,"Chromcraft 48"" x 96"" Racetrack Double Pedestal...",Furniture,-404.0064,2
2,Global Comet Stacking Arm Chair,Furniture,697.1415,3
3,Ibico EPK-21 Electric Binding System,Office Supplies,3345.2823,1
4,Fellowes PB500 Electric Punch Plastic Comb Bin...,Office Supplies,7753.039,2
5,GBC DocuBind P400 Electric Binding System,Office Supplies,-1878.1662,3
6,Canon imageCLASS 2200 Advanced Copier,Technology,25199.928,1
7,Ativa V4110MDD Micro-Cut Shredder,Technology,3772.9461,2
8,"3D Systems Cube Printer, 2nd Generation, Magenta",Technology,3717.9714,3


## What are the 3 best-seller products in each product segment? (Quantity-wise)

In [31]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_top3_seller = """

WITH best_seller AS (
    SELECT 
        p.ProductName, 
        p.Category, 
        sum(o_i.Quantity),
        ROW_NUMBER() OVER (PARTITION BY p.Category ORDER BY sum(o_i.Quantity) DESC) AS quantity_rank
    FROM products p
    JOIN order_items o_i ON o_i.ProductID = p.ProductID 
    GROUP BY p.ProductName     
)
SELECT *
FROM best_seller
WHERE quantity_rank <= 3


"""

cur.execute(select_top3_seller)
rows = cur.fetchall()

top3_seller_df = pd.DataFrame(rows, columns=["ProductName", "Category", "Quantity", "Quantity_rank"])


top3_seller_df

Unnamed: 0,ProductName,Category,Quantity,Quantity_rank
0,KI Adjustable-Height Table,Furniture,74,1
1,"Situations Contoured Folding Chairs, 4/Set",Furniture,64,2
2,Staple-based wall hangings,Furniture,62,3
3,Staples,Office Supplies,215,1
4,Staple envelope,Office Supplies,170,2
5,Easy-staple paper,Office Supplies,150,3
6,Logitech P710e Mobile Speakerphone,Technology,75,1
7,Logitech G19 Programmable Gaming Keyboard,Technology,60,2
8,Kingston Digital DataTraveler 16GB USB 2.0,Technology,57,3


## What are the top 3 worst-selling products in every category? (Quantity-wise)

In [32]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

select_flop3_seller = """

WITH best_seller AS (
    SELECT 
        p.ProductName, 
        p.Category, 
        sum(o_i.Quantity),
        ROW_NUMBER() OVER (PARTITION BY p.Category ORDER BY sum(o_i.Quantity) ASC) AS quantity_rank
    FROM products p
    JOIN order_items o_i ON o_i.ProductID = p.ProductID 
    GROUP BY p.ProductName     
)
SELECT *
FROM best_seller
WHERE quantity_rank <= 3


"""

cur = conn.cursor()  # Reopen the cursor
cur.execute(select_flop3_seller)
rows = cur.fetchall()

flop3_seller_df = pd.DataFrame(rows, columns=["ProductName", "Category", "Quantity", "Quantity_rank"])

cur.close()
conn.close()

flop3_seller_df

Unnamed: 0,ProductName,Category,Quantity,Quantity_rank
0,"Bush Saratoga Collection 5-Shelf Bookcase, Han...",Furniture,1,1
1,Global Enterprise Series Seating Low-Back Swiv...,Furniture,1,2
2,Barricks Non-Folding Utility Table with Steel ...,Furniture,2,3
3,Boston 1900 Electric Pencil Sharpener,Office Supplies,1,1
4,Xerox 20,Office Supplies,1,2
5,Avery 5,Office Supplies,2,3
6,Penpower WorldCard Pro Card Scanner,Technology,1,1
7,Canon imageCLASS MF7460 Monochrome Digital Las...,Technology,2,2
8,Hewlett-Packard Deskjet F4180 All-in-One Color...,Technology,2,3


## How many unique customers per month are there for the year 2016.

### Add year and month columns

In [33]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

# Add Year and Month columns to the customer_orders table
cur.execute("ALTER TABLE customer_orders ADD COLUMN Year INTEGER;")
cur.execute("ALTER TABLE customer_orders ADD COLUMN Month INTEGER;")

# Update the Year and Month columns based on the OrderDate
cur.execute("""
UPDATE customer_orders
SET
    Year = CAST(strftime('%Y', OrderDate) AS INTEGER),
    Month = CAST(strftime('%m', OrderDate) AS INTEGER);
""")


# Select the updated data
cur.execute("SELECT OrderDate, Year, Month FROM customer_orders")
rows = cur.fetchall()

# Create a DataFrame from the result
year_month_df = pd.DataFrame(rows, columns=["OrderDate", "Year", "Month"])

cur.close()
conn.close()

year_month_df

OperationalError: duplicate column name: Year

In [None]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

# Update the Year and Month columns based on the OrderDate
cur.execute("""
UPDATE customer_orders
SET
    Year = CAST(strftime('%Y', OrderDate) AS INTEGER),
    Month = CAST(strftime('%m', OrderDate) AS INTEGER);
""")

# Commit the changes
conn.commit()

# Select the updated data
cur.execute("SELECT OrderDate, Year, Month FROM customer_orders")
rows = cur.fetchall()

# Create a DataFrame from the result
year_month_df = pd.DataFrame(rows, columns=["OrderDate", "Year", "Month"])

cur.close()
conn.close()

year_month_df


Unnamed: 0,OrderDate,Year,Month
0,2016-11-08,2016,11
1,2016-06-12,2016,6
2,2015-10-11,2015,10
3,2014-06-09,2014,6
4,2017-04-15,2017,4
...,...,...,...
5004,2016-09-29,2016,9
5005,2017-11-17,2017,11
5006,2014-01-21,2014,1
5007,2017-02-26,2017,2


In [None]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

# Update the Year and Month columns based on the OrderDate
cur.execute("""
SELECT count(DISTINCT customerID) as Customer_Count, Year, Month
FROM customer_orders
WHERE Year = 2016
GROUP BY Month
""")

# Fetch the results of the query
rows = cur.fetchall()

# Create a DataFrame from the result
customers_2016_df = pd.DataFrame(rows, columns=["Customer_Count", "Year", "Month"])

cur.close()
conn.close()

customers_2016_df

Unnamed: 0,Customer_Count,Year,Month
0,46,2016,1
1,42,2016,2
2,80,2016,3
3,83,2016,4
4,96,2016,5
5,90,2016,6
6,89,2016,7
7,86,2016,8
8,176,2016,9
9,95,2016,10


### Join all

In [4]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

join_all = """
    SELECT 
        c.CustomerName,
        c.CustomerID,
        c.Segment,

        a.City,
        a.State,
        a.Region,
        a.Country,
        a.PostalCode,

        co.OrderID,
        co.OrderDate,
    
        s.ShipDate,
        s.ShipMode,

        oi.OrderID,
        oi.Sales,  
        oi.Discount,
        oi.Profit,     
        oi.Quantity,

        p.ProductName,
        p.Category,
        p.SubCategory

    FROM customer_orders co
    JOIN customers c ON co.CustomerID = c.CustomerID
    JOIN shipments s ON co.OrderID = s.OrderID
    JOIN addresses a ON s.AddressID = a.AddressID
    JOIN order_items oi ON co.OrderID = oi.OrderID
    JOIN products p ON oi.ProductID = p.ProductID
      """

cur.execute(join_all)
join_all_result = pd.DataFrame(cur.fetchall(), columns=["CustomerName", "CustomerID", "Segment", "City", "State",
                                                        "Region", "Country", "PostalCode", "OrderID", "OrderDate",
                                                        "ShipMode", "ShipDate", "Category", "SubCategory", "ProductID",
                                                        "ProductName", "Sales", "Discount", "Profit", "Quantity"])
join_all_result.to_csv("sqlized_superstore.csv", index=False)

#### Corrected Code (correct order)

In [3]:
conn = sqlite3.connect("superstore.db")
cur = conn.cursor()

join_all = """
    SELECT 
        c.CustomerName,
        c.CustomerID,
        c.Segment,

        a.City,
        a.State,
        a.Region,
        a.Country,
        a.PostalCode,

        co.OrderID,
        co.OrderDate,
    
        s.ShipDate,
        s.ShipMode,

        oi.Sales,  
        oi.Discount,
        oi.Profit,     
        oi.Quantity,

        p.ProductName,
        p.Category,
        p.SubCategory

    FROM customer_orders co
    JOIN customers c ON co.CustomerID = c.CustomerID
    JOIN shipments s ON co.OrderID = s.OrderID
    JOIN addresses a ON s.AddressID = a.AddressID
    JOIN order_items oi ON co.OrderID = oi.OrderID
    JOIN products p ON oi.ProductID = p.ProductID
"""

cur.execute(join_all)

join_all_result = pd.DataFrame(cur.fetchall(), columns=[
    "CustomerName", "CustomerID", "Segment",
    "City", "State", "Region", "Country", "PostalCode",
    "OrderID", "OrderDate",
    "ShipDate", "ShipMode",
    "Sales", "Discount", "Profit", "Quantity",
    "ProductName", "Category", "SubCategory"
])

join_all_result.to_csv("sqlized_superstore.csv", index=False, decimal=",")


In [4]:
df_from_csv = pd.read_csv("sqlized_superstore.csv")
print(df_from_csv)

          CustomerName CustomerID    Segment             City       State  \
0          Claire Gute   CG-12520   Consumer        Henderson    Kentucky   
1          Claire Gute   CG-12520   Consumer        Henderson    Kentucky   
2      Darrin Van Huff   DV-13045  Corporate      Los Angeles  California   
3       Sean O'Donnell   SO-20335   Consumer  Fort Lauderdale     Florida   
4       Sean O'Donnell   SO-20335   Consumer  Fort Lauderdale     Florida   
...                ...        ...        ...              ...         ...   
9989  Tom Boeckenhauer   TB-21400   Consumer            Miami     Florida   
9990       Dave Brooks   DB-13060   Consumer       Costa Mesa  California   
9991       Dave Brooks   DB-13060   Consumer       Costa Mesa  California   
9992       Dave Brooks   DB-13060   Consumer       Costa Mesa  California   
9993      Chris Cortes   CC-12220   Consumer      Westminster  California   

     Region        Country  PostalCode         OrderID   OrderDate  \
0    

In [6]:
df_from_csv.head()

Unnamed: 0,CustomerName,CustomerID,Segment,City,State,Region,Country,PostalCode,OrderID,OrderDate,ShipDate,ShipMode,Sales,Discount,Profit,Quantity,ProductName,Category,SubCategory
0,Claire Gute,CG-12520,Consumer,Henderson,Kentucky,South,United States,42420,CA-2016-152156,2016-11-08,2016-11-11,Second Class,26196,0,419136,2,Bush Somerset Collection Bookcase,Furniture,Bookcases
1,Claire Gute,CG-12520,Consumer,Henderson,Kentucky,South,United States,42420,CA-2016-152156,2016-11-08,2016-11-11,Second Class,73194,0,219582,3,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture,Chairs
2,Darrin Van Huff,DV-13045,Corporate,Los Angeles,California,West,United States,90036,CA-2016-138688,2016-06-12,2016-06-16,Second Class,1462,0,68714,2,Self-Adhesive Address Labels for Typewriters b...,Office Supplies,Labels
3,Sean O'Donnell,SO-20335,Consumer,Fort Lauderdale,Florida,South,United States,33311,US-2015-108966,2015-10-11,2015-10-18,Standard Class,9575775,45,-383031,5,Bretford CR4500 Series Slim Rectangular Table,Furniture,Tables
4,Sean O'Donnell,SO-20335,Consumer,Fort Lauderdale,Florida,South,United States,33311,US-2015-108966,2015-10-11,2015-10-18,Standard Class,22368,2,25164,2,Eldon Fold 'N Roll Cart System,Office Supplies,Storage
