In [None]:
import sqlite3
import pandas as pd
import os

# Rebuild superstore.db from scratch: apply schema.sql, then load
# cleaned_data.csv into the normalized tables.
#
# Changes versus the earlier version of this cell:
#   * AddressID is no longer read from the CSV frame. The addresses table is
#     populated without an AddressID column, so the IDs are looked up from the
#     database after the addresses insert and joined back onto the shipments.
#   * The connection is closed in a `finally` block, so a failed run does not
#     leave superstore.db open (an open handle can block os.remove on the next run).

# Step 1: Delete old database if it exists
db_path = "superstore.db"
if os.path.exists(db_path):
    os.remove(db_path)
    print("Old database deleted.")

# Columns that together identify one address row.
address_cols = ['Country', 'City', 'State', 'PostalCode', 'Region']

# Step 2: Recreate the database and load the schema
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Enable foreign keys
    cursor.execute("PRAGMA foreign_keys = ON;")

    # Load schema.sql and execute it
    with open("schema.sql", "r") as f:
        schema_sql = f.read()
    cursor.executescript(schema_sql)
    print("Database and tables created.")

    # Step 3: Load the cleaned CSV
    df = pd.read_csv("cleaned_data.csv")

    # Step 4: Split data into tables
    customers_df = df[['CustomerID', 'CustomerName', 'Segment']].drop_duplicates()
    addresses_df = df[address_cols].drop_duplicates()
    # Products are de-duplicated on ProductID only: the first row seen for an ID wins.
    products_df = df[['ProductID', 'Category', 'SubCategory', 'ProductName']].drop_duplicates(subset=['ProductID'])
    orders_df = df[['OrderID', 'CustomerID', 'OrderDate']].drop_duplicates()
    order_items_df = df[['OrderID', 'ProductID', 'Sales', 'Quantity', 'Discount', 'Profit']].copy()

    # Step 5: Insert data into tables (shipments last; it needs the address IDs)
    customers_df.to_sql("customers", conn, if_exists="append", index=False)
    addresses_df.to_sql("addresses", conn, if_exists="append", index=False)
    products_df.to_sql("products", conn, if_exists="append", index=False)
    orders_df.to_sql("customer_orders", conn, if_exists="append", index=False)
    order_items_df.to_sql("order_items", conn, if_exists="append", index=False)

    # Read the addresses back together with their AddressID values.
    addresses_in_db = pd.read_sql_query(
        "SELECT AddressID, Country, City, State, PostalCode, Region FROM addresses",
        conn,
    )

    shipment_keys_df = df[['OrderID', 'ShipDate', 'ShipMode'] + address_cols].drop_duplicates()

    # Compare PostalCode as text on both sides so a dtype difference between
    # the CSV and the database does not break the join.
    shipment_keys_df['PostalCode'] = shipment_keys_df['PostalCode'].astype(str)
    addresses_in_db['PostalCode'] = addresses_in_db['PostalCode'].astype(str)

    shipment_keys_df = shipment_keys_df.merge(addresses_in_db, on=address_cols, how='left')

    # A left merge leaves AddressID as NaN for rows that found no address;
    # stop here rather than insert shipments without an address reference.
    unresolved = shipment_keys_df['AddressID'].isna()
    if unresolved.any():
        raise ValueError(
            f"{int(unresolved.sum())} shipment row(s) could not be matched to an address."
        )

    shipments_df = shipment_keys_df[['OrderID', 'AddressID', 'ShipDate', 'ShipMode']]
    shipments_df.to_sql("shipments", conn, if_exists="append", index=False)

    conn.commit()
finally:
    # Always release the file handle, even when a step above fails.
    conn.close()

print("✅ All data inserted successfully.")


Old database deleted.
Database and tables created.
✅ All data inserted successfully.


## Recreate Shipping and Address tables

### Connect to the database and drop the old tables

In [None]:
# import sqlite3

# # Connect to your database
# conn = sqlite3.connect("superstore.db")
# cursor = conn.cursor()

# # Enable foreign keys
# cursor.execute("PRAGMA foreign_keys = ON;")

# # Step 1: Drop old tables
# cursor.execute("DROP TABLE IF EXISTS addresses;")
# cursor.execute("DROP TABLE IF EXISTS shipments;")
# print("Old 'addresses' and 'shipments' tables deleted.")

# conn.commit()


Old 'addresses' and 'shipments' tables deleted.


### Recreate the tables with AddressID

In [None]:
# import pandas as pd

# # Reload cleaned CSV
# df = pd.read_csv("cleaned_data.csv")

# # 1. Create AddressCombi
# df["AddressCombi"] = df.apply(lambda x: f"{x['City']} {x['State']} {x['Region']} {x['Country']} {x['PostalCode']}", axis=1)

# # 2. Create AddressID
# df["AddressID"] = pd.factorize(df["AddressCombi"])[0] + 1  # IDs start at 1

# # 3. Prepare new tables
# addresses_df = df[['AddressID', 'CustomerID', 'Country', 'City', 'State', 'PostalCode', 'Region']].drop_duplicates()
# shipments_df = df[['OrderID', 'ShipDate', 'ShipMode']].drop_duplicates()


# # 4. Insert into new tables
# addresses_df.to_sql("addresses", conn, if_exists="replace", index=False)
# shipments_df.to_sql("shipments", conn, if_exists="replace", index=False)

# conn.commit()
# conn.close()

# print("✅ 'addresses' and 'shipments' tables recreated and reinserted successfully.")


✅ 'addresses' and 'shipments' tables recreated and reinserted successfully.


### Add AddressID to shipments

<!-- ### Add AddressID to shipments -->

In [None]:
# import sqlite3
# import pandas as pd

# # Connect to the database again
# conn = sqlite3.connect("superstore.db")
# cursor = conn.cursor()

# # Enable foreign keys
# cursor.execute("PRAGMA foreign_keys = ON;")

# # Step 1: Drop the current shipments table
# cursor.execute("DROP TABLE IF EXISTS shipments;")
# print("Old 'shipments' table deleted.")

# # Step 2: Reload cleaned CSV
# df = pd.read_csv("cleaned_data.csv")

# # Step 3: Recreate AddressID
# df["AddressCombi"] = df.apply(lambda x: f"{x['City']} {x['State']} {x['Region']} {x['Country']} {x['PostalCode']}", axis=1)
# df["AddressID"] = pd.factorize(df["AddressCombi"])[0] + 1

# # Step 4: Recreate the shipments table, this time with AddressID
# shipments_df = df[['OrderID', 'ShipDate', 'ShipMode', 'AddressID']].drop_duplicates()

# # Step 5: Insert into the database
# shipments_df.to_sql("shipments", conn, if_exists="replace", index=False)

# conn.commit()
# conn.close()

# print("✅ 'shipments' table recreated successfully with AddressID!")


Old 'shipments' table deleted.
✅ 'shipments' table recreated successfully with AddressID!


### Verify that all AddressID values in your shipments table actually exist in the addresses table

In [None]:
# import sqlite3
# import pandas as pd

# # Connect to your database
# conn = sqlite3.connect("superstore.db")

# # Query to check if any AddressID in shipments is missing in addresses
# query = """
# SELECT s.OrderID, s.AddressID
# FROM shipments s
# LEFT JOIN addresses a ON s.AddressID = a.AddressID
# WHERE a.AddressID IS NULL;
# """

# missing_address_ids = pd.read_sql_query(query, conn)

# if missing_address_ids.empty:
#     print("✅ All AddressID values in 'shipments' are valid and match 'addresses'.")
# else:
#     print("❌ Found mismatches! Here are shipments with missing AddressIDs:")
#     print(missing_address_ids)

# conn.close()


✅ All AddressID values in 'shipments' are valid and match 'addresses'.


## Recreate database

### Clean Data to Ensure Uniqueness

In [None]:
# # Make sure AddressID is unique in the DataFrame
# addresses_df = addresses_df.drop_duplicates(subset=['AddressID'])

# # Insert into the addresses table
# addresses_df.to_sql("addresses", conn, if_exists="append", index=False)


632

### Reset AddressID Sequence (Optional)

In [None]:
# cursor.execute("DELETE FROM addresses")  # Delete all existing data
# cursor.execute("UPDATE sqlite_sequence SET seq = 0 WHERE name = 'addresses'")  # Reset AUTOINCREMENT counter


<sqlite3.Cursor at 0x238303e46c0>

In [None]:
# import sqlite3
# import pandas as pd
# import os

# # Step 1: Delete old database if it exists
# db_path = "superstore.db"
# if os.path.exists(db_path):
#     try:
#         # Close any existing connection to the database
#         if 'conn' in locals() and conn:
#             conn.close()
#         os.remove(db_path)
#         print("🗑️ Old database deleted.")
#     except PermissionError as e:
#         print(f"❌ Could not delete the database file: {e}")

# # Step 2: Recreate new database and tables
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()

# # Enable foreign keys (important!)
# cursor.execute("PRAGMA foreign_keys = ON;")

# # Load and execute updated schema.sql
# with open("schema.sql", "r") as f:
#     schema_sql = f.read()
# cursor.executescript(schema_sql)
# print("🏗️ New database and tables created.")

# # Step 3: Load cleaned CSV
# df = pd.read_csv("cleaned_data.csv")

# # Step 4: Prepare AddressID
# df["AddressCombi"] = df.apply(lambda x: f"{x['City']} {x['State']} {x['Region']} {x['Country']} {x['PostalCode']}", axis=1)
# df["AddressID"] = pd.factorize(df["AddressCombi"])[0] + 1  # IDs start at 1

# # Step 5: Create DataFrames
# customers_df = df[['CustomerID', 'CustomerName', 'Segment']].drop_duplicates()
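# # NOTE: CustomerID is part of the de-duplication key here, so an AddressID shared by
# # several customers survives as multiple rows. This is the likely cause of the
# # "UNIQUE constraint failed: addresses.AddressID" error raised by the insert below.
# addresses_df = df[['AddressID', 'CustomerID', 'Country', 'City', 'State', 'PostalCode', 'Region']].drop_duplicates()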
# products_df = df[['ProductID', 'Category', 'SubCategory', 'ProductName']].drop_duplicates(subset=['ProductID'])
# orders_df = df[['OrderID', 'CustomerID', 'OrderDate']].drop_duplicates()
# order_items_df = df[['OrderID', 'ProductID', 'Sales', 'Quantity', 'Discount', 'Profit']].copy()
# shipments_df = df[['OrderID', 'ShipDate', 'ShipMode', 'AddressID']].drop_duplicates()

# # Step 6: Insert into tables
# customers_df.to_sql("customers", conn, if_exists="append", index=False)
# addresses_df.to_sql("addresses", conn, if_exists="append", index=False)
# products_df.to_sql("products", conn, if_exists="append", index=False)
# orders_df.to_sql("customer_orders", conn, if_exists="append", index=False)
# order_items_df.to_sql("order_items", conn, if_exists="append", index=False)
# shipments_df.to_sql("shipments", conn, if_exists="append", index=False)

# conn.commit()
# conn.close()

# print("✅ All data inserted successfully.")

🗑️ Old database deleted.
🏗️ New database and tables created.


IntegrityError: UNIQUE constraint failed: addresses.AddressID

In [None]:
# # Re-check for Duplicates
# addresses_df = addresses_df.drop_duplicates(subset=['AddressID'])


In [None]:
# # Check the unique AddressCombi values
# print(df["AddressCombi"].nunique())  # This should show a large number of unique addresses

# # Count the rows whose AddressID already appeared on an earlier row of df
# print(df["AddressID"].duplicated().sum())  # Non-zero is expected: df is row-level, so rows sharing an address repeat its ID


632
9362


In [None]:
# import sqlite3
# import pandas as pd
# import os

# # Step 1: Delete old database if it exists
# db_path = "superstore.db"
# if os.path.exists(db_path):
#     try:
#         if 'conn' in locals() and conn:
#             conn.close()
#         os.remove(db_path)
#         print("🗑️ Old database deleted.")
#     except PermissionError as e:
#         print(f"❌ Could not delete the database file: {e}")

# # Step 2: Recreate new database and tables
# conn = sqlite3.connect(db_path)
# cursor = conn.cursor()

# # Enable foreign keys (important!)
# cursor.execute("PRAGMA foreign_keys = ON;")

# # Load and execute updated schema.sql
# with open("schema.sql", "r") as f:
#     schema_sql = f.read()
# cursor.executescript(schema_sql)
# print("🏗️ New database and tables created.")

# # Step 3: Load cleaned CSV
# df = pd.read_csv("cleaned_data.csv")

# # Step 4: Prepare AddressCombi and AddressID
# df["AddressCombi"] = df.apply(lambda x: f"{x['City']} {x['State']} {x['Region']} {x['Country']} {x['PostalCode']}", axis=1)

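# # Force AddressID to be unique by numbering the CSV rows.
# # NOTE: this gives every row its own AddressID, so identical addresses are no longer de-duplicated.
# df["AddressID"] = df.index + 1  # Using the index to generate a unique AddressID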

# # Step 5: Create DataFrames
# # Ensure there are no duplicate CustomerID values
# customers_df = df[['CustomerID', 'CustomerName', 'Segment']].drop_duplicates(subset=['CustomerID'])
# products_df = df[['ProductID', 'Category', 'SubCategory', 'ProductName']].drop_duplicates(subset=['ProductID'])
# orders_df = df[['OrderID', 'CustomerID', 'OrderDate']].drop_duplicates()
# order_items_df = df[['OrderID', 'ProductID', 'Sales', 'Quantity', 'Discount', 'Profit']].copy()
# # Aggregate shipments_df to ensure unique OrderID values
# shipments_df = df.groupby('OrderID').agg({
#     'ShipDate': 'first',  # Use the first ShipDate for each OrderID
#     'ShipMode': 'first',  # Use the first ShipMode for each OrderID
#     'AddressID': 'first'  # Use the first AddressID for each OrderID
# }).reset_index()

# # Step 6: Insert into tables
# customers_df.to_sql("customers", conn, if_exists="append", index=False)
# addresses_df = df[['AddressID', 'CustomerID', 'Country', 'City', 'State', 'PostalCode', 'Region']].drop_duplicates()
# addresses_df.to_sql("addresses", conn, if_exists="append", index=False)
# products_df.to_sql("products", conn, if_exists="append", index=False)
# orders_df.to_sql("customer_orders", conn, if_exists="append", index=False)
# order_items_df.to_sql("order_items", conn, if_exists="append", index=False)
# shipments_df.to_sql("shipments", conn, if_exists="append", index=False)

# # Step 7: Commit and close connection
# conn.commit()
# conn.close()

# print("✅ All data inserted successfully.")


🗑️ Old database deleted.
🏗️ New database and tables created.
✅ All data inserted successfully.


In [1]:
import sqlite3
import pandas as pd
import os

# Rebuild superstore.db from scratch: apply schema.sql, load cleaned_data.csv
# into the normalized tables, then attach each shipment to the AddressID that
# the database assigned to its address.

# Step 1: Delete old database if it exists
db_path = "superstore.db"
if os.path.exists(db_path):
    os.remove(db_path)
    print("Old database deleted.")

# Columns that together identify one address row; used both to build the
# addresses table and to join shipments back to it.
address_cols = ['Country', 'City', 'State', 'PostalCode', 'Region']

# Step 2: Recreate the database and load the schema
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Enable foreign keys
    cursor.execute("PRAGMA foreign_keys = ON;")

    # Load schema.sql and execute it
    with open("schema.sql", "r") as f:
        schema_sql = f.read()
    cursor.executescript(schema_sql)
    print("Database and tables created.")

    # Step 3: Load the cleaned CSV
    df = pd.read_csv("cleaned_data.csv")

    # Step 4: Split data into tables
    customers_df = df[['CustomerID', 'CustomerName', 'Segment']].drop_duplicates()
    addresses_df = df[address_cols].drop_duplicates()
    # Products are de-duplicated on ProductID only: the first row seen for an ID wins.
    products_df = df[['ProductID', 'Category', 'SubCategory', 'ProductName']].drop_duplicates(subset=['ProductID'])
    orders_df = df[['OrderID', 'CustomerID', 'OrderDate']].drop_duplicates()
    order_items_df = df[['OrderID', 'ProductID', 'Sales', 'Quantity', 'Discount', 'Profit']].copy()

    # Step 5: Insert data into tables except shipments (for now)
    customers_df.to_sql("customers", conn, if_exists="append", index=False)
    addresses_df.to_sql("addresses", conn, if_exists="append", index=False)
    products_df.to_sql("products", conn, if_exists="append", index=False)
    orders_df.to_sql("customer_orders", conn, if_exists="append", index=False)
    order_items_df.to_sql("order_items", conn, if_exists="append", index=False)

    # Step 6: Now prepare the shipments table
    # Fetch addresses with their newly assigned AddressID
    addresses_in_db = pd.read_sql_query(
        "SELECT AddressID, Country, City, State, PostalCode, Region FROM addresses",
        conn,
    )

    # One candidate row per distinct (order, ship info, address) combination
    shipments_df = df[['OrderID', 'ShipDate', 'ShipMode'] + address_cols].drop_duplicates()

    # Compare PostalCode as text on both sides so a dtype difference between
    # the CSV and the database does not break the join.
    shipments_df['PostalCode'] = shipments_df['PostalCode'].astype(str)
    addresses_in_db['PostalCode'] = addresses_in_db['PostalCode'].astype(str)

    # Merge shipments with addresses_in_db to get AddressID
    shipments_df = shipments_df.merge(addresses_in_db, on=address_cols, how='left')

    # A left merge leaves AddressID as NaN for rows that found no address;
    # stop here rather than insert shipments without an address reference.
    unresolved = shipments_df['AddressID'].isna()
    if unresolved.any():
        raise ValueError(
            f"{int(unresolved.sum())} shipment row(s) could not be matched to an address."
        )

    # Keep only the columns that are inserted into the 'shipments' table
    shipments_final_df = shipments_df[['OrderID', 'ShipDate', 'ShipMode', 'AddressID']]

    # Insert into shipments
    shipments_final_df.to_sql("shipments", conn, if_exists="append", index=False)

    conn.commit()
finally:
    # Always release the file handle, even when a step above fails; an open
    # handle can prevent os.remove from deleting the file on the next run.
    conn.close()

print("✅ All data inserted successfully.")


Old database deleted.
Database and tables created.
✅ All data inserted successfully.
