In [2]:
import os
from dotenv import load_dotenv
import mysql.connector

# Create the target MySQL database (if it does not exist yet) using the
# credentials stored in the project's .env file.

# Load environment variables from .env file
load_dotenv()

# Read credentials
host = os.getenv("DB_HOST")
# int(None) raises TypeError, so fall back to MySQL's default port when
# DB_PORT is unset or empty.
port = int(os.getenv("DB_PORT") or 3306)
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
database = os.getenv("DB_NAME")

# Without this check a missing DB_NAME would be formatted into the statement
# as the literal text "None".
if not database:
    raise ValueError("DB_NAME is not set; add it to the environment or the .env file")

# Connect to the MySQL server (no default schema is selected here, because the
# database may not exist yet).
connection = mysql.connector.connect(
    host=host,
    user=user,
    password=password,
    port=port
)

try:
    cursor = connection.cursor()
    try:
        # Identifiers cannot be passed as bound parameters, so quote the name
        # with backticks and double any embedded backtick.
        quoted_database = "`" + database.replace("`", "``") + "`"

        # Create database if not exists
        cursor.execute(f"CREATE DATABASE IF NOT EXISTS {quoted_database}")
    finally:
        cursor.close()
finally:
    # The cells below talk to MySQL through their own SQLAlchemy engine, so
    # this bootstrap connection is released as soon as the DDL has run.
    connection.close()

print(f"Database '{database}' created successfully!")

Database 'ecommerce_db' created successfully!


### Download the dataset from Kaggle

In [6]:
import os
import shutil
import kagglehub

# Download the Olist "brazilian-ecommerce" dataset with kagglehub and place a
# copy of it under the project's local data directory.

# Step 1: Set local data directory
data_dir = "data"

# Create directory if it doesn't exist
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    print(f"Created directory: {data_dir}")

# Step 2: Download dataset (kagglehub returns the path of its own cached copy)
dataset_path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")
print("Downloaded to default cache path:", dataset_path)

# Step 3: Copy the downloaded files to our data directory
dest_path = os.path.join(data_dir, "brazilian-ecommerce")
if os.path.exists(dest_path):
    print(f"Destination path already exists: {dest_path}, removing old folder")
    shutil.rmtree(dest_path)  # overwrite the previous copy

# Copy rather than move: moving the directory takes the files out of
# kagglehub's cache folder, so re-running this cell would have to download the
# dataset again (or could be handed a path that no longer exists).
shutil.copytree(dataset_path, dest_path)
print(f"Dataset copied to: {dest_path}")

Downloaded to default cache path: /Users/hrs3413/.cache/kagglehub/datasets/olistbr/brazilian-ecommerce/versions/2
Dataset moved to: data/brazilian-ecommerce


In [10]:
import os
import pandas as pd
from sqlalchemy import create_engine

# Locations of the dataset CSV files and the SQLAlchemy engine used by the
# loading cells below.

# Base data directory
data_dir = "data/brazilian-ecommerce"

# CSV files
customers_csv = os.path.join(data_dir, "olist_customers_dataset.csv")
orders_csv = os.path.join(data_dir, "olist_orders_dataset.csv")
order_items_csv = os.path.join(data_dir, "olist_order_items_dataset.csv")
order_payments_csv = os.path.join(data_dir, "olist_order_payments_dataset.csv")
order_reviews_csv = os.path.join(data_dir, "olist_order_reviews_dataset.csv")
sellers_csv = os.path.join(data_dir, "olist_sellers_dataset.csv")
products_csv = os.path.join(data_dir, "olist_products_dataset.csv")
product_category_translation_csv = os.path.join(data_dir, "product_category_name_translation.csv")


# MySQL connection
# Credentials come from the environment / .env file (the same DB_* variables
# the database-creation cell reads) instead of being hardcoded in the notebook.
from dotenv import load_dotenv
from sqlalchemy.engine import URL

load_dotenv()

db_user = os.getenv("DB_USER")
if not db_user:
    raise ValueError("DB_USER is not set; add it to the environment or the .env file")

# URL.create escapes special characters in the user name / password, which a
# hand-written connection string does not.
engine = create_engine(
    URL.create(
        drivername="mysql+mysqlconnector",
        username=db_user,
        password=os.getenv("DB_PASSWORD"),
        host=os.getenv("DB_HOST") or "localhost",
        port=int(os.getenv("DB_PORT") or 3306),
        database=os.getenv("DB_NAME") or "ecommerce_db",
    )
)

In [9]:
# Customers: read the customers CSV and (re)create the 'customer' table from it
df_customers = pd.read_csv(customers_csv)
df_customers.to_sql(
    'customer',
    engine,
    if_exists='replace',  # drop and recreate the table on every run
    index=False,          # do not store the DataFrame index as a column
)
print("Data has been written to the 'customer' table")

Data has been written to the 'customer' table


In [11]:
# Load the four order-related CSV files into their own MySQL tables.
# Every table is dropped and recreated (if_exists='replace') on each run.

# Orders
# Read the orders file here; reading order_items_csv would fill the 'orders'
# table with a second copy of the order-items data.
df_orders = pd.read_csv(orders_csv)
df_orders.to_sql(name='orders', con=engine, index=False, if_exists='replace')
print("Data has been written to the 'orders' table")

# Order items
df_order_items = pd.read_csv(order_items_csv)
df_order_items.to_sql(name='order_items', con=engine, index=False, if_exists='replace')
print("Data has been written to the 'order_items' table")

# Order payments
df_order_payments = pd.read_csv(order_payments_csv)
df_order_payments.to_sql(name='order_payments', con=engine, index=False, if_exists='replace')
print("Data has been written to the 'order_payments' table")

# Order reviews
# Kept in its own variable so the payments DataFrame above is not overwritten.
df_order_reviews = pd.read_csv(order_reviews_csv)
df_order_reviews.to_sql(name='order_reviews', con=engine, index=False, if_exists='replace')
print("Data has been written to the 'order_reviews' table")

Data has been written to the 'orders' table
Data has been written to the 'order_items' table
Data has been written to the 'order_payments' table
Data has been written to the 'order_reviews' table


In [12]:
# Sellers: read the sellers CSV and (re)create the 'sellers' table from it
df_sellers = pd.read_csv(sellers_csv)
df_sellers.to_sql(
    'sellers',
    engine,
    if_exists='replace',  # drop and recreate the table on every run
    index=False,          # do not store the DataFrame index as a column
)
print("Data has been written to the 'sellers' table")

Data has been written to the 'sellers' table


In [13]:
# Load the products CSV and the product-category translation CSV into their
# own MySQL tables. Both tables are dropped and recreated on each run.

# Products
df_products = pd.read_csv(products_csv)
df_products.to_sql(name='products', con=engine, index=False, if_exists='replace')
print("Data has been written to the 'products' table")

# Product category translation
# Uses its own variable so the sellers DataFrame (df_sellers) is not clobbered.
df_product_category_translation = pd.read_csv(product_category_translation_csv)
df_product_category_translation.to_sql(name='product_category_translation', con=engine, index=False, if_exists='replace')
# The message names the table that was actually written (no "_csv" suffix).
print("Data has been written to the 'product_category_translation' table")


Data has been written to the 'products' table
Data has been written to the 'product_category_translation_csv' table
