In [30]:
import os
import re


from supabase import create_client, Client
# Build the shared Supabase client from the SUPABASE_URL / SUPABASE_KEY
# environment variables (os.getenv yields None for a variable that is not set).
_supabase_url = os.getenv("SUPABASE_URL")
_supabase_key = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(_supabase_url, _supabase_key)

In [31]:
import pandas as pd
from datetime import datetime

# Page through the whole "daily_report" table and collect every row.
all_data = []
batch_size = 1000
offset = 0

while True:
    # Sort by the `id` column so that consecutive .range() windows are stable;
    # without an explicit order the pages may overlap or skip rows.
    response = (
        supabase.table("daily_report")
        .select("*")
        .order("id")
        .range(offset, offset + batch_size - 1)  # both bounds are inclusive
        .execute()
    )
    batch = response.data
    if not batch:
        break
    all_data.extend(batch)
    if len(batch) < batch_size:
        # A short page means there is nothing left to fetch.
        break
    offset += batch_size

df = pd.DataFrame(all_data)


In [32]:
# Dimensions (rows, columns) of the frame built from the "daily_report" rows.
df.shape

(37916, 75)

In [33]:
# List the column labels of the loaded frame.
df.columns

Index(['id', 'Name JA partner', 'WH location', 'Type of WH (bonded/non)',
       'Container No.', 'Product type', 'Product reference', 'Port of Loading',
       'Port of destination', 'Inbound ref.', 'Import invoice', 'House B/l',
       'Bill of Lading', 'Shipping line', 'Vessel', 'ETD date POL',
       'ATD date POL', 'ETA date', 'ATA date', 'Import MRN', 'Import date',
       'Planned Inbound date', 'Inbound date',
       'Inbound duration days (Inbound date-ATA date+1)', 'Inbound Status',
       'Dev. Planned to Real in days (Inbound date-Planned inbound date',
       'Release date from port (ATA date)',
       'Contractual freetime for D&D combined', 'Free DM days', 'Free DT days',
       'Free DM days remained', 'Free DT days remained',
       'Container Returned date', 'Factory JASolar', 'Pallets', 'Piece',
       'Wattage', 'Total Wattage', 'MW', 'Stock Status', 'Stock age',
       'Release Number', 'Release type', 'Incoterm', 'Release date',
       'Internal Outbound ref', 'Ou

In [34]:
# Data cleaning and preprocessing

# 1. Remove rows where "Container No." is null or blank.
# An explicit copy is taken so that the column assignments further down work on
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
container_no = df["Container No."]
has_container_no = container_no.notnull() & (container_no.astype(str).str.strip() != "")
df = df[has_container_no].copy()

# 2. Remove all special characters and strip leading/trailing spaces from specified columns
def remove_special_chars(val):
    """Strip everything except ASCII letters, digits and spaces from ``val``.

    Null values (as judged by ``pd.isnull``) are handed back untouched. Any
    other value is converted with ``str()``, filtered, and finally trimmed of
    leading/trailing whitespace; spaces inside the text are kept.
    """
    if not pd.isnull(val):
        text = str(val)
        filtered = re.sub(r'[^A-Za-z0-9 ]+', '', text)
        return filtered.strip()
    return val

# Clean the identifier columns, skipping any the frame does not contain.
columns_to_clean = ["Container No.", "Release Number", "Import invoice"]
for col in [name for name in columns_to_clean if name in df.columns]:
    cleaned_values = df[col].apply(remove_special_chars)
    df[col] = cleaned_values

# 3. Add new column "Power" as 'Piece' * 'Wattage'.
# Values that cannot be read as numbers are coerced to NaN instead of raising.
if "Piece" in df.columns and "Wattage" in df.columns:
    df["Power"] = pd.to_numeric(df["Piece"], errors='coerce') * pd.to_numeric(df["Wattage"], errors='coerce')
else:
    # Use a float NaN placeholder rather than None: an object column of None
    # would make the division below raise TypeError.
    df["Power"] = float("nan")

# 4. Add new column "MegaWattage" as (Power / 10^6)
df["MegaWattage"] = df["Power"] / 1_000_000

# 5. Create "Ref1" as "Release Number" followed by "Container No." (in that order).
# NOTE(review): astype(str) turns missing values into text (e.g. "None"), so
# rows without a release number still receive a key - confirm this is intended.
if "Container No." in df.columns and "Release Number" in df.columns:
    df["Ref1"] = df["Release Number"].astype(str) + df["Container No."].astype(str)
else:
    df["Ref1"] = None

# 6. Create "Ref2" as "Release Number" + "Container No." + "Wattage".
# "Wattage" is part of the key, so the guard checks it too; without that check
# a frame lacking "Wattage" raised KeyError here.
if {"Container No.", "Release Number", "Wattage"}.issubset(df.columns):
    df["Ref2"] = df["Release Number"].astype(str) + df["Container No."].astype(str) + df["Wattage"].astype(str)
else:
    df["Ref2"] = None

# 7. Add Status column based on Inbound and Outbound dates

# Reference day (date only, taken from datetime.now()) for the outbound comparisons.
today = datetime.now().date()

def get_status(row):
    """Classify one row as "On Sea", "In-Stock" or "Outbounded".

    * no 'Outbound date' and no 'Inbound date'   -> "On Sea"
    * no 'Outbound date' but an 'Inbound date'   -> "In-Stock"
    * 'Outbound date' later than ``today``       -> "In-Stock"
    * any other 'Outbound date'                  -> "Outbounded"

    NOTE(review): an epoch (1970-01-01) outbound date gets no special
    treatment here and falls into the last case.
    """
    outbound_raw = row['Outbound date']
    if pd.isna(outbound_raw):
        return "On Sea" if pd.isna(row['Inbound date']) else "In-Stock"

    # Only the calendar date is compared; the time of day is dropped.
    outbound_day = pd.to_datetime(outbound_raw).date()
    still_to_leave = bool(outbound_day) and outbound_day > today
    return "In-Stock" if still_to_leave else "Outbounded"

# Evaluate get_status once per row and store the labels as a new column.
current_status_by_row = df.apply(get_status, axis=1)
df['Current_Status'] = current_status_by_row

# 8. Add Outbound status column based on Outbound date
def get_outbound_class(row):
    """Classify one row by its 'Outbound date'.

    Returns:
        "not-outbounded"   - the date is missing or is the Unix-epoch
                             placeholder (1970-01-01)
        "outbound-planned" - the date lies after the module-level ``today``
        "outbounded"       - any other date
    """
    raw_outbound = row['Outbound date']
    if pd.isna(raw_outbound):
        return "not-outbounded"

    # Parse before testing for the epoch placeholder: comparing an unparsed
    # string such as "1970-01-01" with pd.Timestamp(0) is always False, so the
    # placeholder was never recognised for string-typed dates.
    outbound_day = pd.to_datetime(raw_outbound).date()
    if outbound_day == pd.Timestamp(0).date():
        return "not-outbounded"
    if outbound_day > today:
        return "outbound-planned"
    return "outbounded"

# Evaluate get_outbound_class once per row and store the labels as a new column.
outbound_class_by_row = df.apply(get_outbound_class, axis=1)
df['Outbound_status'] = outbound_class_by_row

# 9. Add Release Status column based on Release date and Current status
def get_release_status(row):
    """Return "Released" or "Not released" for one row.

    A row counts as released when it carries a 'Release date' or when its
    'Current_Status' is "Outbounded".
    """
    has_release_date = not pd.isna(row['Release date'])
    # The status column is created as 'Current_Status' (capital S). The former
    # lookup of 'Current_status' always returned None, so this condition could
    # never be true.
    is_outbounded = row.get('Current_Status') == "Outbounded"
    return "Released" if has_release_date or is_outbounded else "Not released"

# Evaluate get_release_status once per row and store the labels as a new column.
release_status_by_row = df.apply(get_release_status, axis=1)
df['Release_Status'] = release_status_by_row


# 10. Add Delivery Status column based on Ref1 count:
# a Ref1 value found on more than one row is labelled "Partial_delivery",
# every other row "Full_delivery".
ref_occurrences = df['Ref1'].value_counts()
ref_counts = df['Ref1'].map(ref_occurrences)
df['Delivery_Status'] = (ref_counts > 1).map({True: "Partial_delivery", False: "Full_delivery"})


# Display the transformed DataFrame
# print() emits the shape as text; the bare `df` on the last line is what the
# notebook renders as the cell's output.
print(df.shape)
df

(37887, 83)


Unnamed: 0,id,Name JA partner,WH location,Type of WH (bonded/non),Container No.,Product type,Product reference,Port of Loading,Port of destination,Inbound ref.,...,Comments,created_at,Power,MegaWattage,Ref1,Ref2,Current_Status,Outbound_status,Release_Status,Delivery_Status
0,515237,Abreu Logistics - Spain,Malveira,Non bonded fixed,PONU8056439,Modules,JAM72D40-595/MB,Shanghai Yangshan (China),LISBON,PONU8056439,...,,2025-05-07T13:39:11.630056+00:00,428400.0,0.42840,50801000004PONU8056439,50801000004PONU8056439595,Outbounded,outbounded,Released,Full_delivery
1,515238,Abreu Logistics - Spain,Malveira,Non bonded fixed,TEMU7771387,Modules,JAM72D40-595/MB,Shanghai Yangshan (China),LISBON,TEMU7771387,...,,2025-05-07T13:39:11.630056+00:00,428400.0,0.42840,50801000013TEMU7771387,50801000013TEMU7771387595,Outbounded,outbounded,Released,Full_delivery
2,515239,Abreu Logistics - Spain,Malveira,Non bonded fixed,HASU4948783,Modules,JAM72D40-595/MB,Shanghai Yangshan (China),LISBON,HASU4948783,...,,2025-05-07T13:39:11.630056+00:00,428400.0,0.42840,50801000016HASU4948783,50801000016HASU4948783595,Outbounded,outbounded,Released,Full_delivery
3,515240,Abreu Logistics - Spain,Malveira,Non bonded fixed,TCLU8689640,Modules,JAM72D40-595/MB,Shanghai Yangshan (China),LISBON,TCLU8689640,...,,2025-05-07T13:39:11.630056+00:00,428400.0,0.42840,50801000010TCLU8689640,50801000010TCLU8689640595,Outbounded,outbounded,Released,Full_delivery
4,515241,Abreu Logistics - Spain,Malveira,Non bonded fixed,TGHU8612910,Modules,JAM72D40-595/MB,Shanghai Yangshan (China),LISBON,TGHU8612910,...,,2025-05-07T13:39:11.630056+00:00,428400.0,0.42840,50801000005TGHU8612910,50801000005TGHU8612910595,Outbounded,outbounded,Released,Full_delivery
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37910,553147,Seacon,,,EGHU9154340,,JAM54D40-450/LB,,,,...,,2025-05-07T13:42:00.718554+00:00,421200.0,0.42120,2411200016EGHU9154340,2411200016EGHU9154340450,Outbounded,outbounded,Released,Partial_delivery
37911,553148,Seacon,,,CBHU7041702,,JAM54D40-460/LB,,,,...,,2025-05-07T13:42:00.718554+00:00,430560.0,0.43056,2504170021CBHU7041702,2504170021CBHU7041702460,On Sea,not-outbounded,Released,Full_delivery
37912,553149,Seacon,,,OOLU8764131,,JAM54D40-460/LB,,,,...,,2025-05-07T13:42:00.718554+00:00,430560.0,0.43056,2504170021OOLU8764131,2504170021OOLU8764131460,On Sea,not-outbounded,Released,Full_delivery
37913,553150,Access World ES,Valencia,Bonded,CMAU9304061,Modules,JAM60D42-525/LB,Ningbo,Valencia,,...,,2025-05-07T13:42:00.718554+00:00,415800.0,0.41580,2411000001CMAU9304061,2411000001CMAU9304061525,Outbounded,outbounded,Released,Full_delivery


In [35]:
# 11. Flag rows whose Outbound_status is 'outbounded' and whose 'Outbound date'
# falls before 2025 (the flagged rows are the ones removed by the filter below).
def is_outbounded_after_2025(row, cutoff_year=2025):
    """Return True when ``row`` is 'outbounded' with an outbound year before ``cutoff_year``.

    Despite its name, the predicate is True for rows outbounded BEFORE the
    cutoff year; the name is kept so existing calls keep working.

    Args:
        row: mapping-like row offering ``.get()`` (a dict, or the Series that
            ``df.apply(..., axis=1)`` passes in).
        cutoff_year: first year that is NOT flagged. Defaults to 2025.

    Returns:
        bool: False for any other status, for a missing date, and for a date
        that cannot be parsed.
    """
    if row.get('Outbound_status') != 'outbounded':
        return False
    raw_outbound = row.get('Outbound date')
    if pd.isna(raw_outbound):
        return False
    try:
        # errors='coerce' maps unparseable text to NaT instead of raising.
        parsed = pd.to_datetime(raw_outbound, errors='coerce')
    except (TypeError, ValueError):
        # Best effort: a value of an unsupported type is simply not flagged.
        return False
    if pd.isna(parsed):
        return False
    return bool(parsed.year < cutoff_year)

# Keep only the rows the predicate does not flag.
flagged_for_removal = df.apply(is_outbounded_after_2025, axis=1)
df = df[~flagged_for_removal]

In [36]:
# 12. Update "Agreed Delivery date": blank it when its year is 2001, 2021 or 2024;
# a value that is already blank stays blank.
def fix_agreed_delivery_date(row):
    """Return the cleaned 'Agreed Delivery date' of one row.

    The result is "" when the value is missing, whitespace-only, or parses to
    a date in 2001, 2021 or 2024. In every other case (including values that
    cannot be parsed, or any error raised while checking) the stored value is
    returned unchanged.
    """
    blanked_years = (2001, 2021, 2024)
    try:
        current = row.get('Agreed Delivery date')
        # Missing or whitespace-only values are normalised to an empty string.
        if pd.isna(current) or not str(current).strip():
            return ""
        parsed = pd.to_datetime(current, errors='coerce')
        if pd.notna(parsed) and parsed.year in blanked_years:
            return ""
    except Exception:
        # Best effort: fall through and hand back the stored value.
        pass
    return row.get('Agreed Delivery date')

# Rewrite the column only when the frame actually has it.
if 'Agreed Delivery date' in df.columns:
    cleaned_agreed_dates = df.apply(fix_agreed_delivery_date, axis=1)
    df.loc[:, 'Agreed Delivery date'] = cleaned_agreed_dates

# 13. Remove 'Total Wattage' and 'MW' columns
df = df.drop(columns=['Total Wattage', 'MW'])
# Swap missing values for Python None (which becomes null in JSON).
# The upload cell repeats this conversion for every record before inserting.
df = df.where(df.notna(), None)

In [37]:
# Save the new DataFrame to output.csv, deleting any file left by an earlier run first.
output_path = "output.csv"
if os.path.exists(output_path):
    os.remove(output_path)

df.to_csv(output_path, index=False)

In [39]:
# NOTE: `df.columns` is not the last expression of the cell, so its value is
# evaluated and discarded; only the printed shape shows up in the output.
df.columns
print(df.shape)


(11948, 81)


In [41]:
# Normalise missing values to None before serialising the frame.
df = df.replace({float('nan'): None})
df = df.where(pd.notnull(df), None)

# One dict per row, uploaded in fixed-size chunks.
records = df.to_dict(orient="records")
batch_size = 500

for batch_no, start in enumerate(range(0, len(records), batch_size), start=1):
    try:
        # Per-record safety net: any value pandas still treats as missing
        # is replaced by None.
        batch = [
            {key: (None if pd.isna(value) else value) for key, value in record.items()}
            for record in records[start:start + batch_size]
        ]
        response = supabase.table("current_report").insert(batch).execute()
        if response.data:
            print(f"Batch {batch_no} inserted successfully.")
        else:
            print(f"Error inserting batch {batch_no}: {response.__dict__}")
    except Exception as e:
        # A failing batch is reported and the loop carries on with the next one.
        print(f"Exception inserting batch {batch_no}: {e}")

Batch 1 inserted successfully.
Batch 2 inserted successfully.
Batch 3 inserted successfully.
Batch 4 inserted successfully.
Batch 5 inserted successfully.
Batch 6 inserted successfully.
Batch 7 inserted successfully.
Batch 8 inserted successfully.
Batch 9 inserted successfully.
Batch 10 inserted successfully.
Batch 11 inserted successfully.
Batch 12 inserted successfully.
Batch 13 inserted successfully.
Batch 14 inserted successfully.
Batch 15 inserted successfully.
Batch 16 inserted successfully.
Batch 17 inserted successfully.
Batch 18 inserted successfully.
Batch 19 inserted successfully.
Batch 20 inserted successfully.
Batch 21 inserted successfully.
Batch 22 inserted successfully.
Batch 23 inserted successfully.
Batch 24 inserted successfully.


In [9]:
# Sanity check: list the rows whose "Agreed Delivery date" precedes 2025-01-01.
if 'Agreed Delivery date' not in df.columns:
    print("'Agreed Delivery date' column not found in DataFrame.")
else:
    # Values that cannot be parsed become NaT and therefore never match.
    agreed_dates = pd.to_datetime(df['Agreed Delivery date'], errors='coerce')
    mask = agreed_dates < pd.Timestamp('2025-01-01')
    has_early_dates = df[mask]
    print(f"Rows with 'Agreed Delivery date' before 2025-01-01: {len(has_early_dates)}")
    display(has_early_dates)  # For Jupyter, or use print(has_early_dates.head()) in scripts

Rows with 'Agreed Delivery date' before 2025-01-01: 0


Unnamed: 0,id,Name JA partner,WH location,Type of WH (bonded/non),Container No.,Product type,Product reference,Port of Loading,Port of destination,Inbound ref.,...,Comments,created_at,Power,MegaWattage,Ref1,Ref2,Current_Status,Outbound_status,Release_Status,Delivery_Status


In [113]:
# Dimensions (rows, columns) of the current frame.
df.shape

(11758, 81)