# FOOD WASTE MANAGEMENT

## DATA CLEANING

In [1]:
import pandas as pd
import numpy as np
import os

# Create output folder
os.makedirs("clean", exist_ok=True)
os.makedirs("rejects", exist_ok=True)

# === Step 1: Load raw CSVs ===
providers = pd.read_csv(r"C:\Users\junai\food_wastage_DB\providers_data.csv")
receivers = pd.read_csv(r"C:\Users\junai\food_wastage_DB\receivers_data.csv")
food_listings = pd.read_csv(r"C:\Users\junai\food_wastage_DB\food_listings_data.csv")
claims = pd.read_csv(r"C:\Users\junai\food_wastage_DB\claims_data.csv")

# === Step 2: Inspect ===
for name, df in {
    "Providers": providers,
    "Receivers": receivers,
    "Food Listings": food_listings,
    "Claims": claims
}.items():
    print(f"\n=== {name} ===")
    print(df.info())
    print(df.head())
    print(f"Rows: {len(df)} | Nulls:\n{df.isnull().sum()}")



=== Providers ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Provider_ID  1000 non-null   int64 
 1   Name         1000 non-null   object
 2   Type         1000 non-null   object
 3   Address      1000 non-null   object
 4   City         1000 non-null   object
 5   Contact      1000 non-null   object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB
None
   Provider_ID                         Name           Type  \
0            1             Gonzales-Cochran    Supermarket   
1            2  Nielsen, Johnson and Fuller  Grocery Store   
2            3                 Miller-Black    Supermarket   
3            4   Clark, Prince and Williams  Grocery Store   
4            5               Coleman-Farley  Grocery Store   

                                             Address            City  \
0  74347 Christopher Extensions\nAndreamouth, O

## EXTENDED DATA CLEANING

In [2]:
# === Step A: Convert data types ===
food_listings["Expiry_Date"] = pd.to_datetime(food_listings["Expiry_Date"], errors="coerce")
claims["Timestamp"] = pd.to_datetime(claims["Timestamp"], errors="coerce")

# === Step B: Standardize categorical values ===
# Food Type
food_listings["Food_Type"] = food_listings["Food_Type"].str.strip().str.title()
food_listings["Food_Type"] = food_listings["Food_Type"].replace({
    "Non-Vegetarian": "Non-Veg",
    "Vegetarian": "Veg"
})

# Meal Type
food_listings["Meal_Type"] = food_listings["Meal_Type"].str.strip().str.title()
valid_meals = {"Breakfast", "Lunch", "Dinner", "Snacks"}
food_listings = food_listings[food_listings["Meal_Type"].isin(valid_meals)]

# Receiver Type
receivers["Type"] = receivers["Type"].str.strip().str.title()

# Provider Type
providers["Type"] = providers["Type"].str.strip().str.title()

# === Step C: Remove invalid rows ===
rejects = {}

# Food Listings - invalid quantities
rejects["food_invalid_qty"] = food_listings[food_listings["Quantity"] <= 0]
food_listings = food_listings[food_listings["Quantity"] > 0]

# Expiry date parse failures
rejects["food_invalid_expiry"] = food_listings[food_listings["Expiry_Date"].isna()]
food_listings = food_listings.dropna(subset=["Expiry_Date"])

# Claims - invalid statuses
valid_status = {"Pending", "Completed", "Cancelled"}
rejects["claims_invalid_status"] = claims[~claims["Status"].isin(valid_status)]
claims = claims[claims["Status"].isin(valid_status)]

# === Step D: Referential Integrity ===
# Provider IDs
invalid_provider_links = food_listings[~food_listings["Provider_ID"].isin(providers["Provider_ID"])]
rejects["invalid_provider_links"] = invalid_provider_links
food_listings = food_listings[food_listings["Provider_ID"].isin(providers["Provider_ID"])]

# Food IDs in claims
invalid_food_links = claims[~claims["Food_ID"].isin(food_listings["Food_ID"])]
rejects["invalid_food_links"] = invalid_food_links
claims = claims[claims["Food_ID"].isin(food_listings["Food_ID"])]

# Receiver IDs in claims
invalid_receiver_links = claims[~claims["Receiver_ID"].isin(receivers["Receiver_ID"])]
rejects["invalid_receiver_links"] = invalid_receiver_links
claims = claims[claims["Receiver_ID"].isin(receivers["Receiver_ID"])]

# === Step E: Export Clean Data ===
providers.to_csv("clean/providers.csv", index=False)
receivers.to_csv("clean/receivers.csv", index=False)
food_listings.to_csv("clean/food_listings.csv", index=False)
claims.to_csv("clean/claims.csv", index=False)

# Export rejects
for name, df in rejects.items():
    if not df.empty:
        df.to_csv(f"rejects/{name}.csv", index=False)

print("✅ Cleaning complete. Clean files saved in 'clean/' and rejects in 'rejects/'.")


✅ Cleaning complete. Clean files saved in 'clean/' and rejects in 'rejects/'.


## DATA CLEANING SUMMARY

In [3]:
# === Cleaning Summary Report ===

summary = {
    "Providers": {
        "Before": 1000,
        "After": len(providers)
    },
    "Receivers": {
        "Before": 1000,
        "After": len(receivers)
    },
    "Food Listings": {
        "Before": 1000,
        "After": len(food_listings)
    },
    "Claims": {
        "Before": 1000,
        "After": len(claims)
    }
}

print("=== Cleaning Summary ===")
for table, counts in summary.items():
    print(f"{table}: {counts['Before']} → {counts['After']} (Removed {counts['Before'] - counts['After']})")

print("\n=== Reject Files Saved ===")
for name, df in rejects.items():
    print(f"{name}: {len(df)} rows")


=== Cleaning Summary ===
Providers: 1000 → 1000 (Removed 0)
Receivers: 1000 → 1000 (Removed 0)
Food Listings: 1000 → 1000 (Removed 0)
Claims: 1000 → 1000 (Removed 0)

=== Reject Files Saved ===
food_invalid_qty: 0 rows
food_invalid_expiry: 0 rows
claims_invalid_status: 0 rows
invalid_provider_links: 0 rows
invalid_food_links: 0 rows
invalid_receiver_links: 0 rows
