In [None]:
import os
import re

import pandas as pd
import requests

def process_type_debris(data):
    """Normalise the ``type_debris`` column of ``data``.

    Missing entries are labelled ``"Unknown"`` and cells that are exactly
    ``"Mix"`` are renamed to ``"Mix/Other"``. The column is overwritten on
    the DataFrame that was passed in, and that same DataFrame is returned.
    """
    debris = data['type_debris'].fillna('Unknown')

    # Series.replace with scalar arguments only touches whole-cell matches,
    # so values such as "Mixed" are left as they are.
    data['type_debris'] = debris.replace('Mix', 'Mix/Other')

    return data

In [None]:
def process_waste_quantity(data):
    """Normalise the ``waste_quantity`` column of ``data``.

    Each cell ends up as one of:

    * the original text, when it is a number (optionally with a decimal
      part) followed by a single space and ``Cubic Yards`` or ``Tons``;
    * ``"Unknown"``, when the cell was missing (or already ``"Unknown"``);
    * ``"Unconfirmed"``, for anything else, including non-string values.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    # Used with fullmatch(), so no anchors are needed and a trailing newline
    # (which "$" alone would tolerate) is rejected.
    valid_quantity = re.compile(r'\d+(?:\.\d+)? (?:Cubic Yards|Tons)')

    def _label(value):
        # Non-string cells (e.g. a bare number) cannot be fed to the regex;
        # they are by definition not in the expected "<number> <unit>" form.
        if not isinstance(value, str):
            return "Unconfirmed"
        if value == "Unknown" or valid_quantity.fullmatch(value):
            return value
        return "Unconfirmed"

    data['waste_quantity'] = data['waste_quantity'].fillna('Unknown').apply(_label)

    return data

In [None]:
# Load the first-stage CSV and run both column clean-ups on it.
file_path = 'cdw_csv_processed_1st.csv'
data = (
    pd.read_csv(file_path)
    .pipe(process_type_debris)
    .pipe(process_waste_quantity)
)

# Work on an independent copy of rows 1000-1999 (positional, end-exclusive).
# NOTE(review): the slice bounds look like a manual batch window — confirm
# before re-running on a different range.
data_copy = data.iloc[1000:2000].copy()

def validate_address(full_address, api_key=None, timeout=10):
    """Validate one address with the Google Address Validation API.

    Args:
        full_address: Single-line address string sent as the only entry of
            ``addressLines``.
        api_key: Google API key. When omitted, the key is read from the
            ``GOOGLE_MAPS_API_KEY`` environment variable so that no
            credential has to live in the notebook.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(latitude, longitude, status)`` tuple. ``status`` is
        ``'confirmed'`` when the response lists at least one address
        component and every component has confirmation level ``CONFIRMED``,
        ``'unconfirmed'`` otherwise, and ``'error'`` (with ``None``
        coordinates) when the request fails, times out, returns a non-200
        status or a body that is not JSON. Coordinates are ``None`` when the
        response carries no geocode.

    Raises:
        RuntimeError: If no API key is passed and the environment variable
            is unset or empty.
    """
    url = "https://addressvalidation.googleapis.com/v1:validateAddress"

    # SECURITY: never commit the key. Any key that was previously embedded in
    # this notebook should be treated as leaked and revoked.
    if api_key is None:
        api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
    if not api_key:
        # Fail loudly once instead of silently marking every row as 'error'.
        raise RuntimeError(
            "No Google API key available: pass api_key=... or set the "
            "GOOGLE_MAPS_API_KEY environment variable."
        )

    headers = {'Content-Type': 'application/json'}
    payload = {"address": {"addressLines": [full_address]}}

    try:
        response = requests.post(
            url,
            params={"key": api_key},
            json=payload,
            headers=headers,
            timeout=timeout,  # without this a stalled connection blocks forever
        )
        if response.status_code != 200:
            return None, None, 'error'
        body = response.json()
    except (requests.RequestException, ValueError):
        # Connection problems, timeouts and undecodable bodies are per-row
        # failures; they must not abort a whole DataFrame.apply run.
        return None, None, 'error'

    # Every level of the response is optional from this function's point of
    # view, so missing pieces degrade to None / 'unconfirmed' instead of
    # raising KeyError.
    result = body.get('result') or {}
    location = (result.get('geocode') or {}).get('location') or {}
    components = (result.get('address') or {}).get('addressComponents') or []

    # all() of an empty list is True, so require at least one component.
    all_confirmed = bool(components) and all(
        comp.get('confirmationLevel') == 'CONFIRMED' for comp in components
    )
    confidence_status = 'confirmed' if all_confirmed else 'unconfirmed'

    return location.get('latitude'), location.get('longitude'), confidence_status

def update_address_data(row, address_type):
    """Return latitude, longitude and geocode confidence for one row.

    ``address_type`` selects which group of columns is read: ``'pickup'``
    uses the ``pickup_*`` result columns, any other value uses the
    ``receiving_*`` ones. The address text itself is always read from the
    ``<address_type>_address`` / ``_city`` / ``_state`` / ``_zip`` columns.

    When the row already has both coordinates, the three existing values are
    returned unchanged; otherwise the address is sent to
    ``validate_address`` and its result is returned. Either way the result
    is a three-item Series keyed by the result column names.
    """
    # Pick the three result columns for this address group.
    lat_col, lng_col, confidence_col = (
        ('pickup_lat', 'pickup_lng', 'pickup_geocode_confidence')
        if address_type == 'pickup'
        else ('receiving_lat', 'receiving_lng', 'receiving_geocode_confidence')
    )

    # Assemble "street, city, state zip".
    street = row[address_type + '_address']
    city = row[address_type + '_city']
    state = row[address_type + '_state']
    zip_code = row[address_type + '_zip']
    full_address = f"{street}, {city}, {state} {zip_code}"

    needs_lookup = not (pd.notnull(row[lat_col]) and pd.notnull(row[lng_col]))
    if needs_lookup:
        lat, lng, confidence = validate_address(full_address)
        return pd.Series({lat_col: lat, lng_col: lng, confidence_col: confidence})

    # Coordinates already present: pass the stored values straight through.
    return pd.Series({
        lat_col: row[lat_col],
        lng_col: row[lng_col],
        confidence_col: row[confidence_col],
    })

# Geocode the pickup addresses, then the receiving addresses. Each call
# yields one three-column frame that is written back over the matching
# columns of data_copy.
data_copy[['pickup_lat', 'pickup_lng', 'pickup_geocode_confidence']] = data_copy.apply(
    update_address_data, axis=1, address_type='pickup'
)
data_copy[['receiving_lat', 'receiving_lng', 'receiving_geocode_confidence']] = data_copy.apply(
    update_address_data, axis=1, address_type='receiving'
)

# Every remaining missing cell, in any column, is written out as "Unknown".
data_copy = data_copy.fillna("Unknown")

# Write the second-stage CSV without the index column.
output_path = 'cdw_csv_processed_2nd.csv'
data_copy.to_csv(output_path, index=False)

print(f"Updated data exported to: {output_path}")