<a href="https://colab.research.google.com/github/chiquynhdang03/Dang-Quynh-Chi/blob/main/gg_sheet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Initial Environment Setup ---
# Detect whether the code runs inside Google Colab; Drive is mounted ONLY there.
try:
    # Keep the try body limited to the import so that an ImportError raised
    # later (for example inside drive.mount) is not mistaken for
    # "not running in Colab".
    from google.colab import drive
except ImportError:
    COLAB_ENV = False
    print("Not running in Google Colab environment.")
else:
    COLAB_ENV = True
    print("Running in Google Colab environment.")
    drive.mount('/content/drive')

# --- Install Dependencies ---
if COLAB_ENV:
    # The install command below is notebook "magic" syntax and is intentionally
    # left disabled, so nothing is installed here. Re-enable it in a notebook
    # cell if the Colab runtime is missing a package.
    # !pip install pandas requests google-auth google-api-python-client urllib3 openpyxl --upgrade
    print("Skipping dependency installation in Colab (pip install step is disabled).")

# --- Core Imports ---
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import os
from datetime import datetime, timedelta
import time
import io
import ssl

# --- Google API Imports ---
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseUpload, MediaIoBaseDownload
from googleapiclient.errors import HttpError

# --- Constants for Data Cleaning & Processing ---
# 1. Complaint types to EXCLUDE from the data.
#    update_google_sheet_data() drops every row whose 'complaint_type' value
#    appears in this list (exact string match via Series.isin).
COMPLAINT_TYPES_TO_EXCLUDE = [
    'Adopt-A-Basket', 'Advocate - Other', 'Advocate-Co-opCondo Abatement',
    'Advocate-Prop Refunds/Credits', 'Animal Facility - No Permit', 'Appliance',
    'Beach/Pool/Sauna Complaint', 'Bench', 'Bike Rack', 'Bike Rack Condition',
    'Borough Office', 'Building Condition', 'Building Marshals office',
    'Bus Stop Shelter Placement', 'Calorie Labeling', 'Collection Truck Noise',
    'Construction Safety Enforcement', 'Cooling Tower', 'COVID-19 Non-essential Construction',
    'Dept of Investigations', 'Derelict Bicycle', 'Dirty Condition', 'Disorderly Youth',
    'Dispatched Taxi Complaint', 'DOF Parking - Tax Exemption', 'DOF Property - Owner Issue',
    'DOF Property - Payment Issue', 'DOF Property - Property Value', 'DOF Property - Reduction Issue',
    'DOF Property - Request Copy', 'DOF Property - RPIE Issue', 'DOF Property - Update Account',
    'DPR Internal', 'DRIE', 'DSNY Internal', 'Dumpster Complaint', 'Executive Inspections',
    'Facades', 'Face Covering Violation', 'Ferry Complaint', 'Ferry Inquiry',
    'For Hire Vehicle Report', 'Found Property', 'General', 'Green Taxi Complaint',
    'Green Taxi Report', 'Harboring Bees/Wasps', 'Heat/Hot Water', 'Highway Condition',
    'Highway Sign - Dangling', 'Home Delivered Meal - Missed Delivery', 'Homeless Encampment',
    'Homeless Street Condition', 'Housing - Low Income Senior', 'Housing Options',
    'Illegal Animal Kept as Pet', 'Illegal Animal Sold', 'Incorrect Data',
    'Institution Disposal Complaint', 'Internal Code', 'Lifeguard', 'Literature Request',
    'Mass Gathering Complaint', 'Miscellaneous Categories', 'Municipal Parking Facility',
    'Noise - House of Worship', 'NonCompliance with Phased Reopening', 'Oil or Gas Spill',
    'Other Enforcement', 'OUTSITE BUILDING', 'Overflowing Litter Baskets', 'Paint/Plaster',
    'Plant', 'Posting Advertisement', 'Private or Charter School Reopening',
    'Private School Vaccine Mandate Non-Compliance', 'Public Payphone Complaint',
    'Public Toilet', 'Quality of Life', 'Radioactive Material', 'Recycling Basket Complaint',
    'Recycling Enforcement', 'Retailer Complaint', 'SCRIE', 'Seasonal Collection',
    'Senior Center Complaint', 'Sewer Maintenance', 'Single Occupancy Bathroom', 'Snow',
    'Snow Removal', 'Special Operations', 'Squeegee', 'Storm', 'Sustainability Enforcement',
    'Sweeping/Inadequate', 'Sweeping/Missed', 'Tanning', 'Tattooing', 'Taxi Licensee Complaint',
    'Taxpayer Advocate Inquiry', 'Unsanitary Animal Facility', 'Unsanitary Animal Pvt Property',
    'Uprooted Stump', 'Vacant Lot', 'Vaccine Mandate Non-Compliance', 'Water Leak',
    'Water Maintenance', 'Window Guard', 'Wood Pile Remaining', 'X-Ray Machine/Equipment'
]
# 2. Dictionary to MERGE different Complaint Type names.
#    Keys are the spellings to replace; values are the names they are merged
#    into. process_and_clean_data() applies it to the 'complaint_type' column
#    with Series.replace, so matching is exact and case-sensitive.
COMPLAINT_TYPE_MERGE_MAP = {
    'Animal-Abuse': 'Animal Abuse',
    'Derelict Vehicle': 'Derelict Vehicles',
    'Electrical': 'ELECTRIC',
    'ELEVATOR': 'Elevator',
    'Litter Basket / Request': 'Litter Basket Request',
    'PLUMBING': 'Plumbing',
    'Smoking': 'Smoking or Vaping'
}
# 3. List of Descriptors to DELETE within the 'Consumer Complaint' type.
#    process_and_clean_data() removes a row only when its 'complaint_type' is
#    exactly 'Consumer Complaint' AND its 'descriptor' is in this list; the
#    same descriptor under any other complaint type is kept.
CONSUMER_COMPLAINT_DESCRIPTORS_TO_DELETE = [
    'Retail Store', 'Sidewalk Cafe', 'Other', 'False Advertising', 'Exchange/Refund/Return',
    'Locksmith', 'Car Wash', 'Department Store or Megastore', 'Barber Shop, Beauty Salon, or Nail Salon',
    'Damaged Vehicle', 'Non-Delivery Goods/Services', 'Unlicensed', 'Car Not Available',
    'Non-Delivery of Papers', 'Furniture Store', 'Receipt Incomplete/Not Given',
    'Home Heating Oil Company', 'Auction House or Auctioneer', 'Scale Dealer/Repairer',
    'Smoking, Cigar or Vape Store', 'Moving Company', 'Secondhand Dealer', 'Bail Bond Agent',
    'Catering Establishment', 'Home Appliance Store', 'Publishing Company', 'House/Property Damaged',
    'Contract Dispute', 'Laundry', 'Wholesale Food Market', 'Jewelry Appraiser',
    'Disabled Device Dealer', 'Horse Drawn Carriage', 'Going Out of Business',
    'Door Open with Air Conditioning On', 'Laundromat', 'Gaming Cafe', 'Funeral Home',
    'Gas Station', 'Bingo Hall', 'Dealer in Products for the Disabled', 'Hardware Store',
    'Pet Store', 'High Pressure to Take on Loan/Debt', 'Debt Not Owed', 'Landlord or Real Estate Agent',
    'Jewelry Store', 'Billing Dispute', 'Documents/Paperwork Missing', 'Illegal/Unfair Booting',
    'Over Capacity', 'Price Not Posted', 'Rates Not Posted', 'Lost Property', 'Mandatory Tip',
    'Paid in Advance', 'Scale Inaccurate/Broken', 'Used Goods Dealer', 'Shipping Company',
    'Vocational or Trade School', 'Harassment', 'Damaged/Defective Goods', 'Overcharge'
]
# 4. Dictionary to MERGE Descriptors within the 'Consumer Complaint' type.
#    Keys are the descriptor spellings to replace; values are the names they
#    are merged into. process_and_clean_data() applies it only to rows whose
#    'complaint_type' is 'Consumer Complaint'.
CONSUMER_COMPLAINT_DESCRIPTOR_MAP = {
    'Bodega/Deli/Supermarket': 'Bodega, Deli, or Convenience Store',
    'Garage/Parking Lot': 'Garage or Parking Lot',
    'Ticket Broker': 'Ticket Seller',
    'Car Dealer - Used': 'Used Car Dealer',
    'Hotel': 'Hotel or Motel',
    'Immigration Services': 'Immigration Services Provider',
    'Mail Order': 'Online or Mail Order',
    'Stoop Line': 'Stoop Line Stand',
    'Tour Company': 'Tour Guide',
    'Tax Preparer': 'Tax Preparation Services',
    'For-profit College': 'For-Profit College or University'
}
# 5. List of Descriptors to RE-CATEGORIZE from 'Consumer Complaint' to 'Vendor Enforcement'.
#    process_and_clean_data() rewrites 'complaint_type' to 'Vendor Enforcement'
#    for 'Consumer Complaint' rows whose 'descriptor' is in this list.
VENDOR_DESCRIPTORS_TO_RECATEGORIZE = ['Vendor', 'General Vendor', 'Street Fair Vendor']

# --- Configuration Constants ---
# Socrata CSV endpoint queried by fetch_nyc_data_incremental().
NYC_OPEN_DATA_RESOURCE_URL = "https://data.cityofnewyork.us/resource/erm2-nwe9.csv"
# Page size sent as '$limit' on every API request.
API_LIMIT_PER_REQUEST = 1000
# Rows are kept only when 'incident_zip' (converted to a number) is in this list.
ZIP_CODES_TO_INCLUDE = [10004, 10005, 10006, 10007, 10038, 10280, 10282, 10013, 10002]
# Drive folder that holds both the sheet and the progress file.
GOOGLE_DRIVE_FOLDER_ID = "0AI4Egw2Y1IwhUk9PVA"
GSHEET_FILE_NAME = "CB1_311_Complaint_Data_GSheet"
GSHEET_MIMETYPE = "application/vnd.google-apps.spreadsheet"
# The sheet is created under GSHEET_FILE_NAME and looked up under
# GSHEET_DRIVE_FILE_NAME. Derive one from the other so the two can never
# drift apart (a mismatch would make every run create a new sheet).
GSHEET_DRIVE_FILE_NAME = GSHEET_FILE_NAME
# Used as the resume point when no valid progress file exists.
DEFAULT_INITIAL_FETCH_DATE = "2018-07-01"
PROGRESS_FILE_NAME = "last_processed_date.txt"
PROGRESS_FILE_MIMETYPE = "text/plain"

# The path to the service account key depends on where the code is running.
if COLAB_ENV:
    # This path is used when running in Google Colab (Drive is mounted).
    SERVICE_ACCOUNT_KEY_FILE = "/content/drive/Shareddrives/311_Complaint_Data/service_account_key.json"
else:
    # This path is used when running in GitHub Actions (relative to the
    # working directory).
    SERVICE_ACCOUNT_KEY_FILE = "service_account_key.json"

# --- Helper Functions ---
def read_last_processed_date(drive_service) -> str:
    """Return the last processed date stored in the Drive progress file.

    Looks for ``PROGRESS_FILE_NAME`` inside ``GOOGLE_DRIVE_FOLDER_ID``,
    downloads it and checks that its content parses as ``YYYY-MM-DD``.

    Args:
        drive_service: Google Drive v3 service object, as returned by
            ``googleapiclient.discovery.build``.

    Returns:
        The stored date string, or ``DEFAULT_INITIAL_FETCH_DATE`` when the
        file is missing, cannot be downloaded, or does not hold a valid date.
        This function never raises; every failure falls back to the default.
    """
    print("Attempting to read last processed date from Google Drive...")
    try:
        # The shared-drive flags are required for the query to return files
        # stored in a shared drive (the create call in this file already uses
        # supportsAllDrives); they are harmless for "My Drive" folders.
        file_list_response = drive_service.files().list(
            q=f"'{GOOGLE_DRIVE_FOLDER_ID}' in parents and trashed=false and name='{PROGRESS_FILE_NAME}'",
            spaces='drive', fields='files(id, name)',
            supportsAllDrives=True, includeItemsFromAllDrives=True
        ).execute()
        file_items = file_list_response.get('files', [])

        if not file_items:
            print(f"No progress file found. Starting from default: {DEFAULT_INITIAL_FETCH_DATE}")
            return DEFAULT_INITIAL_FETCH_DATE

        progress_file_id = file_items[0]['id']
        print(f"Found progress file (ID: {progress_file_id}). Downloading...")
        request = drive_service.files().get_media(fileId=progress_file_id, supportsAllDrives=True)
        file_content_bytes = io.BytesIO()
        downloader = MediaIoBaseDownload(file_content_bytes, request)
        done = False
        while not done:
            _, done = downloader.next_chunk()

        date_str = file_content_bytes.getvalue().decode('utf-8').strip()
        # Validation only: raises ValueError when the content is not a date.
        datetime.strptime(date_str, '%Y-%m-%d')
        print(f"Resuming from last processed date: {date_str}")
        return date_str
    except Exception as e:
        # Deliberately broad: reading progress is best-effort, and
        # ValueError / HttpError are already subclasses of Exception.
        print(f"Error reading progress file: {e}. Starting from default.")
        return DEFAULT_INITIAL_FETCH_DATE

def write_last_processed_date(date_str: str, drive_service):
    """Store ``date_str`` in the Drive progress file, creating it if needed.

    Args:
        date_str: Date to persist; written verbatim as UTF-8 text.
        drive_service: Google Drive v3 service object, as returned by
            ``googleapiclient.discovery.build``.

    Returns:
        None. Saving progress is best-effort: any failure is printed as a
        warning and not raised.
    """
    try:
        # The shared-drive flags let the lookup see files stored in a shared
        # drive. Without them the existing progress file is not found and a
        # new duplicate would be created on every run.
        file_list_response = drive_service.files().list(
            q=f"'{GOOGLE_DRIVE_FOLDER_ID}' in parents and trashed=false and name='{PROGRESS_FILE_NAME}'",
            spaces='drive', fields='files(id, name)',
            supportsAllDrives=True, includeItemsFromAllDrives=True
        ).execute()
        file_items = file_list_response.get('files', [])

        progress_buffer = io.BytesIO(date_str.encode('utf-8'))
        media = MediaIoBaseUpload(progress_buffer, mimetype=PROGRESS_FILE_MIMETYPE, resumable=True)
        if file_items:
            # Overwrite the content of the existing progress file in place.
            drive_service.files().update(
                fileId=file_items[0]['id'], media_body=media, supportsAllDrives=True
            ).execute()
        else:
            file_metadata = {'name': PROGRESS_FILE_NAME, 'parents': [GOOGLE_DRIVE_FOLDER_ID]}
            drive_service.files().create(body=file_metadata, media_body=media, supportsAllDrives=True).execute()
        print(f"Progress saved: Last processed date is now {date_str}")
    except Exception as e:
        print(f"Warning: Could not save progress to Drive file: {e}")

def process_and_clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and normalise a frame of fetched complaint records.

    Steps, in order:
      1. Drop 'Consumer Complaint' rows whose descriptor is listed in
         ``CONSUMER_COMPLAINT_DESCRIPTORS_TO_DELETE``.
      2. Merge complaint type names via ``COMPLAINT_TYPE_MERGE_MAP``.
      3. Merge 'Consumer Complaint' descriptors via
         ``CONSUMER_COMPLAINT_DESCRIPTOR_MAP``.
      4. Re-label vendor descriptors as 'Vendor Enforcement'.
      5. Rewrite the date columns that are present as
         ``YYYY-MM-DD HH:MM:SS`` strings.

    Args:
        df: Records with at least 'complaint_type' and 'descriptor' columns.

    Returns:
        A new cleaned DataFrame; the input frame is not modified. An empty
        input is returned unchanged.
    """
    if df.empty:
        return df
    print("\n--- Starting Comprehensive Local Data Processing ---")

    initial_rows = len(df)
    condition_to_delete = (df['complaint_type'] == 'Consumer Complaint') & \
                          (df['descriptor'].isin(CONSUMER_COMPLAINT_DESCRIPTORS_TO_DELETE))
    # Take an explicit copy: the assignments below must write to an
    # independent frame, not to a slice of the caller's data (which triggers
    # pandas' SettingWithCopyWarning and leaves the write target ambiguous).
    df = df[~condition_to_delete].copy()
    print(f"Deleted {initial_rows - len(df)} rows based on 'Consumer Complaint' descriptor list.")

    print("Merging general complaint types...")
    df['complaint_type'] = df['complaint_type'].replace(COMPLAINT_TYPE_MERGE_MAP)

    print("Merging specific 'Consumer Complaint' descriptors...")
    consumer_mask = df['complaint_type'] == 'Consumer Complaint'
    df.loc[consumer_mask, 'descriptor'] = df.loc[consumer_mask, 'descriptor'].replace(CONSUMER_COMPLAINT_DESCRIPTOR_MAP)

    print("Re-categorizing vendor complaints from 'Consumer Complaint' to 'Vendor Enforcement'...")
    # Evaluated after the descriptor merge, so the mask sees merged names.
    vendor_mask = (df['complaint_type'] == 'Consumer Complaint') & \
                  (df['descriptor'].isin(VENDOR_DESCRIPTORS_TO_RECATEGORIZE))
    df.loc[vendor_mask, 'complaint_type'] = 'Vendor Enforcement'

    print("Standardizing date formats...")
    date_cols = ['created_date', 'closed_date', 'due_date', 'resolution_action_updated_date']
    for col in date_cols:
        if col in df.columns:
            # Unparseable values become NaT (errors='coerce').
            # NOTE(review): utc=True treats timestamps without an offset as
            # UTC before converting to New York time. If the API already
            # delivers local wall-clock times, this shifts every value by the
            # UTC offset - confirm against the dataset's column type.
            df[col] = pd.to_datetime(df[col], errors='coerce', utc=True)
            df[col] = df[col].dt.tz_convert('America/New_York').dt.strftime('%Y-%m-%d %H:%M:%S')

    print("--- Comprehensive Local Data Processing Complete ---\n")
    return df

def fetch_nyc_data_incremental(start_date_str: str, end_date_str: str = None) -> pd.DataFrame:
    """Fetch all records for the community board within a date range.

    Pages through ``NYC_OPEN_DATA_RESOURCE_URL`` with ``$limit``/``$offset``,
    filtering server-side by ``created_date`` and community board only.

    Args:
        start_date_str: First day to fetch (``YYYY-MM-DD``, inclusive).
        end_date_str: Last day to fetch (``YYYY-MM-DD``, inclusive). When
            omitted or empty, no upper bound is applied.

    Returns:
        One DataFrame with every fetched page concatenated, or an empty
        DataFrame when nothing was fetched.

    Note:
        A request error (after retries) stops paging and returns the pages
        collected so far; it is not raised.
        NOTE(review): the caller cannot tell a complete result from a
        partial one - consider signalling the failure so progress is not
        advanced past data that was never downloaded.
    """
    if end_date_str:
        date_filter = f"created_date >= '{start_date_str}T00:00:00.000' AND created_date <= '{end_date_str}T23:59:59.999'"
    else:
        date_filter = f"created_date >= '{start_date_str}T00:00:00.000'"

    community_board_filter = "contains(community_board, '01 MANHATTAN')"
    where_clause = f"{date_filter} AND {community_board_filter}"
    print(f"Using simplified, robust WHERE clause:\n{where_clause}")

    retries = Retry(total=10, backoff_factor=2, status_forcelist=[429, 500, 502, 503, 504])
    all_fetched_dfs = []
    offset = 0

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retries))

        while True:
            params = {
                '$limit': API_LIMIT_PER_REQUEST,
                '$offset': offset,
                '$where': where_clause,
                # Offset paging needs a total order: many records can share
                # the same created_date, so break ties with the unique key to
                # keep rows from being repeated or skipped between pages.
                '$order': 'created_date ASC, unique_key ASC',
            }
            try:
                print(f"Fetching page with offset: {offset}...")
                response = session.get(NYC_OPEN_DATA_RESOURCE_URL, params=params, timeout=90)
                response.raise_for_status()

                if not response.text.strip():
                    break  # empty body: nothing left to fetch

                page_df = pd.read_csv(io.StringIO(response.text))
            except (requests.exceptions.RequestException, pd.errors.EmptyDataError) as e:
                print(f"Stopping fetch loop: {e}")
                break

            if page_df.empty:
                break  # header-only page: nothing left to fetch

            print(f"Successfully fetched {len(page_df)} records for the community board.")
            all_fetched_dfs.append(page_df)
            offset += len(page_df)

            if len(page_df) < API_LIMIT_PER_REQUEST:
                break  # short page: this was the last one

            time.sleep(1)  # pause between consecutive page requests

    if not all_fetched_dfs:
        print("No new records for this community board were found from the API.")
        return pd.DataFrame()

    return pd.concat(all_fetched_dfs, ignore_index=True)

# --- Main Orchestrator Function ---
def update_google_sheet_data():
    """Authenticate, fetch new records, clean them and upload the sheet.

    Steps:
      1. Build a Drive v3 client from ``SERVICE_ACCOUNT_KEY_FILE``.
      2. Download the existing sheet (if any) as CSV.
      3. Fetch records from the last processed date up to now, one calendar
         year at a time.
      4. Filter by ZIP code and complaint type, then clean the data.
      5. Append to the existing rows, de-duplicate on 'unique_key' and
         upload the combined CSV.
      6. Save the progress date.

    Returns:
        None. Authentication, download and upload failures are printed and
        end the run early without advancing the saved progress date.
    """
    print("--- Starting Google Sheet Update Process with Yearly Chunking ---")

    # The key file path is chosen at module level (Colab vs. other
    # environments), so no Colab-specific handling is needed here.
    print("Authenticating with Google Drive...")
    try:
        creds = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_KEY_FILE, scopes=['https://www.googleapis.com/auth/drive'])
        drive_service = build('drive', 'v3', credentials=creds)
        print("Google Drive authentication successful.")
    except Exception as e:
        print(f"FATAL: Google Drive authentication failed: {e}")
        print(f"Attempted to use key file path: {SERVICE_ACCOUNT_KEY_FILE}")
        return

    existing_df, file_id_in_drive = pd.DataFrame(), None
    try:
        # The shared-drive flags let the lookup see files stored in a shared
        # drive; without them the sheet is not found and a duplicate would be
        # created on every run.
        file_list_response = drive_service.files().list(
            q=f"'{GOOGLE_DRIVE_FOLDER_ID}' in parents and trashed=false and name='{GSHEET_DRIVE_FILE_NAME}'",
            spaces='drive', fields='files(id, name, mimeType)',
            supportsAllDrives=True, includeItemsFromAllDrives=True).execute()
        file_items = file_list_response.get('files', [])
        if file_items:
            file_id_in_drive = file_items[0]['id']
            print(f"Found existing Google Sheet (ID: {file_id_in_drive}). Downloading...")
            request = drive_service.files().export_media(fileId=file_id_in_drive, mimeType='text/csv')
            file_content_bytes = io.BytesIO()
            downloader = MediaIoBaseDownload(file_content_bytes, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()
            file_content_bytes.seek(0)
            existing_df = pd.read_csv(file_content_bytes)
    except pd.errors.EmptyDataError:
        # The sheet exists but has no content: nothing to preserve.
        print("Existing sheet is empty. Starting fresh.")
    except Exception as e:
        # Do NOT continue with an empty frame here: the upload below would
        # replace the existing sheet with only the newly fetched rows and
        # silently discard all previously stored data.
        print(f"FATAL: Could not load existing sheet: {e}")
        print("Aborting so that existing data is not overwritten.")
        return

    last_processed_date_str = read_last_processed_date(drive_service)
    earliest_start = datetime.strptime(DEFAULT_INITIAL_FETCH_DATE, '%Y-%m-%d')
    # Re-fetch the last processed day itself (no "+ 1 day"): that day was
    # still in progress when it was recorded, and the default start date
    # would otherwise be skipped. Overlapping rows are removed by the
    # 'unique_key' de-duplication below.
    start_date = max(datetime.strptime(last_processed_date_str, '%Y-%m-%d'), earliest_start)
    end_date = datetime.now()
    # Record yesterday, not today, as the resume point: today is incomplete
    # and the runner's clock may be ahead of the dataset's time zone
    # (presumably New York local time - confirm), so resuming from "today"
    # could skip records created late in the previous local day.
    progress_date_str = (end_date - timedelta(days=1)).strftime('%Y-%m-%d')

    all_chunks_dfs = []
    print("\n--- Starting Fetch Process in Yearly Chunks ---")
    for year in range(start_date.year, end_date.year + 1):
        chunk_start_date = max(start_date, datetime(year, 1, 1))
        chunk_end_date = min(end_date, datetime(year, 12, 31))
        if chunk_start_date > chunk_end_date:
            continue
        print(f"\n>>> Processing chunk for {year} ({chunk_start_date.strftime('%Y-%m-%d')} to {chunk_end_date.strftime('%Y-%m-%d')})")
        chunk_df = fetch_nyc_data_incremental(
            chunk_start_date.strftime('%Y-%m-%d'), chunk_end_date.strftime('%Y-%m-%d'))
        if not chunk_df.empty:
            all_chunks_dfs.append(chunk_df)

    if not all_chunks_dfs:
        print("\nNo new data found. Process finished.")
        write_last_processed_date(progress_date_str, drive_service)
        return

    new_data_df = pd.concat(all_chunks_dfs, ignore_index=True)

    # Local Filtering
    print("\n--- Starting Local Filtering with Pandas ---")
    # Non-numeric ZIP values become NaN and therefore fail the isin() test.
    new_data_df['incident_zip'] = pd.to_numeric(new_data_df['incident_zip'], errors='coerce')
    filtered_df = new_data_df[new_data_df['incident_zip'].isin(ZIP_CODES_TO_INCLUDE)].copy()
    print(f"ZIP Code Filter: Kept {len(filtered_df)} of {len(new_data_df)} rows.")
    initial_rows = len(filtered_df)
    filtered_df = filtered_df[~filtered_df['complaint_type'].isin(COMPLAINT_TYPES_TO_EXCLUDE)]
    print(f"Complaint Type Exclusion: Kept {len(filtered_df)} of {initial_rows} rows.")
    print("--- Local Filtering Complete ---\n")

    if filtered_df.empty:
        print("No data remains after filtering. Process finished.")
        write_last_processed_date(progress_date_str, drive_service)
        return

    processed_df = process_and_clean_data(filtered_df)

    if not existing_df.empty:
        # Align the new rows to the existing sheet's columns: columns missing
        # from the new data are filled with NA, extra new columns are dropped.
        processed_df = processed_df.reindex(columns=existing_df.columns, fill_value=pd.NA)
        combined_df = pd.concat([existing_df, processed_df], ignore_index=True)
    else:
        combined_df = processed_df

    if 'unique_key' in combined_df.columns:
        # keep='last' lets a freshly fetched record replace its stored copy.
        combined_df = combined_df.drop_duplicates(subset=['unique_key'], keep='last')

    csv_bytes = combined_df.to_csv(index=False).encode('utf-8')
    media = MediaIoBaseUpload(io.BytesIO(csv_bytes), mimetype='text/csv', resumable=True)

    try:
        if file_id_in_drive:
            drive_service.files().update(fileId=file_id_in_drive, media_body=media, supportsAllDrives=True).execute()
        else:
            file_metadata = {'name': GSHEET_FILE_NAME, 'parents': [GOOGLE_DRIVE_FOLDER_ID], 'mimeType': GSHEET_MIMETYPE}
            drive_service.files().create(body=file_metadata, media_body=media, supportsAllDrives=True, fields='id').execute()
        # Advance the progress marker only after a successful upload.
        write_last_processed_date(progress_date_str, drive_service)
        print("\n--- Google Sheet Update and Upload Process Completed ---\n")
    except HttpError as e:
        print(f"FATAL: An error occurred during file upload: {e}")

# --- Main Execution Block ---
# Run the full update only when this file is executed directly (for example
# as a script or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    update_google_sheet_data()

Not running in Google Colab environment.


--- Starting Google Sheet Update Process with Yearly Chunking ---
Authenticating with Google Drive...
FATAL: Google Drive authentication failed: Expecting value: line 2 column 1 (char 1)
Attempted to use key file path: service_account_key.json
