In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from google.cloud import storage
import gzip
from io import BytesIO

In [None]:
# Columns of the Inexus trip table that the rest of the notebook uses.
columns_to_read = [
    'IDMerged',
    'actStartType', 'actEndType',
    'duration_travelling', 'distance_travelling',
    'actStartTime', 'actEndTime',
    'startX', 'startY', 'endX', 'endY',
]

# Plans table exported by the BEAM run (file 0.plans.csv.gz under ITERS/it.0).
Plans2 = pd.read_csv(
    'gs://beam-core-outputs/sfbay-baseline-20230526/beam/year-2020-iteration-4/ITERS/it.0/0.plans.csv.gz'
)

# Inexus trip table for the same baseline run, restricted to the columns above.
# Replace nrows=None with an integer to load only a smaller sample.
Inexus2 = pd.read_csv(
    'gs://beam-core-outputs/sfbay-baseline-20230526/inexus/sfbay_baseline_default-1.0_2020__20230526.csv.gz',
    usecols=columns_to_read,
    nrows=None,
)



In [None]:
import pandas as pd
from lxml import etree
import numpy as np
from xml.dom import minidom
import xml.etree.ElementTree as ET
import pandas as pd
from pyproj import Transformer

# Disabled: restrict the plans table to selected plans only.
# Plans2 = Plans[Plans['planSelected'] == True]

# Rename the Inexus columns to the names used below, then number each person's
# rows 1, 3, 5, ... in row order; that number is the join key against the
# plans' planElementIndex.
# TODO: check act type
Inexus2 = (
    Inexus2
    .rename(columns={
        'IDMerged': 'person_id',
        'actStartType': 'activityTypeFrom',
        'actEndType': 'activityTypeTo',
    })
    .assign(planIndex_inexus=lambda trips: trips.groupby('person_id').cumcount() * 2 + 1)
)

# Left-join every Inexus row to its plan element and order rows per person.
df_final_merged = (
    Inexus2
    .merge(
        Plans2,
        how='left',
        left_on=['person_id', 'planIndex_inexus'],
        right_on=['personId', 'planElementIndex'],
    )
    .sort_values(['person_id', 'planIndex_inexus'], ascending=True)
)
print(len(df_final_merged))

# Drop the rows for which the join found no plan element.
df_final_merged = df_final_merged.dropna(subset=['planElementIndex'])
print(len(df_final_merged))

# Look-ahead / look-behind columns consumed by the XML writer:
#   next_person - person_id of the following row
#   pre_start   - actStartTime of the preceding row
df_final_merged = df_final_merged.assign(
    next_person=lambda trips: trips['person_id'].shift(-1),
    pre_start=lambda trips: trips['actStartTime'].shift(1),
)

In [None]:
def safe_str(obj):
    """Convert ``obj`` to text, mapping missing values to an empty string.

    Anything ``pd.isnull`` reports as missing (None, NaN, NaT, ...) becomes
    ``''``; every other object is passed through ``str``.
    """
    if pd.isnull(obj):
        return ''
    return str(obj)

# Shared projection from EPSG:4326 to EPSG:26910. always_xy=True makes the
# transformer take its arguments in (x, y) = (longitude, latitude) order.
transformer = Transformer.from_crs('epsg:4326', 'epsg:26910', always_xy=True)


def convert_coords(lat, lon):
    """Project a latitude/longitude pair with the module-level ``transformer``.

    Note the argument order: this function takes ``(lat, lon)`` but hands them
    to the transformer as ``(lon, lat)``.

    Returns:
        Whatever ``transformer.transform`` returns for the pair, i.e. the
        projected ``(x, y)`` coordinates.
    """
    projected = transformer.transform(lon, lat)
    return projected

def seconds_to_hh_mm_ss(seconds):
    """Format a number of seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        seconds: Number of seconds. Any value accepted by ``int()`` works;
            fractional seconds are truncated so that float input (e.g. a
            value read from a float column) is not rendered as "1.0:1.0:1.0".

    Returns:
        The formatted string. Hours are not wrapped at 24, so 90000 seconds
        becomes "25:00:00".

    Raises:
        ValueError: If ``seconds`` cannot be converted to an integer (NaN,
            empty string, ...).
    """
    # Coerce once up front so all three fields are integers and the ":02"
    # format pads them instead of printing a float representation.
    total_seconds = int(seconds)

    hours, remainder = divmod(total_seconds, 3600)  # 3600 seconds in an hour
    minutes, secs = divmod(remainder, 60)           # 60 seconds in a minute

    return f"{hours:02}:{minutes:02}:{secs:02}"

def _hh_mm_ss_or_blank(value):
    """Format ``value`` (seconds) as HH:MM:SS, or return '' when it is None/NaN."""
    return '' if pd.isnull(value) else seconds_to_hh_mm_ss(int(value))


def _int_str_or_blank(value):
    """Return ``value`` truncated to an integer as text, or '' when it is None/NaN."""
    return '' if pd.isnull(value) else str(int(value))


def _set_time_attr(element, name, value):
    """Set attribute ``name`` on ``element`` to HH:MM:SS unless ``value`` is None/NaN."""
    if pd.notna(value):
        element.set(name, seconds_to_hh_mm_ss(int(value)))


def _add_activity(plan, activity_type, lat, lon):
    """Append an ``activity`` to ``plan`` with its coordinates projected by ``convert_coords``."""
    proj_x, proj_y = convert_coords(lat, lon)
    activity = ET.SubElement(plan, "activity",
                             type=safe_str(activity_type),
                             x=safe_str(proj_x),
                             y=safe_str(proj_y))
    activity.text = ' '
    return activity


def convert_csv_to_xml(df_final_merged, xml_file_path):
    """Write the merged trip/plan table as a pretty-printed XML file.

    The frame is walked row by row (one row per trip) and turned into
    ``people > person > plan > (activity, leg > route)*, activity``:

    * a ``person`` element is opened whenever ``person_id`` differs from the
      previous row;
    * a ``plan`` element is opened for every new person, and also when
      ``planScore`` changes between consecutive rows;
    * every row contributes the activity at the trip origin
      (``activityTypeTo``, ``startY``/``startX``), then a ``leg`` with a
      nested ``route``;
    * the last row of a person (``next_person`` differs from ``person_id``)
      additionally contributes the closing activity (``activityTypeFrom``,
      ``endY``/``endX``).

    Args:
        df_final_merged: DataFrame with the columns read below: person_id,
            next_person, pre_start, planScore, planSelected, activityTypeTo,
            activityTypeFrom, startX, startY, endX, endY, actStartTime,
            actEndTime, duration_travelling, distance_travelling, legMode,
            legRouteType, legRouteStartLink, legRouteEndLink, legRouteLinks.
            Rows are expected to be grouped by person in trip order.
        xml_file_path: Destination path. Missing parent directories are
            created.

    Returns:
        ``xml_file_path``, unchanged.
    """
    import os  # local import: only needed to prepare the output directory

    root = ET.Element("people")

    # person/plan start as None so the very first row always opens both,
    # whatever its id or score happens to be.
    person = None
    plan = None
    person_pre = -1
    score_pre = -1

    total_rows = len(df_final_merged)

    for row_count, (_, row) in enumerate(df_final_merged.iterrows(), start=1):
        if row_count % 50000 == 0:
            print(f"Processing row {row_count}/{total_rows}")  # Print the progress

        person_id = row['person_id']
        next_person = row['next_person']

        # NaN never equals anything, so a row without an id always starts a
        # new (id-less) person.
        is_new_person = person is None or person_id != person_pre
        if is_new_person:
            person_label = safe_str(int(person_id)) if pd.notna(person_id) else ''
            person = ET.SubElement(root, "person", id=person_label)

        # A plan must be opened for every new person: comparing scores alone
        # would attach a person's trips to the previous person's plan whenever
        # both plans have the same score. Two missing scores count as equal so
        # that NaN scores do not open a new plan on every row.
        score = row['planScore']
        same_score = score == score_pre or (pd.isnull(score) and pd.isnull(score_pre))
        if is_new_person or plan is None or not same_score:
            selected = row['planSelected']
            # MATSim-style plans use selected="yes"/"no"; a missing flag is
            # written as "no" rather than treated as truthy.
            plan_selected_str = "yes" if (pd.notna(selected) and bool(selected)) else "no"
            plan = ET.SubElement(person, "plan",
                                 score=safe_str(score),
                                 selected=plan_selected_str)

        # Activity at the trip origin. Its start time is the previous row's
        # actStartTime, which only belongs to this person when the previous
        # row does too.
        origin = _add_activity(plan, row['activityTypeTo'], row['startY'], row['startX'])
        if not is_new_person:
            _set_time_attr(origin, 'start_time', row.get('pre_start'))
        _set_time_attr(origin, 'end_time', row.get('actEndTime'))

        # Leg for the trip itself; missing times become empty attributes
        # instead of raising on int(NaN).
        leg = ET.SubElement(plan, "leg",
                            mode=safe_str(row['legMode']),
                            dep_time=_hh_mm_ss_or_blank(row['actEndTime']),
                            trav_time=_hh_mm_ss_or_blank(row['duration_travelling']))

        route = ET.SubElement(leg, "route",
                              type=safe_str(row['legRouteType']),
                              start_link=_int_str_or_blank(row.get('legRouteStartLink')),
                              end_link=_int_str_or_blank(row.get('legRouteEndLink')),
                              distance=_int_str_or_blank(row.get('distance_travelling')),
                              trav_time=_hh_mm_ss_or_blank(row.get('duration_travelling')))
        # Link ids arrive ';'-separated and are written space-separated.
        route.text = ' '.join(safe_str(row['legRouteLinks']).split(';'))

        # Closing activity after the person's last trip. The end coordinates
        # go through the same projection as the start coordinates so that all
        # activities of a plan share one coordinate system.
        if next_person != person_id:
            destination = _add_activity(plan, row['activityTypeFrom'], row['endY'], row['endX'])
            _set_time_attr(destination, 'start_time', row.get('actStartTime'))

        person_pre = person_id
        score_pre = score

    xml_bytes = ET.tostring(root, encoding='utf-8', method='xml')
    pretty_xml_bytes = minidom.parseString(xml_bytes).toprettyxml(
        indent="   ", newl="\n", encoding='UTF-8')

    # Make sure the target directory exists before writing.
    output_dir = os.path.dirname(xml_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(xml_file_path, "wb") as f:  # toprettyxml(encoding=...) returns bytes
        f.write(pretty_xml_bytes)

    return xml_file_path

# Where the generated XML is written (relative to the working directory).
xml_file_path = 'outputs/output_file.xml'

# Convert the merged table; the call evaluates to the path that was written.
convert_csv_to_xml(df_final_merged=df_final_merged, xml_file_path=xml_file_path)