In [7]:
import os
import pandas as pd
import sqlite3

# Define the directory containing the .txt files and the SQLite database file path
directory_path = 'data/txt_files'
database_path = 'data/database.db'

# Create a connection to the SQLite database
conn = sqlite3.connect(database_path)

# Loop through each .txt file in the directory
for filename in os.listdir(directory_path):
    if filename.endswith(".txt"):
        # Full path to the file
        file_path = os.path.join(directory_path, filename)
        
        # Read the .txt file into a pandas DataFrame
        df = pd.read_csv(file_path)  # Assumes default CSV delimiter (',')
        
        # Get the table name by stripping the extension from the filename
        table_name = os.path.splitext(filename)[0]
        
        # Write the DataFrame to a SQL table (replacing the table if it exists)
        df.to_sql(table_name, conn, if_exists='replace', index=False)
        
        print(f"Created table '{table_name}' from file '{filename}'.")

# Commit changes and close the connection
conn.commit()
conn.close()

print("Database created successfully!")


Created table 'agency' from file 'agency.txt'.
Created table 'trips' from file 'trips.txt'.
Created table 'stops' from file 'stops.txt'.
Created table 'transfers' from file 'transfers.txt'.
Created table 'routes' from file 'routes.txt'.
Created table 'stop_times' from file 'stop_times.txt'.
Created table 'feed_info' from file 'feed_info.txt'.
Created table 'calendar_dates' from file 'calendar_dates.txt'.
Database created successfully!


In [9]:
import xml.etree.ElementTree as ET

# Load the XML file
tree = ET.parse('/home/julien/Downloads/export-intercites-netex-last/sncf_netexfr_20241018_2327.xml')  # Replace with the actual file path
root = tree.getroot()

# Define the namespace for NeTEx (you may need to adjust this depending on your file)
ns = {'netex': 'http://www.netex.org.uk/netex'}

# Search for a train in the XML (usually a ScheduledStopPoint or ServiceJourney element in NeTEx)
train = root.find('.//netex:ServiceJourney', ns)

# If a train is found, extract relevant information
if train is not None:
    train_id = train.attrib.get('id')
    departure_time = train.find('.//netex:DepartureTime', ns)
    destination_ref = train.find('.//netex:DestinationRef', ns)
        
    print("Attributes of the train:")
    for attr_name, attr_value in train.attrib.items():
        print(f"{attr_name}: {attr_value}")
    
    # Display the train information
    print(f"Train ID: {train_id}")
    if departure_time is not None:
        print(f"Departure Time: {departure_time.text}")
    if destination_ref is not None:
        print(f"Destination: {destination_ref.text}")
else:
    print("No train found in the NeTEx XML.")



Attributes of the train:
id: FR:ServiceJourney::SN14140FERRE_1385411
responsibilitySetRef: 1187
dataSourceRef: 2148
changed: 2024-04-30T14:01:58.789
version: any
status: active
Train ID: FR:ServiceJourney::SN14140FERRE_1385411
Departure Time: 06:27:00


In [28]:
# Set to track element types that we have already printed
seen_elements = set()

def save_element_structure_once(element, file, level=0):
    # Indentation to represent hierarchy
    indent = "  " * level
    # Check if we have already processed this type of element
    if element.tag not in seen_elements:
        # Mark this element as processed
        seen_elements.add(element.tag)
        # Write the element tag (strip the namespace if necessary)
        file.write(f"{indent}{element.tag.split('}')[-1]}\n")
        
        # Recursively process child elements
        for child in element:
            save_element_structure_once(child, file, level + 1)

# Open a file to save the structure
with open('element_structure.txt', 'w') as f:
    # Save the structure from the root element
    save_element_structure_once(root, f)

print("Element structure has been saved to 'element_structure.txt'.")

Element structure has been saved to 'element_structure.txt'.


In [1]:
import os
import xml.etree.ElementTree as ET

# Dictionary to keep track of seen elements with their full parent path and their counts
seen_elements = {}

# Function to recursively capture the structure of the XML file
def capture_element_structure(element, parent_path=""):
    tag = element.tag.split('}')[-1]  # Remove namespace if present
    full_path = f"{parent_path}/{tag}" if parent_path else tag  # Full path of the current element

    # Increment the count for this element's full path
    if full_path not in seen_elements:
        seen_elements[full_path] = 1
    else:
        seen_elements[full_path] += 1

    # Recursively capture structure for child elements, passing the full parent path
    for child in element:
        capture_element_structure(child, full_path)

# Directory containing the XML files
directory = 'data/netex_xml'  # Replace with your actual directory path

# Iterate over all XML files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".xml"):
        file_path = os.path.join(directory, filename)
        print(f"Processing file: {file_path}")
        # Parse the XML file
        tree = ET.parse(file_path)
        root = tree.getroot()
        # Capture the structure of the XML file
        capture_element_structure(root)

# Write the element paths and their counts to a file for analysis
with open('xml_structure_with_counts.txt', 'w') as f:
    for path, count in seen_elements.items():
        f.write(f"{path}: {count}\n")

print("XML structure with counts has been saved to 'xml_structure_with_counts.txt'.")


NameError: name 'ET' is not defined

In [1]:
import xml.etree.ElementTree as ET

def extract_journey_parts(xml_file, max_examples=5):
    print("starting")
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()

        # Extract the namespace from the root tag dynamically
        ns = {'netex': root.tag[root.tag.find("{")+1:root.tag.find("}")]}
        print(f"Extracted namespace: {ns['netex']}")

        # Find JourneyPart elements in the XML file
        journey_parts = root.findall(".//netex:JourneyPart", ns)
        print(f"Found {len(journey_parts)} JourneyPart elements")

        # Limit to max_examples for demonstration purposes
        if len(journey_parts) == 0:
            print("No JourneyPart elements found. Please verify the XML structure and namespaces.")

        for i, journey_part in enumerate(journey_parts[:max_examples]):
            journey_part_id = journey_part.attrib.get('id')
            parent_journey_ref = journey_part.find('netex:ParentJourneyRef', ns)
            train_number_ref = journey_part.find('netex:TrainNumberRef', ns)
            from_stop_point_ref = journey_part.find('netex:FromStopPointRef', ns)
            to_stop_point_ref = journey_part.find('netex:ToStopPointRef', ns)
            start_time = journey_part.find('netex:StartTime', ns)
            end_time = journey_part.find('netex:EndTime', ns)

            # Use a safe check to get text or return None
            parent_journey_ref_text = parent_journey_ref.text if parent_journey_ref is not None else None
            train_number_ref_text = train_number_ref.text if train_number_ref is not None else None
            from_stop_point_ref_text = from_stop_point_ref.text if from_stop_point_ref is not None else None
            to_stop_point_ref_text = to_stop_point_ref.text if to_stop_point_ref is not None else None
            start_time_text = start_time.text if start_time is not None else None
            end_time_text = end_time.text if end_time is not None else None

            print(f"JourneyPart #{i + 1}:")
            print(f"  ID: {journey_part_id}")
            print(f"  ParentJourneyRef: {parent_journey_ref_text}")
            print(f"  TrainNumberRef: {train_number_ref_text}")
            print(f"  FromStopPointRef: {from_stop_point_ref_text}")
            print(f"  ToStopPointRef: {to_stop_point_ref_text}")
            print(f"  StartTime: {start_time_text}")
            print(f"  EndTime: {end_time_text}")
            print("-" * 40)

    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
    except Exception as e:
        print(f"Error: {e}")

# Use the function on your XML file
extract_journey_parts("data/netex_xml/sncf_netexfr_20241018_2327.xml")


Extracted namespace: http://www.netex.org.uk/netex
Found 184 JourneyPart elements
JourneyPart #1:
  ID: FR:JourneyPart:4735c17e-cea2-4076-9d80-931328e3a2ad
  ParentJourneyRef: None
  TrainNumberRef: None
  FromStopPointRef: None
  ToStopPointRef: None
  StartTime: 22:23:00
  EndTime: 05:55:00
----------------------------------------
JourneyPart #2:
  ID: FR:JourneyPart:dfbf8f94-5577-4f6d-bc58-a40c0c401c0d
  ParentJourneyRef: None
  TrainNumberRef: None
  FromStopPointRef: None
  ToStopPointRef: None
  StartTime: 22:23:00
  EndTime: 05:55:00
----------------------------------------
JourneyPart #3:
  ID: FR:JourneyPart:97cfdcc3-0151-4c90-a7bb-9ea15be0901b
  ParentJourneyRef: None
  TrainNumberRef: None
  FromStopPointRef: None
  ToStopPointRef: None
  StartTime: 22:23:00
  EndTime: 05:55:00
----------------------------------------
JourneyPart #4:
  ID: FR:JourneyPart:1f354860-4764-4287-b140-ebf17368cb18
  ParentJourneyRef: None
  TrainNumberRef: None
  FromStopPointRef: None
  ToStopPoin