In [1]:
 pip install pandas geopandas shapely tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CSV to GeoDataFrame Conversion with Geometry\n",
    "This notebook processes the Lyon public transport stops CSV file and adds geometry information."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "from shapely.geometry import Point\n",
    "from tqdm.notebook import tqdm\n",
    "import os\n",
    "from pathlib import Path\n",
    "import logging\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Define file paths\n",
    "input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'\n",
    "output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.csv'\n",
    "\n",
    "# Verify input file exists\n",
    "if not input_file.exists():\n",
    "    raise FileNotFoundError(f\"Input file not found at {input_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def read_csv_in_chunks(file_path, chunk_size=1000):\n",
    "    \"\"\"Read CSV file in chunks with error handling\"\"\"\n",
    "    try:\n",
    "        # Get total number of lines for progress bar\n",
    "        total_lines = sum(1 for _ in open(file_path, 'r', encoding='utf-8')) - 1\n",
    "        \n",
    "        chunks = pd.read_csv(\n",
    "            file_path,\n",
    "            chunksize=chunk_size,\n",
    "            encoding='utf-8'\n",
    "        )\n",
    "        return chunks, total_lines\n",
    "    except Exception as e:\n",
    "        logging.error(f\"Error reading CSV file: {e}\")\n",
    "        raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def create_geometry(df):\n",
    "    \"\"\"Create geometry points from latitude and longitude\"\"\"\n",
    "    try:\n",
    "        geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]\n",
    "        return geometry\n",
    "    except Exception as e:\n",
    "        logging.error(f\"Error creating geometry: {e}\")\n",
    "        raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def process_data():\n",
    "    \"\"\"Main processing function with progress tracking\"\"\"\n",
    "    try:\n",
    "        chunks, total_lines = read_csv_in_chunks(input_file)\n",
    "        processed_chunks = []\n",
    "        \n",
    "        # Create progress bar\n",
    "        pbar = tqdm(total=total_lines, desc=\"Processing rows\")\n",
    "        \n",
    "        for chunk in chunks:\n",
    "            # Validate lat/lon columns\n",
    "            if 'lat' not in chunk.columns or 'lon' not in chunk.columns:\n",
    "                raise ValueError(\"Required columns 'lat' and 'lon' not found in CSV\")\n",
    "            \n",
    "            # Create GeoDataFrame\n",
    "            geometry = create_geometry(chunk)\n",
    "            gdf = gpd.GeoDataFrame(chunk, geometry=geometry, crs=\"EPSG:4326\")\n",
    "            \n",
    "            # Convert geometry to WKT format for CSV storage\n",
    "            gdf['geometry'] = gdf['geometry'].astype(str)\n",
    "            \n",
    "            processed_chunks.append(gdf)\n",
    "            pbar.update(len(chunk))\n",
    "        \n",
    "        pbar.close()\n",
    "        \n",
    "        # Combine all chunks\n",
    "        final_df = pd.concat(processed_chunks, ignore_index=True)\n",
    "        \n",
    "        # Save to CSV\n",
    "        final_df.to_csv(output_file, index=False, encoding='utf-8')\n",
    "        logging.info(f\"Successfully saved processed data to {output_file}\")\n",
    "        \n",
    "        return final_df\n",
    "        \n",
    "    except Exception as e:\n",
    "        logging.error(f\"Error in data processing: {e}\")\n",
    "        raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Execute the processing\n",
    "try:\n",
    "    result_df = process_data()\n",
    "    print(f\"\\nProcessing complete! File saved at: {output_file}\")\n",
    "    print(f\"\\nSample of processed data:\")\n",
    "    display(result_df.head())\n",
    "except Exception as e:\n",
    "    print(f\"An error occurred during processing: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Validation and Summary Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def display_summary_statistics(df):\n",
    "    \"\"\"Display summary statistics of the processed data\"\"\"\n",
    "    print(\"\\nSummary Statistics:\")\n",
    "    print(f\"Total number of records: {len(df)}\")\n",
    "    print(f\"Number of unique locations: {len(df.groupby(['lat', 'lon']))}\")\n",
    "    print(\"\\nColumn information:\")\n",
    "    print(df.info())\n",
    "\n",
    "try:\n",
    "    display_summary_statistics(result_df)\n",
    "except Exception as e:\n",
    "    print(f\"Error generating summary statistics: {e}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 }
}

NameError: name 'null' is not defined

In [12]:
pip install pandas geopandas shapely tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
# Import required libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from tqdm.notebook import tqdm
import os
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [14]:
# Define file paths
input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'
output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.csv'

# Verify input file exists
if not input_file.exists():
    raise FileNotFoundError(f"Input file not found at {input_file}")


In [15]:
def read_csv_in_chunks(file_path, chunk_size=1000):
    """Read CSV file in chunks with error handling"""
    try:
        # Get total number of lines for progress bar
        total_lines = sum(1 for _ in open(file_path, 'r', encoding='utf-8')) - 1
        
        chunks = pd.read_csv(
            file_path,
            chunksize=chunk_size,
            encoding='utf-8'
        )
        return chunks, total_lines
    except Exception as e:
        logging.error(f"Error reading CSV file: {e}")
        raise

def create_geometry(df):
    """Create geometry points from latitude and longitude"""
    try:
        geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]
        return geometry
    except Exception as e:
        logging.error(f"Error creating geometry: {e}")
        raise


In [16]:
def process_data():
    """Main processing function with progress tracking"""
    try:
        chunks, total_lines = read_csv_in_chunks(input_file)
        processed_chunks = []
        
        # Create progress bar
        pbar = tqdm(total=total_lines, desc="Processing rows")
        
        for chunk in chunks:
            # Validate lat/lon columns
            if 'lat' not in chunk.columns or 'lon' not in chunk.columns:
                raise ValueError("Required columns 'lat' and 'lon' not found in CSV")
            
            # Create GeoDataFrame
            geometry = create_geometry(chunk)
            gdf = gpd.GeoDataFrame(chunk, geometry=geometry, crs="EPSG:4326")
            
            # Convert geometry to WKT format for CSV storage
            gdf['geometry'] = gdf['geometry'].astype(str)
            
            processed_chunks.append(gdf)
            pbar.update(len(chunk))
        
        pbar.close()
        
        # Combine all chunks
        final_df = pd.concat(processed_chunks, ignore_index=True)
        
        # Save to CSV
        final_df.to_csv(output_file, index=False, encoding='utf-8')
        logging.info(f"Successfully saved processed data to {output_file}")
        
        return final_df
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise


In [17]:
# Execute the processing
try:
    result_df = process_data()
    print(f"\nProcessing complete! File saved at: {output_file}")
    print(f"\nSample of processed data:")
    display(result_df.head())
except Exception as e:
    print(f"An error occurred during processing: {e}")


2024-12-04 11:52:03,411 - ERROR - Error in data processing: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
Exception ignored in: <function tqdm.__del__ at 0x10ef83380>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


An error occurred during processing: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [18]:
# Import required libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from tqdm import tqdm  # Changed from tqdm.notebook
import os
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [19]:
def process_data():
    """Main processing function with progress tracking"""
    try:
        chunks, total_lines = read_csv_in_chunks(input_file)
        processed_chunks = []
        
        # Process chunks with simple progress counter
        for i, chunk in enumerate(chunks):
            # Validate lat/lon columns
            if 'lat' not in chunk.columns or 'lon' not in chunk.columns:
                raise ValueError("Required columns 'lat' and 'lon' not found in CSV")
            
            # Create GeoDataFrame
            geometry = create_geometry(chunk)
            gdf = gpd.GeoDataFrame(chunk, geometry=geometry, crs="EPSG:4326")
            
            # Convert geometry to WKT format for CSV storage
            gdf['geometry'] = gdf['geometry'].astype(str)
            
            processed_chunks.append(gdf)
            
            # Simple progress printing
            processed_rows = (i + 1) * 1000
            print(f"\rProcessing: {min(processed_rows, total_lines)}/{total_lines} rows", end="")
        
        print("\nCombining processed chunks...")
        
        # Combine all chunks
        final_df = pd.concat(processed_chunks, ignore_index=True)
        
        # Save to CSV
        print("Saving to CSV...")
        final_df.to_csv(output_file, index=False, encoding='utf-8')
        logging.info(f"Successfully saved processed data to {output_file}")
        
        return final_df
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise


In [21]:
# Execute the processing and display summary
try:
    # Process the data
    result_df = process_data()
    print(f"\nProcessing complete! File saved at: {output_file}")
    print(f"\nSample of processed data:")
    display(result_df.head())
    
    # Display summary statistics
    print("\nSummary Statistics:")
    print(f"Total number of records: {len(result_df)}")
    print(f"Number of unique locations: {len(result_df.groupby(['lat', 'lon']))}")
    print("\nColumn information:")
    print(result_df.info())
    
    # Additional useful statistics
    print("\nMemory usage:")
    print(result_df.memory_usage(deep=True).sum() / 1024**2, "MB")
    print("\nMissing values summary:")
    print(result_df.isnull().sum())

except Exception as e:
    print(f"An error occurred: {e}")


2024-12-04 11:53:57,012 - ERROR - Error in data processing: Error tokenizing data. C error: Expected 3 fields in line 4, saw 4



An error occurred: Error tokenizing data. C error: Expected 3 fields in line 4, saw 4



In [22]:
def inspect_csv_file(file_path):
    """Inspect the CSV file structure"""
    try:
        # Read first few lines to detect the format
        with open(file_path, 'r', encoding='utf-8') as f:
            print("First 5 lines of the file:")
            for i, line in enumerate(f):
                if i < 5:
                    print(f"Line {i+1}: {line.strip()}")
                    print(f"Number of fields: {len(line.strip().split(','))}")
                else:
                    break
                    
        # Try to detect delimiter
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
            if ';' in first_line:
                print("\nPossible semicolon delimiter detected")
            elif ',' in first_line:
                print("\nPossible comma delimiter detected")
                
        return pd.read_csv(file_path, nrows=1, encoding='utf-8').columns.tolist()
    
    except Exception as e:
        print(f"Error inspecting file: {e}")
        return None

print("Inspecting CSV file structure...")
columns = inspect_csv_file(input_file)
if columns:
    print("\nDetected columns:", columns)


Inspecting CSV file structure...
First 5 lines of the file:
Line 1: ﻿id;nom;desserte;pmr;ascenseur;escalator;gid;last_update;last_update_fme;adresse;localise_face_a_adresse;commune;insee;lon;lat
Number of fields: 1
Line 2: 46945;Fortunat - Indiennerie;J58:R;False;False;False;1953;2024-12-04 02:02:10+01:00;2024-12-04 03:20:12.248077+01:00;ROUTE DE SAINT FORTUNAT;False;Saint-Cyr-au-Mont-d'Or;69191;4,809544825633801;45,80757185491305
Number of fields: 3
Line 3: 73;Aqueducs de Beaunant;J12:A;False;False;False;2260;2024-12-04 02:02:10+01:00;2024-12-04 03:20:12.117408+01:00;133BIS AVENUE DE L AQUEDUC DE BEAUNANT;False;Sainte-Foy-lès-Lyon;69202;4,779963362674723;45,72432459823616
Number of fields: 3
Line 4: 48128;Le Four;J76:A,J76:R;False;False;False;2240;2024-12-04 02:02:10+01:00;2024-12-04 03:20:12.262558+01:00;1156 CHEMIN DU FOUR;False;Cailloux-sur-Fontaines;69033;4,876594070135879;45,856570219481455
Number of fields: 4
Line 5: 48271;Fontrobert;J133:A,J133:R,J3:A;False;False;False;2247;202

In [23]:
def read_csv_in_chunks(file_path, chunk_size=1000):
    """Read CSV file in chunks with error handling"""
    try:
        # Read with explicit semicolon delimiter
        chunks = pd.read_csv(
            file_path,
            chunksize=chunk_size,
            encoding='utf-8',
            delimiter=';',  # Explicit semicolon delimiter
            decimal=',',    # Handle European decimal format if present
            on_bad_lines='warn'
        )
        
        # Get total number of lines
        total_lines = sum(1 for _ in open(file_path, 'r', encoding='utf-8')) - 1
        
        return chunks, total_lines
    except Exception as e:
        logging.error(f"Error reading CSV file: {e}")
        raise

def create_geometry(df):
    """Create geometry points from latitude and longitude"""
    try:
        # Convert coordinates to float, handling potential string formatting
        df['lon'] = pd.to_numeric(df['lon'].str.replace(',', '.'), errors='coerce')
        df['lat'] = pd.to_numeric(df['lat'].str.replace(',', '.'), errors='coerce')
        
        # Create geometry points
        geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]
        return geometry
    except Exception as e:
        logging.error(f"Error creating geometry: {e}")
        raise

def validate_dataframe(df):
    """Validate dataframe structure and content"""
    required_columns = ['id', 'nom', 'lat', 'lon']
    missing_columns = [col for col in required_columns if col not in df.columns]
    
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")
    
    # Check for null values in critical columns
    null_counts = df[['lat', 'lon']].isnull().sum()
    if null_counts.any():
        logging.warning(f"Found null values in coordinates:\n{null_counts}")


In [24]:
def process_data():
    """Main processing function with progress tracking"""
    try:
        chunks, total_lines = read_csv_in_chunks(input_file)
        processed_chunks = []
        
        print(f"Processing {total_lines} rows...")
        for i, chunk in enumerate(chunks):
            # Validate chunk data
            validate_dataframe(chunk)
            
            # Create GeoDataFrame
            geometry = create_geometry(chunk)
            gdf = gpd.GeoDataFrame(chunk, geometry=geometry, crs="EPSG:4326")
            
            # Convert geometry to WKT format for CSV storage
            gdf['geometry'] = gdf['geometry'].astype(str)
            
            processed_chunks.append(gdf)
            
            # Progress
            processed_rows = (i + 1) * 1000
            print(f"\rProcessed: {min(processed_rows, total_lines)}/{total_lines} rows", end="")
        
        print("\nCombining processed chunks...")
        
        # Combine all chunks
        final_df = pd.concat(processed_chunks, ignore_index=True)
        
        # Save to CSV
        print("Saving to CSV...")
        final_df.to_csv(output_file, index=False, encoding='utf-8', sep=';')
        logging.info(f"Successfully saved processed data to {output_file}")
        
        return final_df
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise


In [25]:
# Execute the processing and display summary
try:
    # Process the data
    result_df = process_data()
    print(f"\nProcessing complete! File saved at: {output_file}")
    
    # Display sample and statistics
    print("\nSample of processed data:")
    display(result_df.head())
    
    print("\nSummary Statistics:")
    print(f"Total number of records: {len(result_df)}")
    print(f"Number of unique locations: {len(result_df.groupby(['lat', 'lon']))}")
    
    print("\nColumn information:")
    print(result_df.info())
    
    # Display coordinate ranges
    print("\nCoordinate Ranges:")
    print("Latitude range:", result_df['lat'].min(), "to", result_df['lat'].max())
    print("Longitude range:", result_df['lon'].min(), "to", result_df['lon'].max())
    
    # Check for any remaining issues
    print("\nMissing values summary:")
    print(result_df.isnull().sum())

except Exception as e:
    print(f"An error occurred: {e}")


2024-12-04 11:55:39,834 - ERROR - Error creating geometry: Can only use .str accessor with string values!
2024-12-04 11:55:39,835 - ERROR - Error in data processing: Can only use .str accessor with string values!


Processing 2143 rows...
An error occurred: Can only use .str accessor with string values!


In [26]:
# Import required libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from tqdm.notebook import tqdm
import os
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [27]:
# Define file paths
input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'
output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.csv'

# Verify input file exists
if not input_file.exists():
    raise FileNotFoundError(f"Input file not found at {input_file}")

In [28]:
def read_csv_in_chunks(file_path, chunk_size=1000):
    """Read CSV file in chunks with error handling"""
    try:
        # Read with explicit semicolon delimiter
        chunks = pd.read_csv(
            file_path,
            chunksize=chunk_size,
            encoding='utf-8',
            delimiter=';',  # Explicit semicolon delimiter
            decimal=',',    # Handle European decimal format if present
            on_bad_lines='warn'
        )
        
        # Get total number of lines
        total_lines = sum(1 for _ in open(file_path, 'r', encoding='utf-8')) - 1
        
        return chunks, total_lines
    except Exception as e:
        logging.error(f"Error reading CSV file: {e}")
        raise

def create_geometry(df):
    """Create geometry points from latitude and longitude"""
    try:
        # Convert coordinates to float, handling potential string formatting
        df['lon'] = pd.to_numeric(df['lon'].str.replace(',', '.'), errors='coerce')
        df['lat'] = pd.to_numeric(df['lat'].str.replace(',', '.'), errors='coerce')
        
        # Create geometry points
        geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]
        return geometry
    except Exception as e:
        logging.error(f"Error creating geometry: {e}")
        raise

def validate_dataframe(df):
    """Validate dataframe structure and content"""
    required_columns = ['id', 'nom', 'lat', 'lon']
    missing_columns = [col for col in required_columns if col not in df.columns]
    
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")
    
    # Check for null values in critical columns
    null_counts = df[['lat', 'lon']].isnull().sum()
    if null_counts.any():
        logging.warning(f"Found null values in coordinates:\n{null_counts}")


In [29]:
def process_data():
    """Main processing function with progress tracking"""
    try:
        chunks, total_lines = read_csv_in_chunks(input_file)
        processed_chunks = []
        
        print(f"Processing {total_lines} rows...")
        for i, chunk in enumerate(chunks):
            # Validate chunk data
            validate_dataframe(chunk)
            
            # Create GeoDataFrame
            geometry = create_geometry(chunk)
            gdf = gpd.GeoDataFrame(chunk, geometry=geometry, crs="EPSG:4326")
            
            # Convert geometry to WKT format for CSV storage
            gdf['geometry'] = gdf['geometry'].astype(str)
            
            processed_chunks.append(gdf)
            
            # Progress
            processed_rows = (i + 1) * 1000
            print(f"\rProcessed: {min(processed_rows, total_lines)}/{total_lines} rows", end="")
        
        print("\nCombining processed chunks...")
        
        # Combine all chunks
        final_df = pd.concat(processed_chunks, ignore_index=True)
        
        # Save to CSV
        print("Saving to CSV...")
        final_df.to_csv(output_file, index=False, encoding='utf-8', sep=';')
        logging.info(f"Successfully saved processed data to {output_file}")
        
        return final_df
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise


In [30]:
# Execute the processing and display summary
try:
    # Process the data
    result_df = process_data()
    print(f"\nProcessing complete! File saved at: {output_file}")
    
    # Display sample and statistics
    print("\nSample of processed data:")
    display(result_df.head())
    
    print("\nSummary Statistics:")
    print(f"Total number of records: {len(result_df)}")
    print(f"Number of unique locations: {len(result_df.groupby(['lat', 'lon']))}")
    
    print("\nColumn information:")
    print(result_df.info())
    
    # Display coordinate ranges
    print("\nCoordinate Ranges:")
    print("Latitude range:", result_df['lat'].min(), "to", result_df['lat'].max())
    print("Longitude range:", result_df['lon'].min(), "to", result_df['lon'].max())
    
    # Check for any remaining issues
    print("\nMissing values summary:")
    print(result_df.isnull().sum())

except Exception as e:
    print(f"An error occurred: {e}")


2024-12-04 11:57:04,004 - ERROR - Error creating geometry: Can only use .str accessor with string values!
2024-12-04 11:57:04,004 - ERROR - Error in data processing: Can only use .str accessor with string values!


Processing 2143 rows...
An error occurred: Can only use .str accessor with string values!


In [31]:
# Import required libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define file paths
input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'
output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.csv'

# Main processing function
def process_data():
    try:
        # Read the CSV file
        print("Reading CSV file...")
        df = pd.read_csv(input_file, delimiter=';', encoding='utf-8')
        
        print(f"Processing {len(df)} rows...")
        
        # Create geometry points
        geometry = [Point(x, y) for x, y in zip(df['lon'], df['lat'])]
        
        # Create GeoDataFrame
        gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
        
        # Convert geometry to WKT format for CSV storage
        gdf['geometry'] = gdf['geometry'].astype(str)
        
        # Save to CSV
        print("Saving to CSV...")
        gdf.to_csv(output_file, index=False, encoding='utf-8', sep=';')
        
        return gdf
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise

# Execute and display results
try:
    # Process the data
    print("Starting processing...")
    result_df = process_data()
    print(f"\nProcessing complete! File saved at: {output_file}")
    
    # Display sample
    print("\nFirst 5 rows of processed data:")
    display(result_df.head())
    
    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total records: {len(result_df)}")
    print(f"Unique locations: {len(result_df.groupby(['lat', 'lon']))}")
    
    print("\nColumns in dataset:")
    for col in result_df.columns:
        print(f"- {col}")
    
    print("\nCoordinate Ranges:")
    print(f"Latitude:  {result_df['lat'].min():.6f} to {result_df['lat'].max():.6f}")
    print(f"Longitude: {result_df['lon'].min():.6f} to {result_df['lon'].max():.6f}")
    
    print("\nMemory Usage:")
    print(f"{result_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    # Validate data quality
    print("\nMissing Values:")
    missing = result_df.isnull().sum()
    if missing.any():
        print(missing[missing > 0])
    else:
        print("No missing values found")

except Exception as e:
    print(f"An error occurred: {e}")


2024-12-04 11:57:41,770 - ERROR - Error in data processing: could not convert string to float: np.str_('4,809544825633801')


Starting processing...
Reading CSV file...
Processing 2143 rows...
An error occurred: could not convert string to float: np.str_('4,809544825633801')


In [32]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define file paths
input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'
output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.csv'

def validate_coordinates(df):
    """Validate coordinate data"""
    # Check if coordinates are present
    if 'lat' not in df.columns or 'lon' not in df.columns:
        raise ValueError("Missing coordinate columns (lat/lon)")
    
    # Convert to numeric if needed
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
    
    # Check for valid coordinate ranges
    valid_mask = (
        (df['lat'] >= -90) & (df['lat'] <= 90) &
        (df['lon'] >= -180) & (df['lon'] <= 180)
    )
    
    invalid_coords = (~valid_mask).sum()
    if invalid_coords > 0:
        logging.warning(f"Found {invalid_coords} rows with invalid coordinates")
    
    return df[valid_mask]

def process_data():
    try:
        # Read the CSV file
        print("Reading CSV file...")
        df = pd.read_csv(input_file, delimiter=';', encoding='utf-8')
        print(f"Found {len(df)} rows")
        
        # Validate and clean coordinates
        print("Validating coordinates...")
        df = validate_coordinates(df)
        print(f"Retained {len(df)} valid rows")
        
        # Create geometry points
        print("Creating geometry...")
        geometry = [Point(x, y) for x, y in zip(df['lon'], df['lat'])]
        
        # Create GeoDataFrame
        gdf = gpd.GeoDataFrame(
            df,
            geometry=geometry,
            crs="EPSG:4326"
        )
        
        # Convert geometry to WKT format for CSV storage
        gdf['geometry'] = gdf['geometry'].astype(str)
        
        # Save to CSV
        print("Saving to CSV...")
        gdf.to_csv(output_file, index=False, encoding='utf-8', sep=';')
        
        return gdf
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise

def analyze_results(df):
    """Analyze and display results"""
    print("\nDataset Analysis:")
    print("-" * 50)
    
    print("\nBasic Statistics:")
    print(f"Total records: {len(df)}")
    print(f"Unique locations: {len(df.groupby(['lat', 'lon']))}")
    
    print("\nCoordinate Ranges:")
    print(f"Latitude:  {df['lat'].min():.6f} to {df['lat'].max():.6f}")
    print(f"Longitude: {df['lon'].min():.6f} to {df['lon'].max():.6f}")
    
    print("\nColumns in dataset:")
    for col in df.columns:
        non_null = df[col].count()
        dtype = df[col].dtype
        print(f"- {col:<20} | Type: {dtype:<10} | Non-null: {non_null:>5}")
    
    print("\nMemory Usage:")
    memory_usage = df.memory_usage(deep=True).sum() / 1024**2
    print(f"{memory_usage:.2f} MB")
    
    return df

# Execute everything
try:
    print("Starting processing...")
    result_df = process_data()
    print(f"\nProcessing complete! File saved at: {output_file}")
    
    # Analyze results
    analyze_results(result_df)
    
    # Display sample
    print("\nSample of processed data:")
    display(result_df.head())

except Exception as e:
    print(f"An error occurred: {e}")
    raise




Starting processing...
Reading CSV file...
Found 2143 rows
Validating coordinates...
Retained 0 valid rows
Creating geometry...
Saving to CSV...

Processing complete! File saved at: /Users/katellguillou/Downloads/lyon_transport_stops_with_geometry.csv

Dataset Analysis:
--------------------------------------------------

Basic Statistics:
Total records: 0
Unique locations: 0

Coordinate Ranges:
Latitude:  nan to nan
Longitude: nan to nan

Columns in dataset:
An error occurred: unsupported format string passed to numpy.dtypes.Int64DType.__format__


TypeError: unsupported format string passed to numpy.dtypes.Int64DType.__format__

In [33]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define file paths
input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'
output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.csv'

def validate_coordinates(df):
    """Validate coordinate data"""
    # Convert to numeric if needed
    df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
    df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
    
    # Check for valid coordinate ranges for Lyon area
    valid_mask = (
        (df['lat'] >= 45.5) & (df['lat'] <= 46.0) &  # Lyon latitude range
        (df['lon'] >= 4.5) & (df['lon'] <= 5.0)      # Lyon longitude range
    )
    
    invalid_coords = (~valid_mask).sum()
    if invalid_coords > 0:
        logging.warning(f"Found {invalid_coords} rows outside Lyon area coordinates")
    
    return df

def process_data():
    try:
        # Read the CSV file
        print("Reading CSV file...")
        df = pd.read_csv(input_file, delimiter=';', encoding='utf-8')
        print(f"Found {len(df)} rows")
        
        # Validate and clean coordinates
        print("Validating coordinates...")
        df = validate_coordinates(df)
        print(f"Processing {len(df)} rows")
        
        # Create geometry points
        print("Creating geometry...")
        geometry = [Point(x, y) for x, y in zip(df['lon'], df['lat'])]
        
        # Create GeoDataFrame
        gdf = gpd.GeoDataFrame(
            df,
            geometry=geometry,
            crs="EPSG:4326"
        )
        
        # Convert geometry to WKT format for CSV storage
        gdf['geometry'] = gdf['geometry'].astype(str)
        
        # Save to CSV
        print("Saving to CSV...")
        gdf.to_csv(output_file, index=False, encoding='utf-8', sep=';')
        
        return gdf
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise

def analyze_results(df):
    """Analyze and display results"""
    print("\nDataset Analysis:")
    print("-" * 50)
    
    print("\nBasic Statistics:")
    print(f"Total records: {len(df)}")
    print(f"Unique locations: {len(df.groupby(['lat', 'lon']))}")
    
    print("\nCoordinate Ranges:")
    print(f"Latitude:  {df['lat'].min():.6f} to {df['lat'].max():.6f}")
    print(f"Longitude: {df['lon'].min():.6f} to {df['lon'].max():.6f}")
    
    print("\nColumns in dataset:")
    for col in df.columns:
        non_null = df[col].count()
        dtype = str(df[col].dtype)  # Convert dtype to string to avoid formatting issues
        print(f"- {col:<20} | Type: {dtype:<15} | Non-null: {non_null}")
    
    print("\nMemory Usage:")
    memory_usage = df.memory_usage(deep=True).sum() / 1024**2
    print(f"{memory_usage:.2f} MB")
    
    # Add coordinate distribution information
    print("\nCoordinate Distribution:")
    print("Latitude quartiles:")
    print(df['lat'].describe())
    print("\nLongitude quartiles:")
    print(df['lon'].describe())
    
    return df

# Execute everything
try:
    print("Starting processing...")
    result_df = process_data()
    print(f"\nProcessing complete! File saved at: {output_file}")
    
    # Analyze results
    analyze_results(result_df)
    
    # Display sample
    print("\nSample of processed data:")
    print(result_df.head().to_string())

except Exception as e:
    print(f"An error occurred: {e}")
    raise




Starting processing...
Reading CSV file...
Found 2143 rows
Validating coordinates...
Processing 2143 rows
Creating geometry...
Saving to CSV...

Processing complete! File saved at: /Users/katellguillou/Downloads/lyon_transport_stops_with_geometry.csv

Dataset Analysis:
--------------------------------------------------

Basic Statistics:
Total records: 2143
Unique locations: 1

Coordinate Ranges:
Latitude:  nan to nan
Longitude: nan to nan

Columns in dataset:
- id                   | Type: int64           | Non-null: 2143
- nom                  | Type: object          | Non-null: 2143
- desserte             | Type: object          | Non-null: 2143
- pmr                  | Type: bool            | Non-null: 2143
- ascenseur            | Type: bool            | Non-null: 2143
- escalator            | Type: bool            | Non-null: 2143
- gid                  | Type: int64           | Non-null: 2143
- last_update          | Type: object          | Non-null: 2143
- last_update_fme      

  gdf['geometry'] = gdf['geometry'].astype(str)


In [34]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import logging
import csv

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define file paths
input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'
output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.csv'

def read_and_clean_csv(file_path):
    """Read CSV file with proper quoting and escaping"""
    try:
        # First, try to read with pandas using different settings
        df = pd.read_csv(
            file_path,
            delimiter=';',
            encoding='utf-8',
            quoting=csv.QUOTE_ALL,  # Quote all fields
            escapechar='\\',        # Use backslash as escape character
            on_bad_lines='warn'     # Warn about problematic lines
        )
        return df
    except Exception as e:
        logging.error(f"Error reading CSV with pandas: {e}")
        
        # Fallback: manually read and clean the CSV
        try:
            rows = []
            with open(file_path, 'r', encoding='utf-8') as f:
                # Read header
                header = f.readline().strip().split(';')
                
                # Read and clean each line
                for line in f:
                    # Clean the line and split by delimiter
                    cleaned_line = line.strip().replace('""', '"').split(';')
                    if len(cleaned_line) == len(header):
                        rows.append(cleaned_line)
                    else:
                        logging.warning(f"Skipping malformed line: {line}")
            
            return pd.DataFrame(rows, columns=header)
        except Exception as e2:
            logging.error(f"Error in fallback CSV reading: {e2}")
            raise

def process_data():
    try:
        # Read and clean the CSV file
        print("Reading and cleaning CSV file...")
        df = read_and_clean_csv(input_file)
        print(f"Found {len(df)} rows")
        
        # Convert coordinates to numeric values
        df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
        df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
        
        # Create geometry points
        print("Creating geometry...")
        valid_coords = df['lat'].notna() & df['lon'].notna()
        geometry = [Point(x, y) for x, y in zip(df.loc[valid_coords, 'lon'], df.loc[valid_coords, 'lat'])]
        
        # Create GeoDataFrame
        gdf = gpd.GeoDataFrame(
            df[valid_coords],
            geometry=geometry,
            crs="EPSG:4326"
        )
        
        # Save to GeoJSON (better for geometric features)
        geojson_output = output_file.with_suffix('.geojson')
        print(f"Saving to GeoJSON: {geojson_output}")
        gdf.to_file(geojson_output, driver='GeoJSON')
        
        # Also save as CSV with WKT geometry
        print(f"Saving to CSV: {output_file}")
        gdf['geometry'] = gdf['geometry'].astype(str)
        gdf.to_csv(output_file, index=False, encoding='utf-8', sep=';', quoting=csv.QUOTE_ALL)
        
        return gdf
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise

def display_summary(gdf):
    """Display summary of the processed data"""
    print("\nProcessed Data Summary:")
    print("-" * 50)
    print(f"Total features: {len(gdf)}")
    print(f"Valid coordinates: {gdf.geometry.notna().sum()}")
    print("\nCoordinate bounds:")
    print(f"Latitude:  {gdf['lat'].min():.6f} to {gdf['lat'].max():.6f}")
    print(f"Longitude: {gdf['lon'].min():.6f} to {gdf['lon'].max():.6f}")
    
    print("\nOutput files created:")
    print(f"- CSV: {output_file}")
    print(f"- GeoJSON: {output_file.with_suffix('.geojson')}")

# Execute processing
try:
    print("Starting processing...")
    result_df = process_data()
    display_summary(result_df)
    
    print("\nSample of processed data:")
    print(result_df.head().to_string())

except Exception as e:
    print(f"An error occurred: {e}")
    raise


Starting processing...
Reading and cleaning CSV file...
Found 2143 rows
Creating geometry...
Saving to GeoJSON: /Users/katellguillou/Downloads/lyon_transport_stops_with_geometry.geojson


2024-12-04 12:01:08,809 - INFO - Created 0 records


Saving to CSV: /Users/katellguillou/Downloads/lyon_transport_stops_with_geometry.csv

Processed Data Summary:
--------------------------------------------------
Total features: 0
Valid coordinates: 0

Coordinate bounds:
Latitude:  nan to nan
Longitude: nan to nan

Output files created:
- CSV: /Users/katellguillou/Downloads/lyon_transport_stops_with_geometry.csv
- GeoJSON: /Users/katellguillou/Downloads/lyon_transport_stops_with_geometry.geojson

Sample of processed data:
Empty GeoDataFrame
Columns: [id, nom, desserte, pmr, ascenseur, escalator, gid, last_update, last_update_fme, adresse, localise_face_a_adresse, commune, insee, lon, lat, geometry]
Index: []


In [35]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import logging
import json

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define file paths
input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'
output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.geojson'

def create_geojson(df):
    """Create a properly structured GeoJSON from DataFrame"""
    features = []
    
    for idx, row in df.iterrows():
        # Create feature properties from all columns except lat/lon
        properties = row.drop(['lat', 'lon']).to_dict()
        
        # Create the feature
        feature = {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [float(row['lon']), float(row['lat'])]
            },
            "properties": properties
        }
        features.append(feature)
    
    # Create the GeoJSON structure
    geojson = {
        "type": "FeatureCollection",
        "features": features
    }
    
    return geojson

def process_data():
    try:
        # Read CSV file
        print("Reading CSV file...")
        df = pd.read_csv(
            input_file,
            delimiter=';',
            encoding='utf-8'
        )
        print(f"Found {len(df)} rows")
        
        # Convert coordinates to numeric
        df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
        df['lon'] = pd.to_numeric(df['lon'], errors='coerce')
        
        # Remove rows with invalid coordinates
        valid_coords = df['lat'].notna() & df['lon'].notna()
        df = df[valid_coords]
        
        print(f"Processing {len(df)} valid coordinates...")
        
        # Create GeoJSON
        geojson_data = create_geojson(df)
        
        # Validate GeoJSON structure
        if len(geojson_data['features']) == 0:
            raise ValueError("No valid features created")
        
        # Save GeoJSON
        print(f"Saving to GeoJSON: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(geojson_data, f, ensure_ascii=False, indent=2)
        
        # Create GeoDataFrame for analysis
        geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]
        gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
        
        return gdf, geojson_data
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise

def validate_geojson(geojson_data):
    """Validate GeoJSON structure and content"""
    print("\nGeoJSON Validation:")
    print("-" * 50)
    print(f"Type: {geojson_data['type']}")
    print(f"Number of features: {len(geojson_data['features'])}")
    
    if len(geojson_data['features']) > 0:
        sample_feature = geojson_data['features'][0]
        print("\nSample feature structure:")
        print(f"- Feature type: {sample_feature['type']}")
        print(f"- Geometry type: {sample_feature['geometry']['type']}")
        print(f"- Number of properties: {len(sample_feature['properties'])}")
        print("\nProperty names:")
        print(", ".join(sample_feature['properties'].keys()))

# Execute processing
try:
    print("Starting processing...")
    gdf, geojson_data = process_data()
    
    # Validate and display results
    validate_geojson(geojson_data)
    
    print("\nCoordinate Ranges:")
    print(f"Latitude:  {gdf['lat'].min():.6f} to {gdf['lat'].max():.6f}")
    print(f"Longitude: {gdf['lon'].min():.6f} to {gdf['lon'].max():.6f}")
    
    print(f"\nOutput file created: {output_file}")
    print(f"File size: {output_file.stat().st_size / 1024:.1f} KB")

except Exception as e:
    print(f"An error occurred: {e}")
    raise


2024-12-04 12:02:11,475 - ERROR - Error in data processing: No valid features created


Starting processing...
Reading CSV file...
Found 2143 rows
Processing 0 valid coordinates...
An error occurred: No valid features created


ValueError: No valid features created

In [37]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import logging
import json

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define file paths
input_file = Path.home() / 'Downloads' / 'points-arret-lignes-scolaires-reseau-transports-commun-lyonnais.csv'
output_file = Path.home() / 'Downloads' / 'lyon_transport_stops_with_geometry.geojson'

def inspect_coordinates(df):
    """Inspect coordinate data for debugging"""
    print("\nCoordinate Data Inspection:")
    print("-" * 50)
    print("\nFirst 5 raw coordinate values:")
    print("Latitude values:", df['lat'].head().tolist())
    print("Longitude values:", df['lon'].head().tolist())
    print("\nData types:")
    print("Latitude:", df['lat'].dtype)
    print("Longitude:", df['lon'].dtype)

def clean_coordinates(value):
    """Clean coordinate string values"""
    if pd.isna(value):
        return None
    try:
        # Handle various string formats
        if isinstance(value, str):
            # Replace comma with dot for decimal
            value = value.replace(',', '.')
            # Remove any surrounding whitespace
            value = value.strip()
        return float(value)
    except (ValueError, TypeError):
        return None

def process_data():
    try:
        # Read CSV file
        print("Reading CSV file...")
        df = pd.read_csv(
            input_file,
            delimiter=';',
            encoding='utf-8'
        )
        print(f"Found {len(df)} rows")
        
        # Inspect raw data
        inspect_coordinates(df)
        
        # Clean and convert coordinates
        print("\nCleaning coordinates...")
        df['lat'] = df['lat'].apply(clean_coordinates)
        df['lon'] = df['lon'].apply(clean_coordinates)
        
        # Remove rows with invalid coordinates
        valid_coords = df['lat'].notna() & df['lon'].notna()
        df = df[valid_coords]
        
        print(f"Valid coordinates after cleaning: {len(df)}")
        
        if len(df) == 0:
            raise ValueError("No valid coordinates found after cleaning")
        
        # Create features list
        features = []
        for _, row in df.iterrows():
            try:
                # Create feature properties (excluding lat/lon)
                properties = {
                    col: row[col] 
                    for col in df.columns 
                    if col not in ['lat', 'lon']
                }
                
                # Create feature
                feature = {
                    "type": "Feature",
                    "geometry": {
                        "type": "Point",
                        "coordinates": [float(row['lon']), float(row['lat'])]
                    },
                    "properties": properties
                }
                features.append(feature)
            except Exception as e:
                logging.warning(f"Error creating feature: {e}")
                continue
        
        # Create GeoJSON structure
        geojson_data = {
            "type": "FeatureCollection",
            "features": features
        }
        
        # Validate features
        if len(features) == 0:
            raise ValueError("No valid features created")
        
        print(f"Successfully created {len(features)} features")
        
        # Save GeoJSON
        print(f"Saving to GeoJSON: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(geojson_data, f, ensure_ascii=False, indent=2)
        
        # Create GeoDataFrame for analysis
        geometry = [Point(xy) for xy in zip(df['lon'], df['lat'])]
        gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")
        
        return gdf, geojson_data
        
    except Exception as e:
        logging.error(f"Error in data processing: {e}")
        raise

def validate_geojson(geojson_data):
    """Validate GeoJSON structure and content"""
    print("\nGeoJSON Validation:")
    print("-" * 50)
    print(f"Type: {geojson_data['type']}")
    print(f"Number of features: {len(geojson_data['features'])}")
    
    if len(geojson_data['features']) > 0:
        sample_feature = geojson_data['features'][0]
        print("\nSample feature structure:")
        print(f"- Feature type: {sample_feature['type']}")
        print(f"- Geometry type: {sample_feature['geometry']['type']}")
        print(f"- Sample coordinates: {sample_feature['geometry']['coordinates']}")
        print(f"- Number of properties: {len(sample_feature['properties'])}")
        print("\nProperty names:")
        print(", ".join(sample_feature['properties'].keys()))

# Execute processing
try:
    print("Starting processing...")
    gdf, geojson_data = process_data()
    
    # Validate and display results
    validate_geojson(geojson_data)
    
    print("\nCoordinate Ranges:")
    print(f"Latitude:  {gdf['lat'].min():.6f} to {gdf['lat'].max():.6f}")
    print(f"Longitude: {gdf['lon'].min():.6f} to {gdf['lon'].max():.6f}")
    
    print(f"\nOutput file created: {output_file}")
    print(f"File size: {output_file.stat().st_size / 1024:.1f} KB")

except Exception as e:
    print(f"An error occurred: {e}")
    raise


Starting processing...
Reading CSV file...
Found 2143 rows

Coordinate Data Inspection:
--------------------------------------------------

First 5 raw coordinate values:
Latitude values: ['45,80757185491305', '45,72432459823616', '45,856570219481455', '45,675036422408105', '45,70571954877312']
Longitude values: ['4,809544825633801', '4,779963362674723', '4,876594070135879', '4,94924410028283', '4,865578537308746']

Data types:
Latitude: object
Longitude: object

Cleaning coordinates...
Valid coordinates after cleaning: 2143
Successfully created 2143 features
Saving to GeoJSON: /Users/katellguillou/Downloads/lyon_transport_stops_with_geometry.geojson

GeoJSON Validation:
--------------------------------------------------
Type: FeatureCollection
Number of features: 2143

Sample feature structure:
- Feature type: Feature
- Geometry type: Point
- Sample coordinates: [4.809544825633801, 45.80757185491305]
- Number of properties: 13

Property names:
id, nom, desserte, pmr, ascenseur, escala