In [1]:
import zipfile
import os
import geopandas as gpd

# Where the PTV archive lives and the directory it gets unpacked into
zip_file_path = '/home/Daniel Bi/project two/data/landing/PTV.zip'
extract_dir = '/home/Daniel Bi/project two/data/landing/PTV_train_station'

# Step 1: Unpack every member of the archive into the target directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print(f"Files extracted to: {extract_dir}")

# Step 2: Show what landed in the 'PTV' subdirectory so the shapefile name can be checked
ptv_subdir = os.path.join(extract_dir, 'PTV')
extracted_files = os.listdir(ptv_subdir)
print("Extracted files in PTV folder:", extracted_files)

# Step 3: Keep only the entries ending in '.shp'; the first match is the one loaded
shapefile_name = [entry for entry in extracted_files if entry.endswith('.shp')]
if not shapefile_name:
    print("No shapefile found in the 'PTV' folder.")
else:
    shapefile_path = os.path.join(ptv_subdir, shapefile_name[0])
    print(f"Shapefile found: {shapefile_path}")

    # Step 4: Read the shapefile into a GeoDataFrame and preview the first rows.
    # A read failure is reported but does not stop the cell.
    try:
        train_station_gdf = gpd.read_file(shapefile_path)
        print("Train station shapefile loaded successfully.")
        print(train_station_gdf.head())
    except Exception as exc:
        print(f"Error loading train station shapefile: {exc}")

Files extracted to: /home/Daniel Bi/project two/data/landing/PTV_train_station
Extracted files in PTV folder: ['ptv_metro_train_station_column_names.txt', 'PTV_METRO_TRAIN_STATION.prj', 'PTV_METRO_TRAIN_STATION.dbf', 'PTV_METRO_TRAIN_STATION.shp', 'PTV_METRO_TRAIN_STATION.shx', 'PTV_METRO_TRAIN_STATION.cpg']
Shapefile found: /home/Daniel Bi/project two/data/landing/PTV_train_station/PTV/PTV_METRO_TRAIN_STATION.shp
Train station shapefile loaded successfully.
  STOP_ID   LATITUDE                                          STOP_NAME  \
0   19970 -37.781193             Royal Park Railway Station (Parkville)   
1   19971 -37.788140  Flemington Bridge Railway Station (North Melbo...   
2   19972 -37.794267         Macaulay Railway Station (North Melbourne)   
3   19973 -37.807419   North Melbourne Railway Station (West Melbourne)   
4   19974 -37.788657        Clifton Hill Railway Station (Clifton Hill)   

    LONGITUDE TICKETZONE                                          ROUTEUSSP  \
0  144.

Geocode rental addresses to latitude and longitude

In [None]:
from geopy.geocoders import Nominatim
import pandas as pd
import time
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Location of the scraped rental listings CSV
rental_data_path = '/home/Daniel Bi/project two/data/landing/rental_scrape.csv'

# Step 1: Read the rental listings into a DataFrame
rental_df = pd.read_csv(rental_data_path)

# Step 2: Create the Nominatim geocoder (identified by user agent, timeout=10)
geolocator = Nominatim(
    user_agent="rental_geocoder",
    timeout=10,
)

# Build a requests session whose HTTP and HTTPS adapters retry up to 5 times,
# with backoff, on the listed status codes.
# NOTE(review): nothing visible in this notebook hands `session` to `geolocator`,
# so these retry settings may not affect geocoding at all - confirm.
session = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retries)
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)
# Step 3: Function to geocode one address, pacing requests to the geocoding service
def geocode_address(address, min_delay_seconds=1.0):
    """Geocode a single address with the module-level ``geolocator``.

    Parameters
    ----------
    address : str
        Free-text address to look up. ``None``, NaN and blank strings are
        treated as "no address" and are not sent to the service.
    min_delay_seconds : float, default 1.0
        Pause applied after every request (successful or not) so consecutive
        calls are spaced out. Sleeping once after the whole batch does not
        throttle anything, so the delay lives here, next to the request.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` when a match is found, otherwise
        ``(None, None)``. Errors are printed and also yield ``(None, None)``,
        so one bad address does not abort a whole batch.
    """
    # Missing values would otherwise be submitted as a literal query.
    is_nan = isinstance(address, float) and address != address
    is_blank = isinstance(address, str) and not address.strip()
    if address is None or is_nan or is_blank:
        return (None, None)

    try:
        location = geolocator.geocode(address, timeout=10)
        if location:
            return (location.latitude, location.longitude)
        return (None, None)
    except Exception as e:
        print(f"Error geocoding {address}: {e}")
        return (None, None)
    finally:
        # Runs on every path that issued a request, including failures.
        time.sleep(min_delay_seconds)

# Step 4: Geocode each distinct address once, then map the result back onto every row.
# Listings that share an address reuse the same lookup instead of repeating the request,
# and missing addresses get (None, None) without any request being made.
unique_addresses = rental_df['Address'].dropna().unique()
coordinates_by_address = {address: geocode_address(address) for address in unique_addresses}
rental_df['coordinates'] = rental_df['Address'].map(
    lambda address: coordinates_by_address.get(address, (None, None))
)

# Step 5: Split the (latitude, longitude) tuples into separate columns
rental_df['latitude'] = rental_df['coordinates'].apply(lambda x: x[0])
rental_df['longitude'] = rental_df['coordinates'].apply(lambda x: x[1])

# Step 6: Save the DataFrame with geocoded coordinates to a new CSV
rental_df.to_csv('/home/Daniel Bi/project two/data/landing/rental_with_coordinates.csv', index=False)

# Optional: Display the first few rows to verify
print(rental_df[['Address', 'latitude', 'longitude']].head())

# NOTE: request throttling has to happen per request (inside geocode_address).
# A single sleep here, after every request has already been sent, throttles nothing,
# so none is performed.

Calculate the distance to the closest train station

In [3]:
import googlemaps
import pandas as pd
import numpy as np
from geopy.distance import geodesic
import geopandas as gpd
import zipfile
import os

# Initialize the Google Maps API client.
# The key is read from the GOOGLE_MAPS_API_KEY environment variable so the secret
# never lives in the notebook, its outputs, or version control.
# NOTE: any key that was previously committed in this notebook should be treated
# as leaked and revoked/rotated in the Google Cloud console.
google_maps_api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not google_maps_api_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable before running this cell.")
gmaps = googlemaps.Client(key=google_maps_api_key)

# File paths
rental_data_path = '/home/Daniel Bi/project two/data/landing/rental_with_cbd_distances.csv'
zip_file_path = '/home/Daniel Bi/project two/data/landing/PTV.zip'
extract_dir = '/home/Daniel Bi/project two/data/landing/PTV_train_station'

# Step 1: Extract the ZIP file containing the train station shapefile
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
    print(f"Files extracted to: {extract_dir}")

# Step 2: Locate the shapefile (.shp) among the extracted files
ptv_subdir = os.path.join(extract_dir, 'PTV')
shapefile_name = [file for file in os.listdir(ptv_subdir) if file.endswith('.shp')]

# Fail fast: everything below needs the station data. Continuing after a failure
# would either raise a NameError later or silently reuse a `train_station_gdf`
# left in the kernel by an earlier cell.
if not shapefile_name:
    raise FileNotFoundError("No shapefile found in the 'PTV' folder.")

shapefile_path = os.path.join(ptv_subdir, shapefile_name[0])
print(f"Shapefile found: {shapefile_path}")

# Step 3: Load the train station shapefile and reproject to WGS84 (EPSG:4326).
# Read and reproject in a single assignment so the name is never bound to a
# frame that was loaded but not reprojected.
try:
    train_station_gdf = gpd.read_file(shapefile_path).to_crs(epsg=4326)
except Exception as e:
    print(f"Error loading train station shapefile: {e}")
    raise
print("Train station shapefile loaded and reprojected successfully.")

# Step 4: Prepare train station data for distance calculation.
# Latitude/longitude are taken from the point geometry's y/x coordinates.
train_station_df = train_station_gdf[['STOP_NAME', 'geometry']].copy()
train_station_df['latitude'] = train_station_df['geometry'].y
train_station_df['longitude'] = train_station_df['geometry'].x
train_station_df_clean = train_station_df[['STOP_NAME', 'latitude', 'longitude']].dropna()

# Step 5: Load the rental data CSV (with CBD distances already calculated)
rental_df = pd.read_csv(rental_data_path)

# Filter out rows where latitude or longitude is NaN for rental data
rental_df_clean = rental_df.dropna(subset=['latitude', 'longitude']).copy()

# Create the result columns only when they are missing. Progress is checkpointed
# back to `rental_data_path`, so on a re-run these columns may already hold results;
# resetting them unconditionally would discard that work and repeat paid API calls.
for result_column in (
    'closest_train_station',
    'straight_line_distance_km',
    'route_distance_to_closest_train_km',
):
    if result_column not in rental_df_clean.columns:
        rental_df_clean[result_column] = None
# Step 6: Find the nearest station by straight-line (geodesic) distance
def calculate_closest_station(property_coords, stations_df):
    """Return the station nearest to a property, with its distance in kilometres.

    Parameters
    ----------
    property_coords : tuple
        ``(latitude, longitude)`` of the property.
    stations_df : DataFrame
        Must provide ``STOP_NAME``, ``latitude`` and ``longitude`` columns.

    Returns
    -------
    tuple
        ``(station_name, distance_km)``, where the distance comes from geopy's
        ``geodesic``. When several stations are equally close the first one in
        ``stations_df`` wins. An empty ``stations_df`` yields ``(None, inf)``.
    """
    best_name, best_km = None, float('inf')

    station_columns = zip(
        stations_df['STOP_NAME'],
        stations_df['latitude'],
        stations_df['longitude'],
    )
    for name, lat, lon in station_columns:
        km = geodesic(property_coords, (lat, lon)).kilometers
        # Strict comparison keeps the earliest station on ties
        if km < best_km:
            best_name, best_km = name, km

    return best_name, best_km

# Step 7: Driving route distance via the Google Maps Distance Matrix API
def calculate_route_distance(property_coords, station_coords, gmaps_client):
    """Return the driving distance in kilometres from a property to a station.

    Parameters
    ----------
    property_coords : coordinate pair used as the single origin.
    station_coords : coordinate pair used as the single destination.
    gmaps_client : object exposing ``distance_matrix(origins, destinations, mode)``.

    Returns
    -------
    float or None
        The reported distance value divided by 1000, or ``None`` when the
        element status is not ``'OK'`` or when any exception is raised
        (both cases are printed rather than propagated).
    """
    try:
        response = gmaps_client.distance_matrix(
            origins=[property_coords],
            destinations=[station_coords],
            mode="driving",
        )

        # One origin and one destination, so only the first element matters
        element = response['rows'][0]['elements'][0]
        if element['status'] != 'OK':
            print(f"No valid route distance found for {property_coords} to station: {element['status']}")
            return None

        metres = element['distance']['value']
        return metres / 1000
    except Exception as exc:
        print(f"Error calculating route distance for {property_coords}: {exc}")
        return None

# Step 8: Process each property to calculate the closest train station and route distance
rental_df_clean['coordinates'] = list(zip(rental_df_clean['latitude'], rental_df_clean['longitude']))

# Build the name -> (latitude, longitude) lookup once instead of filtering the
# station table for every property. setdefault keeps the first row for a name,
# matching a "first match" lookup on the table.
station_coords_by_name = {}
for station_name, station_lat, station_lon in zip(
    train_station_df_clean['STOP_NAME'],
    train_station_df_clean['latitude'],
    train_station_df_clean['longitude'],
):
    station_coords_by_name.setdefault(station_name, (station_lat, station_lon))

# NOTE(review): checkpoints and the final save overwrite `rental_data_path`, the same
# file the rentals were loaded from, with the NaN-coordinate rows already dropped.
# Confirm that overwriting the input is intended.
# `position` counts rows actually visited. The index label `idx` has gaps after
# dropna, so it cannot be used to count progress or to schedule checkpoints.
for position, (idx, row) in enumerate(rental_df_clean.iterrows(), start=1):
    if pd.isnull(row['route_distance_to_closest_train_km']):  # Only process rows with null distances
        coords = row['coordinates']

        # Step 8a: Find the closest train station using straight-line distance
        closest_station, straight_line_distance = calculate_closest_station(coords, train_station_df_clean)
        rental_df_clean.loc[idx, 'closest_train_station'] = closest_station
        rental_df_clean.loc[idx, 'straight_line_distance_km'] = straight_line_distance

        # Get the coordinates of the closest station
        station_coords = station_coords_by_name[closest_station]

        # Step 8b: Calculate the driving route distance to the closest train station using Google Maps API
        route_distance = calculate_route_distance(coords, station_coords, gmaps)
        rental_df_clean.loc[idx, 'route_distance_to_closest_train_km'] = route_distance

    # Save progress every 100 rows
    if position % 100 == 0:
        print(f"Processed {position} rows, saving progress...")
        rental_df_clean.to_csv(rental_data_path, index=False)

# Final save after processing all data
rental_df_clean.to_csv(rental_data_path, index=False)
print("Processing completed.")



Files extracted to: /home/Daniel Bi/project two/data/landing/PTV_train_station
Shapefile found: /home/Daniel Bi/project two/data/landing/PTV_train_station/PTV/PTV_METRO_TRAIN_STATION.shp
Train station shapefile loaded and reprojected successfully.
Processed 100 rows, saving progress...
Processed 200 rows, saving progress...
Processed 300 rows, saving progress...
Processed 400 rows, saving progress...
Processed 500 rows, saving progress...
Processed 600 rows, saving progress...
Processed 700 rows, saving progress...
Processed 800 rows, saving progress...
Processed 900 rows, saving progress...
Processed 1000 rows, saving progress...
Processed 1100 rows, saving progress...
Processed 1200 rows, saving progress...
Processed 1300 rows, saving progress...
Processed 1400 rows, saving progress...
Processed 1500 rows, saving progress...
Processed 1600 rows, saving progress...
Processed 1700 rows, saving progress...
Processed 1800 rows, saving progress...
Processed 1900 rows, saving progress...
P