Commit bdb3361

Merge 8cfd313 into 6a4be1f
maximearmstrong authored Aug 18, 2022
2 parents 6a4be1f + 8cfd313 commit bdb3361
Showing 4 changed files with 82 additions and 42 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/acceptance_test.yml
@@ -147,7 +147,7 @@ jobs:
       - name: Run validators on queued URLs
         run: |
           queue="${{ matrix.data }}"
-          bash ./scripts/queue_runner.sh $queue
+          bash ./scripts/queue_runner.sh true $queue
         env:
           OUTPUT_BASE: ${{ github.sha }}
       - name: Persist reports
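For reference, the runner can be exercised the same way outside CI. A minimal sketch, assuming the id/url fields that queue_runner.sh reads with jq; the queue item shown is hypothetical (real items are produced by the Mobility Database harvester wired up in end_to_end.yml below):

    # Hypothetical queue item; real items come from harvest_latest_versions.py.
    queue='{"id":"us-example-transit-001","url":"https://example.com/gtfs.zip"}'
    export OUTPUT_BASE=local-test
    # First argument "true": also run the master (reference) jar for comparison.
    bash ./scripts/queue_runner.sh true $queue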
80 changes: 47 additions & 33 deletions .github/workflows/end_to_end.yml
@@ -33,8 +33,8 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - uses: gradle/wrapper-validation-action@v1
-  run-on-data:
-    needs: [ validate_gradle_wrapper ]
+  pack-snapshot:
+    needs: validate_gradle_wrapper
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v1
@@ -53,39 +53,53 @@
         uses: eskatos/gradle-command-action@v1
         with:
           arguments: shadowJar
-      #- name: Validate dataset from -- ACRONYM #<-- uncomment this line, replace ACRONYM by the name of the agency/publisher acronym
-      # run: java -jar cli/build/libs/gtfs-validator-*.jar --url DATASET_PUBLIC_URL --input [[[ACRONYM]]].zip --extract pathToExtractedZipContent --output validationResultDirectory #<-- uncomment this line,
-      #replace ACRONYM and [[[ACRONYM]]] by the agency/publisher acronym. Also replace DATASET_PUBLIC_URL by a public link to a GTFS Schedule zip archive
-      - name: Validate dataset from -- Greater Sydney
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url https://openmobilitydata.org/p/transport-for-nsw/237/latest/download --output_base output --country_code au --storage_directory gs.zip
-      - name: Validate dataset from -- SMART
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url http://transitfeeds.com/p/sonoma-marin-area-rail-transit/1050/20200930/download --output_base output --country_code us --storage_directory smart.zip
-      - name: Validate dataset from -- STM
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url https://openmobilitydata.org/p/societe-de-transport-de-montreal/39/latest/download --output_base output --country_code ca --storage_directory stm.zip
-      - name: Validate dataset from -- MBTA
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url https://cdn.mbta.com/MBTA_GTFS.zip --output_base output --country_code us --storage_directory mbta.zip
-      - name: Validate dataset from issue 379 -- Bay Area Rapid Transit
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url http://www.bart.gov/dev/schedules/google_transit.zip --output_base output --country_code us --storage_directory bart.zip
-      - name: Validate dataset from issue 399 -- Monterey-Salinas Transit
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url http://www.mst.org/google/google_transit.zip --output_base output --country_code us --storage_directory mst.zip
-      - name: Validate dataset from issue 398 -- Orange County Transportation Authority
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url https://octa.net/current/google_transit.zip --output_base output --country_code us --storage_directory octa.zip
-      - name: Validate dataset from issue 400 -- Siskiyou Transit and General Express
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url http://transitfeeds.com/p/siskiyou-transit-and-general-express/492/latest/download --output_base output --country_code us --storage_directory siskiyou.zip
-      - name: Validate dataset from -- AMT (Genova, Italy)
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url http://transitfeeds.com/p/amt-genova/1011/latest/download --output_base output --country_code it --storage_directory amtgenova.zip
-      - name: Validate dataset from -- Bibus (Brest, France)
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url http://transitfeeds.com/p/bibus/593/latest/download --output_base output --country_code fr --storage_directory bibus.zip
-      - name: Validate dataset from -- Metro (Christchurch, New Zealand)
-        run: java -jar cli/build/libs/gtfs-validator-*.jar --url http://transitfeeds.com/p/christchurch-metro/41/latest/download --output_base output --country_code nz --storage_directory metro.zip
-      #see https://github.com/MobilityData/gtfs-validator/pull/712#issuecomment-776110813
-      - name: Persist datasets
+      - name: Persist gtfs-validator snapshot jar
         uses: actions/upload-artifact@v2
         with:
+          name: gtfs-validator-snapshot
+          path: cli/build/libs/gtfs-validator-*-cli.jar
+  fetch-urls:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository code
+        uses: actions/checkout@v1
+      - name: Install dependencies
+        run: |
+          pip install -r scripts/mobility-database-harvester/requirements.txt
+      - name: Set URL matrix
+        id: set-matrix
+        run: |
+          DATASETS=$(python3 scripts/mobility-database-harvester/harvest_latest_versions.py -d scripts/mobility-database-harvester/datasets_metadata -l gtfs_latest_versions.json -s)
+          echo $DATASETS
+          echo "::set-output name=matrix::$DATASETS"
+      - name: Persist metadata
+        if: always()
+        uses: actions/upload-artifact@v2
+        with:
-          name: dataset_all
-          path: ./*.zip
+          name: datasets_metadata
+          path: scripts/mobility-database-harvester/datasets_metadata
+  outputs:
+    matrix: ${{ steps.set-matrix.outputs.matrix }}
+  run-on-data:
+    needs: [ fetch-urls, pack-snapshot ]
+    runs-on: ubuntu-latest
+    strategy:
+      matrix: ${{ fromJson(needs.fetch-urls.outputs.matrix) }}
+    steps:
+      - uses: actions/checkout@v1
+      - name: Download latest changes .jar file from previous job
+        uses: actions/download-artifact@v2
+        with:
+          name: gtfs-validator-snapshot
+          path: gtfs-validator-snapshot
+      - name: Run snapshot validator on queued URLs
+        run: |
+          queue="${{ matrix.data }}"
+          bash ./scripts/queue_runner.sh false $queue
+        env:
+          OUTPUT_BASE: ${{ github.sha }}
       - name: Persist reports
         uses: actions/upload-artifact@v2
         with:
-          name: validation_report_all
-          path: output
+          name: reports_snapshot
+          path: ${{ github.sha }}/output
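The end-to-end workflow is now split into three jobs: pack-snapshot builds the snapshot jar and uploads it as an artifact, fetch-urls harvests dataset URLs from the Mobility Database and publishes them as a job matrix, and run-on-data fans out one runner per matrix entry. The workflow itself only requires the harvester output to be a JSON object with a top-level "data" list, since each runner reads ${{ matrix.data }}. A minimal sketch of the payload, with hypothetical per-item fields:

    # Hypothetical $DATASETS payload; each "data" entry is one space-separated
    # queue of JSON items that queue_runner.sh splits back apart.
    DATASETS='{"data":["{\"id\":\"src-1\",\"url\":\"https://example.com/a.zip\"} {\"id\":\"src-2\",\"url\":\"https://example.com/b.zip\"}"]}'
    echo "::set-output name=matrix::$DATASETS"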
34 changes: 28 additions & 6 deletions scripts/mobility-database-harvester/harvest_latest_versions.py
@@ -16,6 +16,7 @@
 import numpy as np
 import os
 from os import path
+import random
 
 #####################################################################################
 # This script harvests the latest dataset versions on the Mobility Database Catalogs.
@@ -27,6 +28,8 @@
 LATEST_URL = "urls.latest"
 DATA_TYPE = "data_type"
 GTFS = "gtfs"
+AUTHENTICATION_TYPE = "urls.authentication_type"
+MDB_SOURCE_ID = "mdb_source_id"
 
 # Sources to exclude because they are too big for the workflow.
 SOURCES_TO_EXCLUDE = ["de-unknown-rursee-schifffahrt-kg-gtfs-784"]
@@ -44,6 +47,10 @@
 DATA = "data"
 ID = "id"
 
+# Sampling constants
+
+sampling_ratio = 0.05
+
 
 def save_content_to_file(content, data_path, filename):
     """Saves content to JSON file.
@@ -56,22 +63,30 @@ def save_content_to_file(content, data_path, filename):
         json.dump(content, f)
 
 
-def harvest_latest_versions():
+def harvest_latest_versions(to_sample):
     """Harvests the latest URLs from the Mobility Database catalogs.
+    :param to_sample: Boolean flag. Sample the sources in the CSV if True.
     :return: The dictionary of the latest URLs with the format {Name: Url}.
     """
     catalogs = pd.read_csv(CATALOGS_CSV)
     latest_versions = {}
 
-    latest_urls = catalogs[LATEST_URL].loc[catalogs[DATA_TYPE] == GTFS]
-    for index, value in latest_urls.items():
-        latest_url = value
+    catalogs_gtfs = catalogs[catalogs[DATA_TYPE] == GTFS]
+
+    if to_sample:
+        catalogs_gtfs = catalogs_gtfs[~catalogs_gtfs[AUTHENTICATION_TYPE].isin([1, 2])]
+        n_sampling = int(len(catalogs_gtfs) * sampling_ratio)
+        samples = random.sample(catalogs_gtfs[MDB_SOURCE_ID].tolist(), n_sampling)
+        catalogs_gtfs = catalogs_gtfs[catalogs_gtfs[MDB_SOURCE_ID].isin(samples)]
+
+    for index, latest_url in catalogs_gtfs[LATEST_URL].items():
         source_file_name = latest_url.replace(URL_PREFIX, "").replace(URL_SUFFIX, "")
         latest_versions[source_file_name] = latest_url
 
     # Some sources/datasets are too big for the workflow so we are excluding them.
     for source in SOURCES_TO_EXCLUDE:
-        del latest_versions[source]
+        if source in latest_versions:
+            del latest_versions[source]
 
     return latest_versions

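A worked example of the sampling arithmetic: if, say, 1,200 GTFS sources remain after dropping authentication types 1 and 2 (presumably excluded because the workflow has no credentials to download them), the sampled run validates int(1200 * 0.05) = 60 sources, drawn uniformly at random by mdb_source_id. The 1,200 figure is illustrative only.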
@@ -115,17 +130,24 @@ def apply_github_matrix_formatting(latest_urls):
         default=".",
         help="Data path.",
     )
+    parser.add_argument(
+        "-s",
+        "--sample",
+        action="store_true",
+        help="Boolean flag to sample or not the data.",
+    )
     args = parser.parse_args()
 
     latest_versions_file = args.latest_versions_file
     data_path = args.data_path
+    to_sample = args.sample
 
     if not path.isdir(data_path) and path.exists(data_path):
         raise Exception("Data path must be a directory if existing.")
     elif not path.isdir(data_path):
         os.mkdir(data_path)
 
-    latest_versions = harvest_latest_versions()
+    latest_versions = harvest_latest_versions(to_sample)
     # We save the latest versions as a JSON file because it is used later in the "compare-outputs" job of the workflow.
     save_content_to_file(
         latest_versions,
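The fetch-urls job above already shows the new flag in use; the equivalent local invocation is below. Dropping -s harvests every GTFS source instead of the 5% sample:

    python3 scripts/mobility-database-harvester/harvest_latest_versions.py \
        -d scripts/mobility-database-harvester/datasets_metadata \
        -l gtfs_latest_versions.json \
        -s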
8 changes: 6 additions & 2 deletions scripts/queue_runner.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 closing_curly_bracket="}"
-raw_queue_string=$*
+master="$1"
+raw_queue_string="${@:2}"
 IFS=" " read -a queue <<< $raw_queue_string
 for item in "${queue[@]}"
 do
@@ -14,6 +15,9 @@ do
   URL=$(jq '.url' <<< "$item")
   path_name=${ID//\"/}
   java -Xmx10G -Xms8G -jar gtfs-validator-snapshot/gtfs-validator*.jar --url $URL --output_base $OUTPUT_BASE/output/$path_name --validation_report_name latest.json --system_errors_report_name latest_errors.json
-  java -Xmx10G -Xms8G -jar gtfs-validator-master/gtfs-validator*.jar --url $URL --output_base $OUTPUT_BASE/output/$path_name --validation_report_name reference.json --system_errors_report_name reference_errors.json
+  if [ "$master" = "true" ];
+  then
+    java -Xmx10G -Xms8G -jar gtfs-validator-master/gtfs-validator*.jar --url $URL --output_base $OUTPUT_BASE/output/$path_name --validation_report_name reference.json --system_errors_report_name reference_errors.json
+  fi;
   wait
 done
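Taken together with the workflow changes above, the runner now has two modes, selected by its first positional argument:

    # acceptance_test.yml: compare the snapshot against master (reference reports).
    bash ./scripts/queue_runner.sh true $queue
    # end_to_end.yml: produce snapshot reports only; no master jar needs to be on disk.
    bash ./scripts/queue_runner.sh false $queue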
