Skip to content


Subversion checkout URL

You can clone with
Download ZIP
Browse files

Initial commit

  • Loading branch information...
commit e86bfa9f50716b996641ea26a2177ebbb163fb40 0 parents
@Motiejus authored
@@ -0,0 +1,55 @@
+### Ryanair crawler that uses Webkit as backend ###
+Works as of 2011-01-08. Then I found Azuon, which
+made my work unnecessary.
+Beware: this code does not demonstrate good coding practices. It is more of a working prototype.
+### Usage ###
+## For single instance ##
+ryanaid.jar is a java runnable. You will need some kind of JRE.
+It takes environment variables as parameters:
+date_from="2010-03-11" # Start checking at this date
+check_days=90 # Finish at ~ may 11'th
+from="KUN" # Airport code, Kaunas in this case
+to="EDI" # Edinburgh
+wait_ms=5000 # Wait this number of ms after every request (click). Useful if you do not want to be banned from the ryanair website.
+socks_host= # Optional. Connect through this place
+socks_port=1080 # Self explanatory if you know what SOCKS is
+All variables have default values, so you can just launch it like this:
+$ from="KUN" to="HHN" java -jar ryanaid.jar
+You may have firebug installed. Check source for details.
+## For multiple firefox instances (real automatic crawling) ##
+Here are helper scripts for crawling all Ryanair routes. You will need:
+* (da|a|ba|c)sh (a _shell_)
+* autossh (for reliable socks proxies)
+* crawler_key and crawler_key.pub, which form an RSA/DSA SSH key pair (used to connect to your reliable socks-via-ssh proxies)
+Basically your crawling speed is limited by how many hosts and RAM you have.
+9 crawlers working full-time used ~2.5 GiB RAM and ~150 kBps of bandwidth on average (90% of it download).
+Fetched all ryanair flights (now-3 months) approximately once a day.
+$ ssh-keygen -t rsa <ENTER><ENTER>
+$ mv ~/.ssh/id_rsa crawler_key
+$ mv ~/.ssh/id_rsa.pub crawler_key.pub
+$ ./download_dests > dests.txt # Renew destination pairs
+## List every host you have access to in tunnel_hosts.txt, one "host:port" entry per line. Example: myserver.example.org:22
+## Upload your public key (crawler_key.pub) to each host so that you can connect using your private key.
+$ ./go_1crawler 1 &
+$ ./go_1crawler 2 &
+$ ./go_1crawler 3 &
+and so on, repeat for each host you have in your tunnel_hosts.txt.
+actiondir/ is logs and pids of autossh
+datadir/ is the thing you want to process.
+Sorry for writing style. Patches for this README are very welcome :)
2  actiondir/.gitignore
@@ -0,0 +1,2 @@
0  crawler_key
No changes.
0 
No changes.
2  datadir/.gitignore
@@ -0,0 +1,2 @@
2,378 dests.txt
@@ -0,0 +1,2378 @@
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+ import json # 2.6 and above
+except ImportError:
+ import simplejson as json # 2.5 and below
+import httplib, re, logging
def get_dests(logger=None, html=None):
    """Fetch and parse Ryanair's airport names and route table.

    The booking form page embeds a JavaScript ``var Dests=...;`` statement.
    Its items are separated by ``",\\n"`` and come in two shapes:
    ``sKUN='Kaunas'`` (airport name) and ``aKUN=['STN','BGY']`` (airports
    reachable from KUN).

    Args:
        logger: Logger used for progress messages. When None, logging is
            configured at DEBUG level and the 'RyanairDest' logger is used.
        html: Optional page source to parse instead of downloading it
            (useful for saved pages and offline runs). When None, the
            page is requested over HTTP.

    Returns:
        A tuple ``(airport_names, airport_dirs)``. ``airport_names`` maps a
        3-letter airport code to its name; ``airport_dirs`` maps a 3-letter
        airport code to the list of destination codes.

    Raises:
        RyanairException: if the HTTP status is not 200 or the ``Dests``
            statement cannot be found in the page.
    """
    # NOTE(review): RyanairException is not defined in the visible part of
    # this file -- confirm it exists, otherwise the error paths below end in
    # a NameError instead.
    if logger is None:
        logging.basicConfig(level=logging.DEBUG)
        logger = logging.getLogger('RyanairDest')

    if html is None:
        # NOTE(review): the host literal was lost in the scraped source
        # (it shows an empty string); restored from the README's reference
        # to the Ryanair website -- confirm.
        conn = httplib.HTTPConnection('www.ryanair.com')
        try:
            logger.debug("Sending request for destinations array")
            conn.request("GET", '/en/booking/form')
            resp = conn.getresponse()
            logger.info("Request success, response code: %d", resp.status)
            if resp.status != 200:
                raise RyanairException(
                    "Invalid return status from Ryanair. "
                    "Expected: 200, got: %d" % resp.status)
            html = resp.read()
        finally:
            # Release the socket on both the success and the error path.
            conn.close()

    # On Python 3 the response body is bytes; the pattern below is text.
    # On Python 2 bytes is str, so this branch is never taken there.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode('utf-8', 'replace')

    airport_part = re.search(r'var Dests=.*\s*([\S\s]*?);', html)
    if airport_part is None:
        raise RyanairException("Parsing of ryanair airports changed.")

    airport_arr = re.split(",\n", airport_part.group(1))
    # Every airport contributes two items: its name and its route list.
    logger.info("%d airports found", len(airport_arr) // 2)

    airport_dirs, airport_names = {}, {}
    for item in airport_arr:
        item = item.strip()
        if not item:
            # A trailing separator leaves an empty item; nothing to parse.
            continue
        if item.startswith('s'):
            # Example: sKUN='Kaunas'
            airport_names[item[1:4]] = item[6:-1]
        else:
            # Example: aKUN=['STN','BGY','RYG','BVA','TMP']
            short, dirs = item[1:4], item[5:]
            # The list is JSON apart from its single quotes.
            airport_dirs[short] = json.loads(dirs.replace("'", '"'))
    return airport_names, airport_dirs
if __name__ == "__main__":
    # Script entry point: print every route as "FROM-TO", one per line.
    # Only errors are logged so that stdout carries nothing but routes.
    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger('RyanairDest')
    names, dests = get_dests(logger)
    # items() works on Python 2 and 3; iteritems() exists only on Python 2.
    for origin, destinations in dests.items():
        for dest in destinations:
            print("%s-%s" % (origin, dest))
126 go_1crawler
@@ -0,0 +1,126 @@
+export WHOAMI=`basename "$0"`
# Print the expected command-line usage and terminate with status 1.
# The upper bound shown is the number of lines in tunnel_hosts.txt.
function usage() {
    local host_count="?"
    if [ -r tunnel_hosts.txt ]; then
        host_count="$(wc -l < tunnel_hosts.txt)"
    fi
    # Usage errors belong on stderr so they never mix with captured output.
    echo "Enter one numeric argument between 1 and ${host_count}" >&2
    exit 1
}
# Append one timestamped line to $LOGFILE.
# Line format: <date+time> <script name>[<pid>] <message>
# All arguments are joined with spaces to form the message.
function log() {
    # Everything is quoted: an unquoted "$WHOAMI[$$]" is a glob pattern and
    # could expand to matching file names in the current directory.
    echo "$(date '+%F+%T') ${WHOAMI}[$$] $*" >> "$LOGFILE"
}
# Abort the crawler: log the reason, hand the current route back to the
# queue, stop autossh, record the failure and exit with status 1.
#   $1 - optional failure message
function fail() {
    if [ -n "$1" ]; then
        # log() is called once per line; passing all three strings in one
        # call would rely on log() handling more than its first argument.
        log "----------------------------------------------------------"
        log "$1"
        log "----------------------------------------------------------"
    fi
    if [ -n "$DEST" ]; then # Adding the current dest for processing
        ./next_dest "$DEST"
    fi
    if [ -f "$AUTOSSH_PIDFILE" ]; then
        log "Attempting to kill $AUTOSSH_PIDFILE"
        kill "$(cat "$AUTOSSH_PIDFILE")"
    fi
    echo "$1" >> "$ACTIONBASEDIR/fail-$NUMBER.log"
    if [ -n "$FBASE" ]; then
        # Same ".fail" marker the main loop creates for a failed run.
        touch "${FBASE}.fail"
    fi
    exit 1
}
# Validate $NUMBER and derive the connection settings for that crawler.
# Reads line $NUMBER of tunnel_hosts.txt ("host:port") and exports:
#   HOST, PORT - the two colon-separated fields of that line
#   SPORT      - local SOCKS port (10000 + NUMBER)
#   CPORT      - check port passed to autossh -M (22000 + NUMBER*29 - 5)
# Calls usage (which exits) when NUMBER is not a positive line number of
# tunnel_hosts.txt.
function proc_params() {
    [[ "$NUMBER" =~ ^[0-9]+$ ]] || usage
    SOX="$(sed -n "${NUMBER}p" tunnel_hosts.txt 2>/dev/null)"
    [[ -n "$SOX" ]] || usage
    export HOST="$(echo "$SOX" | cut -d: -f1)"
    export PORT="$(echo "$SOX" | cut -d: -f2)"
    # 10# forces base 10: a value such as "08" would otherwise be rejected
    # as an invalid octal number.
    export SPORT=$((10000 + 10#$NUMBER)) # Socks port
    export CPORT=$((22000 + 10#$NUMBER * 29 - 5)) # Check port
}
# Abort via fail() when something already uses the local SOCKS port.
function check_if_sport_used() {
    local listeners
    # Query once and reuse the output, so the report shows exactly what
    # the check saw.
    listeners="$(lsof -i "TCP:${SPORT}")"
    if [ -n "$listeners" ]; then
        fail "Port ${SPORT} is busy:\n${listeners}"
    fi
}
# Start autossh in the background (-f) as a SOCKS proxy on port $SPORT,
# tunnelling to $HOST:$PORT with the crawler_key identity; $CPORT is
# passed to -M. Aborts via fail() when autossh returns a non-zero status.
function start_autossh() {
    autossh -M"${CPORT}" -f -ND"${SPORT}" -i"${HERE}/crawler_key" -p"${PORT}" "${HOST}"
    # Capture the status immediately; any later command overwrites $?.
    RETCODE=$?
    if [ "$RETCODE" != "0" ]; then
        # One single argument: fail() only reads $1.
        fail "General autossh fail (ret code = $RETCODE).\nPlease have a look at $AUTOSSH_LOGFILE"
    fi
}
# Abort via fail() when the autossh pid file ($AUTOSSH_PIDFILE) is missing.
function check_autossh_pid() {
    if [ ! -f "$AUTOSSH_PIDFILE" ]; then
        fail "Autossh pidfile '${AUTOSSH_PIDFILE}' not found"
    fi
}
# Abort via fail() when $DISPLAY is empty or unset.
function check_if_display_ok() {
    if [ -z "$DISPLAY" ]; then
        fail "Bad display: $DISPLAY"
    fi
}
# Main part of the script: requires exactly one command-line argument,
# otherwise prints the usage text and exits.
# NOTE(review): this section looks incomplete in this copy. No assignment of
# NUMBER, DATADIR or FBASE is visible, no call to proc_params/start_autossh
# is visible, and the loop has no closing "done" -- confirm against the
# original file.
+[[ "$#" = 1 ]] || usage
# Endless loop: one iteration crawls one route.
+while true; do
# Stop (via fail) as soon as the autossh pid file is gone.
+ check_autossh_pid
# Take the next route ("FROM-TO") from the queue helper.
+ export DEST="$(./next_dest)"
+ TIME="$(date '+%F+%T')"
+ DATEHOUR=$(echo $TIME | perl -pe 's/(.*)\+(\d+).*/\1_\2/') # 2010-01-01_13
# Output is grouped into one directory per date and hour.
+ mkdir -p "${DATADIR}/${DATEHOUR}"
# Marker file: this route is being crawled.
+ touch "${FBASE}.started"
# Split "FROM-TO" into the two airport codes.
+ from="`echo $DEST | cut -d- -f1`"
+ to="`echo $DEST | cut -d- -f2`"
+ check_days=180
+ socks_host=""
+ socks_port="$SPORT"
# ryanaid.jar takes its parameters from environment variables.
+ export from to check_days socks_host socks_port DISPLAY
# NOTE(review): the stdout redirect target ("$") appears truncated in this
# copy; stderr goes to "$FBASE.inf".
+ java -jar ryanaid.jar > "$" 2> "$FBASE.inf"
+ RET=$?
+ if [ "$RET" = "0" ]; then # all fine
+ rm "${FBASE}.started"
+ else
# Failed run: leave a ".fail" marker and put the route back in the queue.
+ touch "${FBASE}.fail"
+ ./next_dest "${DEST}"
+ fi
# Clear FBASE so that a later fail() does not touch this run's files.
+ export FBASE=
+ sleep 5
19 next_dest
@@ -0,0 +1,19 @@
# Route queue helper, serialized between crawlers by next_dest.lock.
#   next_dest DEST  - push DEST back to the front of next_dest.txt
#   next_dest       - print the next route and remove it from the queue;
#                     when the queue runs empty it is refilled with the
#                     route that follows the printed one in dests.txt
lockfile -l 10 next_dest.lock
# Release the lock however the script ends, so an error cannot block the
# other crawlers until the lock times out.
trap 'rm -f next_dest.lock' EXIT

if [ "$#" = "1" ]; then
    # Prepend through a temporary file: "sed -i 1i" inserts nothing when
    # next_dest.txt is empty or missing, which would lose the route.
    { printf '%s\n' "$1"; cat next_dest.txt 2>/dev/null; } > next_dest.txt.tmp
    mv next_dest.txt.tmp next_dest.txt
else
    ret="$(head -1 next_dest.txt 2>/dev/null)"
    # Empty queue: start over from the first known route.
    [[ -z "$ret" ]] && ret="$(head -1 dests.txt)"
    # -x -F: compare whole lines literally instead of as a regex substring.
    TMP="$(grep -vxF -- "$ret" < next_dest.txt 2>/dev/null)"
    if [ -z "$TMP" ]; then # Add another item to next_dest.txt from dests
        grep -A1 -xF -- "$ret" < dests.txt | grep -vxF -- "$ret" > next_dest.txt
    else
        printf '%s\n' "$TMP" > next_dest.txt
    fi
    echo "$ret"
fi
rm -f next_dest.lock
BIN  ryanaid.jar
Binary file not shown
BIN  ryanaid_lib/selenium-java-2.0a7-srcs.jar
Binary file not shown
BIN  ryanaid_lib/selenium-java-2.0a7.jar
Binary file not shown
BIN  ryanaid_lib/selenium-server-2.0a7-srcs.jar
Binary file not shown
BIN  ryanaid_lib/selenium-server-2.0a7.jar
Binary file not shown
BIN  ryanaid_lib/selenium-server-standalone-2.0a7.jar
Binary file not shown
49 src/
@@ -0,0 +1,49 @@
+package ryanaid;
+import java.util.Calendar;
+import java.util.Scanner;
+import java.util.regex.MatchResult;
+public class Flight {
+ public Calendar departDate;
+ public Calendar returnTime;
+ public Calendar created;
+ public int seatsLeft = 0; // 0 = max
+ public String price;
+ public String from;
+ public String to;
+ public Flight(String from, String to, Calendar departDate, String returnTimeStr, String price, int seatsLeft) {
+ created = Calendar.getInstance();
+ this.departDate = (Calendar) departDate.clone();
+ returnTime = (Calendar) departDate.clone();
+ // returnTimeStr is HH:MM format
+ Scanner returnTime_sc = new Scanner(returnTimeStr);
+ returnTime_sc.findInLine("(\\d+):(\\d+)");
+ MatchResult rt = returnTime_sc.match();
+ returnTime.set(Calendar.HOUR_OF_DAY, Integer.parseInt(;
+ returnTime.set(Calendar.MINUTE, Integer.parseInt(;
+ if (returnTime.before(departDate)) {
+ // Lands after midnight or going east (time zone change).
+ // If time difference is more than 12 hours,
+ // then tomorrow
+ // else - do nothing.
+ long diff = departDate.getTimeInMillis() - returnTime.getTimeInMillis();
+ if (diff > 1000 * 60 * 60 * 12) {
+ returnTime.add(Calendar.DAY_OF_YEAR, 1);
+ }
+ }
+ // counting return time.
+ this.price = price;
+ this.seatsLeft = seatsLeft;
+ this.from = from;
+ = to;
+ }
+ public String toString() {
+ return from+"-"+to+"|"+H.humanTime(departDate)+"|"+H.humanTime(returnTime)+
+ "|"+price+"|"+seatsLeft+"|"+H.humanTimeSec(created);
+ }
110 src/
@@ -0,0 +1,110 @@
+package ryanaid;
+import java.util.Calendar;
+import java.util.Scanner;
+import java.util.logging.*;
+import java.util.regex.MatchResult;
+import java.util.Map;
+public class Go1 {
+ private static Logger logger = Logger.getLogger("ryanair");
+ public static void main(String[] args) {
+ RyanairLogic crawler = null;
+ try {
+ logger.log(Level.INFO, "Starting Browser");
+ int wait_ms = 2000; // Wait after a click, ms
+ int check_days = 3650; // By default check for the next 10 years
+ //int check_days = 15; // By default check for the next 10 years
+ //String from = "Kaunas (KUN)";
+ //String to = "Edinburgh (EDI)";
+ String from = "KUN";
+ String to = "GRO";
+ //String to = "Frankfurt-Hahn (HHN)";
+ Integer socks_port = 0;
+ String socks_host = "";
+ Calendar date_from = Calendar.getInstance();
+ date_from.add(Calendar.DAY_OF_YEAR, 1); // start tomorrow by default
+ Map<String, String> env = System.getenv();
+ if (env.containsKey("date_from")) {
+ Scanner date_from_sc = new Scanner(env.get("date_from"));
+ date_from_sc.findInLine("(\\d+)-(\\d+)-(\\d+)");
+ MatchResult res = date_from_sc.match();
+ date_from.set(
+ Integer.parseInt(,
+ Integer.parseInt(,
+ Integer.parseInt(
+ );
+ }
+ if (env.containsKey("check_days")) check_days = Integer.parseInt(env.get("check_days"));
+ if (env.containsKey("from")) from = env.get("from");
+ if (env.containsKey("to")) to = env.get("to");
+ if (env.containsKey("wait_ms")) wait_ms = Integer.parseInt(env.get("wait_ms"));
+ Calendar dateTo = Calendar.getInstance();
+ dateTo.add(Calendar.DAY_OF_YEAR, check_days); // add this # of days
+ logger.log(Level.INFO, "Checking up to " + H.humanTime(dateTo));
+ if (env.containsKey("socks_host")) socks_host = env.get("socks_host");
+ if (env.containsKey("socks_port")) socks_port = Integer.parseInt(env.get("socks_port"));
+ try {
+ crawler = new RyanairLogic("FirefoxFirebug", socks_port, socks_host);
+ } catch (Exception e) {
+ logger.log(Level.INFO, "FirefoxFirebug crashed, opening w/o one");
+ crawler = new RyanairLogic("Firefox", socks_port, socks_host);
+ }
+ crawler.waitAfterClick = wait_ms; // Wait after clicking anything, ms
+ crawler.getFlights(from, to, date_from);
+ while (crawler.hasNextFlight()) {
+ Flight flight = null;
+ try {
+ flight = crawler.nextFlight();
+ } catch (PleaseFlightAgainException e) {
+ logger.log(Level.WARNING, "Something went wrong fetching the flight, fetching another one");
+ continue;
+ } catch (PleaseMakeNewSearchException e) {
+ logger.log(Level.INFO, e.getMessage());
+ crawler.getFlights();
+ continue;
+ } catch (PleaseNextWeekException e) {
+ crawler.next_week = true;
+ continue;
+ } catch (PleaseStopThisSearchException e) {
+ logger.log(Level.SEVERE, e.getMessage());
+ System.err.println(crawler.getCurrentUrl()+"\n\n"+crawler.getPageSource());
+ crawler.close();
+ System.exit(0);
+ }
+ System.out.println(flight);
+ if (crawler.date_now.after(dateTo)) {
+ logger.log(Level.INFO, "This was the last day we needed. Exiting");
+ crawler.close();
+ System.exit(0);
+ }
+ H.waitALittle(wait_ms);
+ }
+ } catch (Exception e) {
+ logger.log(Level.SEVERE, "Application crashed. Here is the trace");
+ e.printStackTrace();
+ logger.log(Level.SEVERE, "Here is the HTML:");
+ try {
+ System.err.println(crawler.getCurrentUrl()+"\n\n"+crawler.getPageSource());
+ } catch (Exception e2) {
+ logger.log(Level.SEVERE, "Printing page source failed. Something is REALLY wrong here.");
+ }
+ crawler.close();
+ System.exit(1);
+ }
+ crawler.close();
+ }
26 src/
@@ -0,0 +1,26 @@
+package ryanaid;
+import java.util.Calendar;
+public class H {
+ public static String twoDigitNum(int number) {
+ String number_str = number+"";