Skip to content
This repository has been archived by the owner on Mar 13, 2018. It is now read-only.

Commit

Permalink
Using a shell script to overcome PhantomJS segfault when loading too …
Browse files Browse the repository at this point in the history
…many URLs.
  • Loading branch information
MattiSG committed Jun 15, 2012
1 parent 97d00a3 commit 77f7b7d
Show file tree
Hide file tree
Showing 3 changed files with 251 additions and 22 deletions.
61 changes: 39 additions & 22 deletions calendar-extractor.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,18 @@ function extractSummary(result) {
}

function splitTime(result) {
var split = /([0-9]{1,2})h? ?-? ?([0-9]{0,2})h?/.exec(result.time);
var split = /([0-9]{1,2})h?([0-9]{2})? ?-? ?([0-9]{1,2})?h?([0-9]{2})?/.exec(result.time);
if (split) {
result.time = {
start: split[1],
end: split[2]
start: (split[1] || '00') + (split[2] || '00'),
end: (split[3] || '00') + (split[4] || '00')
}
}
}

var loadEvent = function loadEvent(url, callback) {
casper.thenOpen(url, function buildEvent() {
console.log('accessing', url);
var result = {
url: url
}
Expand All @@ -97,7 +98,24 @@ var iCalHeader = function iCalHeader() {
var result = [
'BEGIN:VCALENDAR',
'VERSION:2.0',
'PRODID:MattiSG_FuturEnSeine'
'PRODID:MattiSG_FuturEnSeine',
'BEGIN:VTIMEZONE',
'TZID:Europe/Paris',
'BEGIN:DAYLIGHT',
'TZOFFSETFROM:+0100',
'RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=-1SU',
'DTSTART:19810329T020000',
'TZNAME:GMT+02:00',
'TZOFFSETTO:+0200',
'END:DAYLIGHT',
'BEGIN:STANDARD',
'TZOFFSETFROM:+0200',
'RRULE:FREQ=YEARLY;BYMONTH=10;BYDAY=-1SU',
'DTSTART:19961027T030000',
'TZNAME:GMT+01:00',
'TZOFFSETTO:+0100',
'END:STANDARD',
'END:VTIMEZONE'
];

return result.join('\n');
Expand All @@ -119,10 +137,13 @@ var exportToICal = function exportToICal(event) {

if (event.date && event.time && event.time.start)
result.push('DTSTART:201206' + event.date // easy: the date necessarily has two digits
+ 'T' + event.time.start + '0000Z');
+ 'T' + event.time.start
+ '00');

if (event.date && event.time && event.time.end)
result.push('DTEND:' + event.date + 'T' + event.time.start);
if (event.date && event.time && event.time.end != '0000')
result.push('DTEND:201206' + event.date
+ 'T' + event.time.end
+ '00');

result.push('END:VEVENT');

Expand All @@ -136,28 +157,24 @@ var iCalFooter = function iCalFooter() {

casper.start('http://www.futur-en-seine.fr/calendrier/', function main() {
var events = findAllEvents();
var urls = events.urls;
var urls = events.urls;

console.log(iCalHeader());
// console.log(iCalHeader());
for (var i = 0; i < urls.length; i++) {
loadEvent(urls[i], function(values) {
console.log(urls[i]);
/* loadEvent(urls[i], function(values) {
if (values.date)
console.log(exportToICal(values));
// else
// console.error('Missing date for', values.url, ' :(');
/* for (var key in values) {
if (values.hasOwnProperty(key)) {
if (values[key])
console.log(key, '-> "' + values[key] + '"');
}
}
*/
});
console.log(
exportToICal(values)
);
else
console.error('Missing date for', values.url, ' :(');
});*/
}
});

casper.run(function() {
console.log(iCalFooter());
// console.log(iCalFooter());

casper.exit();
});
176 changes: 176 additions & 0 deletions calendar-generator.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
// Single CasperJS instance shared by every function in this script,
// created with the `verbose` option switched on.
var casper = require('casper').create({ verbose: true });

/**
 * Collects the link target of every `<a>` that is a direct child of a `.box`
 * element on the currently loaded page.
 *
 * The inner function is handed to `casper.evaluate`, so it relies on `$`
 * being defined in the page itself (presumably jQuery shipped by the site —
 * TODO confirm).
 *
 * @returns {{urls: Array}} an object whose `urls` array holds the `href`
 *          attribute of each matching link, in document order.
 */
var findAllEvents = function findAllEvents() {
    return casper.evaluate(function() {
        var hrefs = [];

        $('.box').children('a').each(function(position, anchor) {
            hrefs.push($(anchor).attr('href'));
        });

        return { urls: hrefs };
    });
}

// Lookup table: CSS selector -> pattern capturing the value that follows a
// "<label> : " prefix ("Type", "Evénement", "Tarif").
// Nothing else in this file reads this table.
var handlers = {
    '.adresse + div + div':             /Type : (.+)/,
    '.adresse + div + div + div':       /Evénement : (.+)/,
    '.adresse + div + div + div + div': /Tarif : (.+)/
};

/**
 * Reads the text of `selector` on the current page and returns what follows
 * the "<type> : " label, trimmed.
 *
 * @param {string} selector CSS selector passed to `casper.fetchText`.
 * @param {string} type     label expected before " : "; it is inserted into
 *                          a RegExp as-is, so it must not contain regex
 *                          metacharacters.
 * @returns {?string} the trimmed captured value; the falsy value returned by
 *          `fetchText` when no text was found; or `null` when the text does
 *          not contain the label.
 */
function extractData(selector, type) {
    var text = casper.fetchText(selector);

    if (!text)
        return text;

    var labelled = new RegExp(type + ' : (.+)').exec(text);

    return labelled ? labelled[1].trim() : labelled;
}

/**
 * Fills `result.place` from the venue name and, when available, the street
 * address, joined as "<venue>, <address>".
 *
 * When only one of the two is found, that one is used on its own; the
 * previous `+=` produced "null, <address>" (or ", <address>") whenever the
 * venue was missing.
 *
 * @param {Object} result event being built; its `place` property is set.
 */
function extractPlace(result) {
    var venue = extractData('.lieu', 'Lieu');
    // NOTE(review): every other selector in this file uses the French class
    // name `.adresse`; `.address` may be a typo that makes this lookup always
    // come back empty — confirm against the page markup before changing it.
    var address = extractData('.address', 'Adresse');

    if (!address) {
        // Keep whatever the venue lookup returned (string, '' or null).
        result.place = venue;
        return;
    }

    result.place = venue ? venue + ', ' + address : address;
}

/**
 * Sets `result.date` to the value labelled "Date" in the `.dates` element,
 * with the first occurrence of "juin" (any case, optionally preceded by one
 * space) removed. A falsy lookup result is stored unchanged.
 *
 * @param {Object} result event being built; its `date` property is set.
 */
function extractDate(result) {
    var rawDate = extractData('.dates', 'Date');

    result.date = rawDate ? rawDate.replace(/ ?juin/i, '') : rawDate;
}

/**
 * Determines the raw time text of an event and stores it in `result.time`.
 *
 * If the title contains " / <something>", that trailing part is taken as the
 * time and removed (together with the " / " separator) from `result.title`.
 * Otherwise the value labelled "Horaires" in `.adresse + div` is used; the
 * page is only queried in that case.
 *
 * @param {Object} result event being built; reads and may rewrite `title`,
 *                        sets `time`.
 */
function extractTime(result) {
    var titleMatch = / \/ (.+)/.exec(result.title);

    if (!titleMatch) {
        result.time = extractData('.adresse + div', 'Horaires');
        return;
    }

    var timeFromTitle = titleMatch[1];
    result.title = result.title.replace(' / ' + timeFromTitle, '');
    result.time = timeFromTitle;
}

/**
 * Copies the text of the first paragraph inside the header text block
 * (`.fiche-header-text div p:first-child`) into `result.summary`.
 *
 * @param {Object} result event being built; its `summary` property is set.
 */
function extractSummary(result) {
    var summarySelector = '.fiche-header-text div p:first-child';

    result.summary = casper.fetchText(summarySelector);
}

/**
 * Converts the free-text time in `result.time` (e.g. "9h30 - 12h", "14h",
 * "14h-16h30") into `{start: 'HHMM', end: 'HHMM'}`.
 *
 * Hours and minutes are always zero-padded to two digits so the caller can
 * concatenate them straight into an iCalendar time; previously "9h30"
 * produced "930". A missing end time is reported as '0000'. When no digits
 * are found, `result.time` is left untouched.
 *
 * @param {Object} result event being built; its `time` property is replaced
 *                        in place when it can be parsed.
 */
function splitTime(result) {
    // Groups: 1 = start hour, 2 = start minutes, 3 = end hour, 4 = end minutes.
    var split = /([0-9]{1,2})h?([0-9]{2})? ?-? ?([0-9]{1,2})?h?([0-9]{2})?/.exec(result.time);

    if (!split)
        return;

    // Left-pads a captured group (possibly undefined) to exactly two digits.
    function twoDigits(part) {
        return ('00' + (part || '')).slice(-2);
    }

    result.time = {
        start: twoDigits(split[1]) + twoDigits(split[2]),
        end: twoDigits(split[3]) + twoDigits(split[4])
    };
}

/**
 * Queues a navigation to `url` and, once that step runs, scrapes the page
 * into an event object that is handed to `callback`.
 *
 * The event starts as `{url, title}` (title = text of `h1`) and is then
 * completed, in this order, by extractSummary, extractPlace, extractDate,
 * extractTime and splitTime. The order matters: extractTime reads the title
 * and splitTime reads the time set by extractTime.
 *
 * @param {string}   url      event page to open.
 * @param {Function} callback called with the populated event object.
 */
var loadEvent = function loadEvent(url, callback) {
    casper.thenOpen(url, function buildEvent() {
        var event = {
            url: url,
            title: casper.fetchText('h1')
        };

        var steps = [extractSummary, extractPlace, extractDate, extractTime, splitTime];
        steps.forEach(function (step) {
            step(event);
        });

        callback(event);
    });
}


/**
 * Builds the opening lines of the calendar file: the VCALENDAR preamble
 * followed by a VTIMEZONE definition for Europe/Paris (one DAYLIGHT and one
 * STANDARD sub-component).
 *
 * @returns {string} the header lines joined with "\n" (no trailing newline).
 */
var iCalHeader = function iCalHeader() {
    var preamble = [
        'BEGIN:VCALENDAR',
        'VERSION:2.0',
        'PRODID:MattiSG_FuturEnSeine'
    ];

    // Switch to +0200 on the last Sunday of March.
    var daylight = [
        'BEGIN:DAYLIGHT',
        'TZOFFSETFROM:+0100',
        'RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=-1SU',
        'DTSTART:19810329T020000',
        'TZNAME:GMT+02:00',
        'TZOFFSETTO:+0200',
        'END:DAYLIGHT'
    ];

    // Switch back to +0100 on the last Sunday of October.
    var standard = [
        'BEGIN:STANDARD',
        'TZOFFSETFROM:+0200',
        'RRULE:FREQ=YEARLY;BYMONTH=10;BYDAY=-1SU',
        'DTSTART:19961027T030000',
        'TZNAME:GMT+01:00',
        'TZOFFSETTO:+0100',
        'END:STANDARD'
    ];

    return preamble
        .concat(['BEGIN:VTIMEZONE', 'TZID:Europe/Paris'], daylight, standard, ['END:VTIMEZONE'])
        .join('\n');
}

/**
 * Serialises one scraped event as an iCalendar VEVENT.
 *
 * Always emitted: UID (title with spaces turned into underscores), SUMMARY
 * and URL. DESCRIPTION and LOCATION are added when `summary` / `place` are
 * non-empty. DTSTART is added when the event has a date and a parsed start
 * time; DTEND additionally requires a parsed end time other than '0000'
 * (the "no end time" marker).
 *
 * Fixes over the previous version:
 *  - an unparsed `event.time` (still a plain string) no longer yields
 *    "DTEND:...Tundefined00";
 *  - SUMMARY / DESCRIPTION / LOCATION values are escaped as iCalendar TEXT
 *    (backslash, ';' and ',' are backslash-escaped, line breaks become a
 *    space) so they cannot break the content line.
 *
 * NOTE(review): DTSTART/DTEND carry neither a TZID parameter nor a trailing
 * 'Z', i.e. they are floating local times — confirm this is intended.
 *
 * @param {Object} event  event object; `title` and `url` are required,
 *                        `summary`, `place`, `date` and `time` are optional.
 * @returns {string} the VEVENT lines joined with "\n" (no trailing newline).
 */
var exportToICal = function exportToICal(event) {
    // Escapes a value for use in an iCalendar TEXT property.
    function escapeText(value) {
        return String(value)
            .replace(/\\/g, '\\\\')
            .replace(/([;,])/g, '\\$1')
            .replace(/\r?\n/g, ' ');
    }

    var lines = [
        'BEGIN:VEVENT',
        'UID:' + event.title.replace(/ /g, '_'),
        'SUMMARY:' + escapeText(event.title),
        'URL:' + event.url
    ];

    if (event.summary)
        lines.push('DESCRIPTION:' + escapeText(event.summary));

    if (event.place)
        lines.push('LOCATION:' + escapeText(event.place));

    var time = event.time;
    // `time.start` only exists once the time text was parsed into an object.
    var hasStart = Boolean(event.date && time && time.start);

    if (hasStart)
        lines.push('DTSTART:201206' + event.date // easy: the date necessarily has two digits
                    + 'T' + time.start
                    + '00');

    // '0000' is the value used when no end time was found.
    if (hasStart && time.end && time.end !== '0000')
        lines.push('DTEND:201206' + event.date
                    + 'T' + time.end
                    + '00');

    lines.push('END:VEVENT');

    return lines.join('\n');
}

/**
 * @returns {string} the line that closes the calendar opened by the
 *          'BEGIN:VCALENDAR' header line.
 */
var iCalFooter = function iCalFooter() {
    var closingLine = 'END:VCALENDAR';

    return closingLine;
}



// Entry point: every remaining command-line argument is treated as an event
// page URL; each one is scraped and printed as a VEVENT on standard output.

// Remove the "cli" and "casper-path" entries from the parsed command line
// before reading the arguments.
casper.cli.drop("cli");
casper.cli.drop("casper-path");

casper.start('http://google.fr', function queueEvents() {
    var args = casper.cli.args;

    // Object.keys visits exactly the own enumerable entries, in order.
    Object.keys(args).forEach(function (key) {
        loadEvent(args[key], function printEvent(values) {
            if (!values.date) {
                // Events without a date are reported instead of exported.
                console.error('Missing date for', values.url, ' :(');
                return;
            }

            console.log(
                exportToICal(values)
            );
        });
    });
});

casper.run();
36 changes: 36 additions & 0 deletions dumpall.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Builds FuturEnSeine.ics: writes the calendar header, runs
# calendar-generator.js once per URL listed in urls.txt (one casperjs process
# per URL), appends each exported event, then closes the calendar.
#
# Expansions are quoted so the multi-line VEVENT printed by the generator is
# appended line by line; unquoted, the shell word-splits it and every
# property of the event ends up space-joined on a single line.

FILE=FuturEnSeine.ics

# Calendar preamble and Europe/Paris time zone definition.
cat > "$FILE" <<'EOF'
BEGIN:VCALENDAR
VERSION:2.0
PRODID:MattiSG_FuturEnSeine
BEGIN:VTIMEZONE
TZID:Europe/Paris
BEGIN:DAYLIGHT
TZOFFSETFROM:+0100
RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=-1SU
DTSTART:19810329T020000
TZNAME:GMT+02:00
TZOFFSETTO:+0200
END:DAYLIGHT
BEGIN:STANDARD
TZOFFSETFROM:+0200
RRULE:FREQ=YEARLY;BYMONTH=10;BYDAY=-1SU
DTSTART:19961027T030000
TZNAME:GMT+01:00
TZOFFSETTO:+0100
END:STANDARD
END:VTIMEZONE
EOF

for url in $(cat urls.txt) # $(casperjs calendar-extractor.js)
do
    echo "$url"
    result=$(casperjs calendar-generator.js "$url")

    # Skip empty output, and skip events the generator reported as
    # "Missing date" (this relies on that message being part of the captured
    # output). grep is deliberately not silenced: it shows the report on the
    # terminal.
    if [ -n "$result" ] && ! printf '%s\n' "$result" | grep 'Missing date'
    then printf '%s\n' "$result" >> "$FILE"
    fi
done

echo 'END:VCALENDAR' >> "$FILE"

0 comments on commit 77f7b7d

Please sign in to comment.