Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
1213 lines (1058 sloc) 46.7 KB
<?php
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
require_once("utils.inc");
require_once("settings.inc");
require_once("crawls.inc");
// default table names
$gPagesTable = "pages";
$gRequestsTable = "requests";
$gStatusTable = "status";
$gUrlsTable = "urls";
$gUrlsChangeTable = "urlschange";
$gStatsTable = "stats";
$gCrawlsTable = "crawls";
$gSettingsTable = "settings"; // for storing misc settings persistently
$gCdfTable = "cdf";
// Desktop tables
$gPagesTableDesktop = $gPagesTable;
$gRequestsTableDesktop = $gRequestsTable;
$gStatusTableDesktop = $gStatusTable;
$gUrlsTableDesktop = "urls";
$gStatsTableDesktop = $gStatsTable;
$gUrlsChangeTableDesktop = $gUrlsChangeTable;
// Mobile tables
$gPagesTableMobile = $gPagesTable . "mobile";
$gRequestsTableMobile = $gRequestsTable . "mobile";
$gUrlsTableMobile = "urls";
$gStatusTableMobile = $gStatusTable . "mobile";
$gStatsTableMobile = $gStatsTable; // share the data table - a first step toward a single DB
// Chrome tables
$gPagesTableChrome = $gPagesTable . "chrome";
$gRequestsTableChrome = $gRequestsTable . "chrome";
$gUrlsTableChrome = "urls";
$gStatusTableChrome = $gStatusTable . "chrome";
$gStatsTableChrome = $gStatsTable; // share the data table - a first step toward a single DB
// Android tables
$gPagesTableAndroid = $gPagesTable . "android";
$gRequestsTableAndroid = $gRequestsTable . "android";
$gUrlsTableAndroid = "urls";
$gStatusTableAndroid = $gStatusTable . "android";
$gStatsTableAndroid = $gStatsTable; // share the data table - a first step toward a single DB
// Iphone tables
$gPagesTableIphone = $gPagesTable . "iphone";
$gRequestsTableIphone = $gRequestsTable . "iphone";
$gUrlsTableIphone = "urls";
$gStatusTableIphone = $gStatusTable . "iphone";
$gStatsTableIphone = $gStatsTable; // share the data table - a first step toward a single DB
// IE tables
$gPagesTableIe = $gPagesTable . "ie";
$gRequestsTableIe = $gRequestsTable . "ie";
$gUrlsTableIe = "urls";
$gStatusTableIe = $gStatusTable . "ie";
$gStatsTableIe = $gStatsTable; // share the data table - a first step toward a single DB
// Dev tables
$gPagesTableDev = $gPagesTable . "dev";
$gRequestsTableDev = $gRequestsTable . "dev";
$gStatusTableDev = $gStatusTable . "dev";
$gStatsTableDev = $gStatsTable . "dev";
$gUrlsTableDev = $gUrlsTable . "dev";
$gUrlsChangeTableDev = $gUrlsChangeTable; // . "dev";
//
// HERE'S WHERE WE CHANGE THE DEFAULT TABLE NAMES
// DEPENDING ON WHETHER WE'RE DEV OR MOBILE
//
if ( $gbDev ) {
// Use a dev version of the database tables if "dev/" is in the path.
$gPagesTable = $gPagesTableDev;
$gRequestsTable = $gRequestsTableDev;
$gStatusTable = $gStatusTableDev;
$gStatsTable = $gStatsTableDev;
$gUrlsTable = $gUrlsTableDev;
$gUrlsChangeTable = $gUrlsChangeTableDev;
}
else if ( $gbMobile ) {
// Use a mobile version of the database tables if "mobile" is in the path.
$gPagesTable = $gPagesTableMobile;
$gRequestsTable = $gRequestsTableMobile;
$gStatusTable = $gStatusTableMobile;
$gStatsTable = $gStatsTableMobile;
}
else if ( $gbChrome ) {
// Use a chrome version of the database tables if "chrome" is in the path.
$gPagesTable = $gPagesTableChrome;
$gRequestsTable = $gRequestsTableChrome;
$gStatusTable = $gStatusTableChrome;
$gStatsTable = $gStatsTableChrome;
}
else if ( $gbAndroid ) {
// Use a android version of the database tables if "android" is in the path.
$gPagesTable = $gPagesTableAndroid;
$gRequestsTable = $gRequestsTableAndroid;
$gStatusTable = $gStatusTableAndroid;
$gStatsTable = $gStatsTableAndroid;
}
else if ( $gbIphone ) {
// Use a iphone version of the database tables if "iphone" is in the path.
$gPagesTable = $gPagesTableIphone;
$gRequestsTable = $gRequestsTableIphone;
$gStatusTable = $gStatusTableIphone;
$gStatsTable = $gStatsTableIphone;
}
else if ( $gbIe ) {
// Use a iphone version of the database tables if "iphone" is in the path.
$gPagesTable = $gPagesTableIe;
$gRequestsTable = $gRequestsTableIe;
$gStatusTable = $gStatusTableIe;
$gStatsTable = $gStatsTableIe;
}
// We restrict the range of queries by adding a date range.
// Sometimes the date range is affected by querystring params.
// We also avoid old, undesired data with this.
function dateRange($bDefault = false) {
global $gbMobile, $gbDev;
// Here's the scoop: We use $dateRange as a nice global throughout most of our queries to limit
// the scope of data. BUT...this is a problem when we want to, for example, present the user with
// a list of labels to choose from - we don't want that list of labels to be limited.
// So in some places we use $dateRangeDefault to get the WHOLE range of data.
$dateRangeDefault = ( isPrivateInstance() ? "pageid >= 1" :
( $gbMobile ? "pageid >= 607" :
( $gbDev ? "pageid >= 10281" : "pageid >= 1" ) ) );
if ( $bDefault ) {
return $dateRangeDefault;
}
else {
// Query ALL the date ranges but avoid some undesired data (old, small runs).
$dateRange = $dateRangeDefault;
if ( minLabel() ) {
$dateRange .= " and pageid >= " . labelMinid(minLabel());
}
if ( maxLabel() ) {
$dateRange .= " and pageid <= " . labelMaxid(maxLabel());
}
return $dateRange;
}
}
function dateRangeCrawls($bDefault = false) {
global $gbMobile;
// Here's the scoop: We use $dateRange as a nice global throughout most of our queries to limit
// the scope of data. BUT...this is a problem when we want to, for example, present the user with
// a list of labels to choose from - we don't want that list of labels to be limited.
// So in some places we use $dateRangeDefault to get the WHOLE range of data.
$dateRangeDefault = ( isPrivateInstance() ? "minPageid >= 1" :
( $gbMobile ? "minPageid >= 607" :
( $gbDev ? "minPageid >= 10281" : "minPageid >= 1" ) ) );
if ( $bDefault ) {
return $dateRangeDefault;
}
else {
// Query ALL the date ranges but avoid some undesired data (old, small runs).
$dateRange = $dateRangeDefault;
if ( minLabel() ) {
$dateRange .= " and minPageid >= " . labelMinid(minLabel());
}
if ( maxLabel() ) {
$dateRange .= " and maxPageid <= " . labelMaxid(maxLabel());
}
return $dateRange;
}
}
function dbtable($table, $device="", $label="", $slice="") {
global $gbChrome, $gbAndroid, $gbIe, $gbIphone, $gbMobile;
if ( FALSE !== stripos($device, "android") || $gbMobile ) {
$table .= "mobile";
}
else if ( isDev() ) {
$table .= "dev";
}
else if ( $gbChrome ) {
$table .= "chrome";
}
else if ( $gbAndroid ) {
$table .= "android";
}
else if ( $gbIe ) {
$table .= "ie";
}
else if ( $gbIphone ) {
$table .= "iphone";
}
return $table;
}
// These are interim helper functions for the massive mess of db table global vars
function pagesTable($label="", $slice="", $device="") {
return dbtable("pages", $device, $label, $slice);
}
// Nothing on the production website should need access to the requests table!!
function requestsTable($label="", $slice="", $device="") {
return dbtable("requests", $device, $label, $slice);
}
function statsTable($label="", $slice="", $device="") {
$table = "stats";
if ( isDev() ) {
$table .= "dev";
}
return $table;
}
function crawlsTable() {
return "crawls";
}
// return a where condition to select the appropriate URL slice based on pageid
function sliceCond($label, $slice, $device, $url = "") {
global $gaTop100, $gaTop1000, $gArchive;
$pagesTable = pagesTable($label, $slice, $device);
$dateRange = dateRange();
$crawl = getCrawl($label, $gArchive, $device);
$minid = $crawl['minPageid'];
$maxid = $crawl['maxPageid'];
$sliceCond = "pageid>=$minid and pageid<=$maxid"; // "All"
if ( 0 === strpos($slice, "Top") ) {
// eg, "Top100" or "Top1000"
if ( 1322727953 <= $crawl['startedDateTime'] ) {
// The first crawl with rank available is "Dec 1 2011" (startedDateTime = 1322727953)
$sliceCond .= " and rank > 0 and rank <= " . substr($slice, 3);
}
else {
// Crawls that occurred BEFORE we started saving rank.
if ( "Top100" === $slice || "Top1000" === $slice ) {
// We'll use some old global variables for Top100 and Top1000.
$urls = ( "Top100" === $slice ? $gaTop100 : $gaTop1000 );
$query = "select pageid from $pagesTable where $sliceCond and url in ('" . implode("','", $urls) . "');";
$result = doQuery($query);
$aPageids = array();
while ( $row = mysql_fetch_assoc($result) ) {
$aPageids[] = $row['pageid'];
}
mysql_free_result($result);
$sliceCond = "pageid in (" . implode(",", $aPageids) . ")";
}
else {
// If you've reached this block it means we added something higher
// than "Top1000" (like Top10000 or Top100000) AND we're dealing
// with a crawl that does NOT have ranks. Choices include:
// 1. Include all the pages. These crawls have 50K pages or less,
// so it might not be too far off.
// 2. Only include the top 1000 using the global variable. This
// would mean we'd compare 1000 to 10,000 or 100,000 - bad.
// 3. Use the current rankings from the "urls" table. This would
// mean that the set of top URLs would constantly change.
// I'm going with #1 - include all the pages - which means we don't
// have to add anything more to sliceCond.
}
}
}
else if ( "url" === $slice && isset($url) ) {
$sliceCond .= " and url = '$url'";
}
return $sliceCond;
}
//
//
// MYSQL WRAPPER
//
//
// Reuse a single DB connection for the entire session.
$gDBConnection = null;
function doSimpleCommand($cmd, $link = null) {
global $gMysqlDb;
if ( ! $link ) {
$link = getDBConnection();
}
if ( mysql_select_db($gMysqlDb) ) {
//dprint("doSimpleCommand: $cmd");
$result = mysql_query($cmd, $link);
//mysql_close($link); // the findCorrelation code relies on the link not being closed
if ( ! $result ) {
dprint("ERROR in doSimpleCommand: '" . mysql_error() . "' for command: " . $cmd);
}
}
}
function doQuery($query, $link = null) {
global $gMysqlDb;
if ( ! $link ) {
$link = getDBConnection();
}
if ( mysql_select_db($gMysqlDb) ) {
//dprint("doQuery: $query");
$result = mysql_query($query, $link);
//mysql_close($link); // the findCorrelation code relies on the link not being closed
if ( FALSE === $result ) {
dprint("ERROR in doQuery: '" . mysql_error() . "' for query: " . $query);
}
return $result;
}
return null;
}
// return the first row
function doRowQuery($query, $link = null) {
$row = NULL;
$result = doQuery($query, $link);
if ( $result ) {
$row = mysql_fetch_assoc($result);
mysql_free_result($result);
}
return $row;
}
// return the first value from the first row
function doSimpleQuery($query, $link = null) {
$value = NULL;
$result = doQuery($query, $link);
if ( $result ) {
$row = mysql_fetch_assoc($result);
if ( $row ) {
$aKeys = array_keys($row);
$value = $row[$aKeys[0]];
}
mysql_free_result($result);
}
return $value;
}
// This takes an INSERT command and returns the result of last_insert_id().
// This is only useful if the INSERT command causes an AUTO_INCREMENT column to be updated.
// We have to do this because last_insert_id only works with the same connection.
// TODO - investigate only using one connection
function doLastInsertId($cmd, $link = null) {
global $gMysqlServer, $gMysqlDb, $gMysqlUsername, $gMysqlPassword;
if ( ! $link ) {
$link = getDBConnection();
}
$last_insert_id = null;
if ( mysql_select_db($gMysqlDb) ) {
//dprint("doSimpleCommand: $cmd");
$result = mysql_query($cmd, $link);
//mysql_close($link); // the findCorrelation code relies on the link not being closed
if ( ! $result ) {
dprint("ERROR in doSimpleCommand: '" . mysql_error() . "' for command: " . $cmd);
}
else {
$last_result = mysql_query("select last_insert_id()", $link);
$row = mysql_fetch_array($last_result);
$last_insert_id = $row[0];
}
}
return $last_insert_id;
}
// Return a MySQL DB Connection.
// Cache this in a global variable for efficiency.
function getDBConnection() {
global $gDBConnection, $gMysqlServer, $gMysqlUsername, $gMysqlPassword;
// CVSNO - figure out why forking processes screws up this global variable & how to fix (mysql_ing?)
return mysql_connect($gMysqlServer, $gMysqlUsername, $gMysqlPassword, $new_link=true);
if ( ! $gDBConnection ) {
$gDBConnection = mysql_connect($gMysqlServer, $gMysqlUsername, $gMysqlPassword, $new_link=true);
}
return $gDBConnection;
}
function tableExists($tablename) {
return ( $tablename == doSimpleQuery("show tables like '$tablename';") );
}
// scary
function dropTable($tablename) {
doSimpleCommand("drop table $tablename;");
}
function columnExists($column, $table) {
// We reference the "Field" value returned from "show columns".
return doSimpleQuery("show columns from $table where Field = '$column';");
}
/*******************************************************************************
SCHEMA CHANGES:
This is a record of changes to the schema and how the tables were updated
in place.
12/1/10 - Added the "pageid" index to requestsdev.
This made the aggregateStats function 10x faster during import.
mysql> create index pageid on requestsdev (pageid);
7/21/2011 - Added the "rank" column to pages table:
mysql> alter table pagesdev add column rank int(10) unsigned after PageSpeed;
Oct 27, 2011 - Remove "harfile" column and reset unique index. Shorten columns.
mysql> alter table pagesdev modify column archive varchar (16) not null;
mysql> alter table pagesdev modify column label varchar (32) not null;
mysql> drop index startedDateTime on pagesdev;
mysql> alter table pagesdev add unique key (label, urlShort);
mysql> alter table pagesdev drop column harfile;
Nov 3, 2011 - add "rank" column to status table
mysql> drop table statusdev; drop table statusmobile;
Nov 16, 2011 - add perFonts column to stats table
mysql> alter table statsdev add column perFonts int(4) unsigned after perFlash;
April 5, 2012 - add urlhash to urls table for faster lookup
("urlhash" is a substring of the URL's MD5 hash converted to base-10)
mysql> alter table urls add column urlhash int(8) unsigned first;
mysql> update urls set urlhash = conv(substring(md5(urlOrig),1,4),16,10);
mysql> create index urlhash on urls (urlhash);
Oct 26, 2012 - add new stats columns
mysql> alter table statsdev add column renderStart int(10) unsigned after PageSpeed;
mysql> alter table statsdev add column onLoad int(10) unsigned after renderStart;
mysql> alter table statsdev add column perCompressed int(4) unsigned after perHttps;
mysql> alter table statsdev modify perHttps int(4) unsigned;
Dec 2012:
alter table pages drop column title;
[pagesdev, pagesmobile]
alter table pages drop column urlHtmlShort;
[pagesdev, pagesmobile]
alter table requests drop column redirectUrlShort;
[requestsdev, requestsmobile]
alter table pages drop column urlHtml;
[pagesdev, pagesmobile]
alter table pages add column TTFB smallint unsigned AFTER startedDateTime;
[pagesdev, pagesmobile]
alter table pages add column reqGif smallint unsigned not null after reqImg, ADD COLUMN reqJpg smallint unsigned not null after reqGif, ADD COLUMN reqPng smallint unsigned not null after reqJpg, ADD COLUMN reqFont smallint unsigned not null after reqPng;
[pagesdev, pagesmobile]
alter table pages add column bytesGif int(10) unsigned not null after bytesImg, ADD COLUMN bytesJpg int(10) unsigned not null after bytesGif, ADD COLUMN bytesPng int(10) unsigned not null after bytesJpg, ADD COLUMN bytesFont int(10) unsigned not null after bytesPng;
[pagesdev, pagesmobile]
alter table pages add column maxageMore smallint unsigned not null after numDomains, add column maxage365 smallint unsigned not null after numDomains, add column maxage30 smallint unsigned not null after numDomains, add column maxage1 smallint unsigned not null after numDomains, add column maxage0 smallint unsigned not null after numDomains, add column maxageNull smallint unsigned not null after numDomains, add column numDomElements mediumint unsigned not null after numDomains, add column numCompressed smallint unsigned not null after numDomains, add column numHttps smallint unsigned not null after numDomains, add column numGlibs smallint unsigned not null after numDomains, add column numErrors smallint unsigned not null after numDomains, add column numRedirects smallint unsigned not null after numDomains, add column maxDomainReqs smallint unsigned not null after numDomains;
[pagesdev, pagesmobile]
alter table pages add column bytesHtmlDoc mediumint unsigned not null AFTER bytesOther;
[pagesdev, pagesmobile]
alter table pages add column fullyLoaded int(10) unsigned AFTER onLoad;
[pagesdev, pagesmobile]
alter table pages add column cdn varchar (64) AFTER urlShort;
[pagesdev, pagesmobile]
alter table pages add column SpeedIndex mediumint unsigned AFTER PageSpeed;
[pagesdev, pagesmobile]
alter table pages add column visualComplete int(10) unsigned AFTER fullyLoaded;
[pagesdev, pagesmobile]
alter table pages add column gzipTotal int unsigned not null AFTER maxageMore, add column gzipSavings int unsigned not null after gzipTotal;
[pagesdev, pagesmobile]
alter table requests add column expAge int unsigned AFTER respCookieLen;
[requestsdev, requestsmobile]
Jan 7, 2013 - add reqFont, bytesFont columns to stats table
alter table statsdev add column reqFont float unsigned after reqFlash;
alter table statsdev add column bytesFont int(10) unsigned after bytesFlash;
Jan 9 2013 - make expAge not null
alter table requests modify expAge int unsigned not null;
Feb 5 2013 - add new stats to stats table
alter table stats add column bytesHtmlDoc mediumint unsigned after bytesOther, add column perCdn int(4) unsigned after perGlibs, add column gzipSavings int unsigned not null after bytesHtmlDoc, add column gzipTotal int unsigned not null after bytesHtmlDoc, add column numDomElements mediumint unsigned not null after numDomains, add column maxDomainReqs smallint unsigned not null after numDomains, add column fullyLoaded int(10) unsigned after onLoad, add column visualComplete int(10) unsigned after onLoad, add column TTFB int(10) unsigned after PageSpeed, add column SpeedIndex mediumint unsigned after PageSpeed;
[statsdev]
July 2013 -
pages:
add columns: crawlid, urlhash
requests:
crawls:
add columns: notes
stats:
add columns: crawlid
April 20, 2014 - add crawlid index
create index crawlid on pages (crawlid);
May 12, 2014 - add _connections and _cdn_provider column
alter table pages add column _connections int(4) unsigned not null;
alter table requests add column _cdn_provider varchar(64);
alter table stats add column _connections int(4) unsigned not null;
May 13, 2014 - add _adult_site
alter table pages add column _adult_site boolean not null;
May 21, 2014 - add _gzip_save
alter table requests add column _gzip_save int(10) unsigned not null;
May 29, 2014 - add new columns based on custom_metrics feature in WebPagetest
alter table pages add column avg_dom_depth int(4) unsigned not null,
add column document_height int(10) unsigned not null,
add column document_width int(10) unsigned not null,
add column localstorage_size int(10) unsigned not null,
add column sessionstorage_size int(10) unsigned not null,
add column num_iframes int(8) unsigned not null,
add column num_scripts int(8) unsigned not null,
add column doctype varchar(255) not null,
add column meta_viewport varchar(255) not null;
alter table stats add column avg_dom_depth int(4) unsigned not null,
add column document_height int(10) unsigned not null,
add column document_width int(10) unsigned not null,
add column localstorage_size int(10) unsigned not null,
add column sessionstorage_size int(10) unsigned not null,
add column num_iframes int(8) unsigned not null,
add column num_scripts int(8) unsigned not null;
June ?, 2014 - add crawlid to requests
alter table requests add column crawlid int unsigned not null;
//TODO create index crawlid on requests (crawlid);
April 23, 2015 - add ext, type, format to requests, and various req & bytes columns to pages & stats.
alter table requests[] add column type varchar (32);
alter table requests[] add column ext varchar (8);
alter table requests[] add column format varchar (8);
alter table pages[] add column reqAudio int(4) unsigned not null;
alter table pages[] add column reqVideo int(4) unsigned not null;
alter table pages[] add column reqText int(4) unsigned not null;
alter table pages[] add column reqXml int(4) unsigned not null;
alter table pages[] add column reqWebp int(4) unsigned not null;
alter table pages[] add column reqSvg int(4) unsigned not null;
alter table pages[] add column bytesAudio int(10) unsigned not null;
alter table pages[] add column bytesVideo int(10) unsigned not null;
alter table pages[] add column bytesText int(10) unsigned not null;
alter table pages[] add column bytesXml int(10) unsigned not null;
alter table pages[] add column bytesWebp int(10) unsigned not null;
alter table pages[] add column bytesSvg int(10) unsigned not null;
alter table stats[dev] add column reqAudio float unsigned;
alter table stats[dev] add column bytesAudio int(10) unsigned;
alter table stats[dev] add column reqVideo float unsigned;
alter table stats[dev] add column bytesVideo int(10) unsigned;
alter table stats[dev] add column reqText float unsigned;
alter table stats[dev] add column bytesText int(10) unsigned;
alter table stats[dev] add column reqXml float unsigned;
alter table stats[dev] add column bytesXml int(10) unsigned;
alter table stats[dev] add column reqWebp float unsigned;
alter table stats[dev] add column bytesWebp int(10) unsigned;
alter table stats[dev] add column reqSvg float unsigned;
alter table stats[dev] add column bytesSvg int(10) unsigned;
May 29, 2015 - add new columns for custom_metrics to pages[dev, mobile, chrome, android]:
- allow null so we can see if script is failing - not sure if this will cause problems
alter table pages add column num_scripts_async int(8) unsigned,
add column num_scripts_sync int(8) unsigned,
add column usertiming int(8) unsigned;
alter table stats add column num_scripts_async int(8) unsigned not null,
add column num_scripts_sync int(8) unsigned not null,
add column usertiming int(8) unsigned not null;
Dec 9, 2015 - add new columns for SpeedIndex correlations
alter table stats add column SpeedIndexccf1 varchar (32),
add column SpeedIndexccv1 float unsigned,
add column SpeedIndexccf2 varchar (32),
add column SpeedIndexccv2 float unsigned,
add column SpeedIndexccf3 varchar (32),
add column SpeedIndexccv3 float unsigned,
add column SpeedIndexccf4 varchar (32),
add column SpeedIndexccv4 float unsigned,
add column SpeedIndexccf5 varchar (32),
add column SpeedIndexccv5 float unsigned;
*******************************************************************************/
function createTables() {
global $gPagesTable, $gRequestsTable, $gStatusTable, $gStatsTable, $gUrlsTable, $gUrlsChangeTable, $gSettingsTable, $gCdfTable;
if ( ! tableExists($gPagesTable) ) {
$command = "create table $gPagesTable (" .
"pageid int unsigned not null auto_increment" .
", createDate int(10) unsigned not null" .
", archive varchar (16) not null" .
", label varchar (32) not null" . // name of the crawl, eg, "Sep 15 2012"
", crawlid int unsigned not null" . // crawlid joins with "crawls" table
", wptid varchar (64) not null" . // webpagetest.org id
", wptrun int(2) unsigned not null" . // webpagetest.org median #
", url text" . // main URL fetched
", urlShort varchar (255)" . // shortened URL to use in index
", urlhash int(8) unsigned" . // hash for faster searching - use getUrlhashCond()
", cdn varchar (64)" . // if the base page is served by a CDN
", startedDateTime int(10) unsigned" . // epoch time when page was crawled
", TTFB smallint unsigned" . // time to first byte of HTML doc response
", renderStart int(10) unsigned" . // when rendering started
", onContentLoaded int(10) unsigned" . // when content loaded (not available in IE)
", onLoad int(10) unsigned" . // window.onload
", fullyLoaded int(10) unsigned" . // the page is fully done (according to WPT)
", visualComplete int(10) unsigned" . //
", PageSpeed int(4) unsigned" . // PageSpeed score (0-100)
", SpeedIndex mediumint unsigned" . // WPT Speed Index score
", rank int(10) unsigned" . // worldwide rank for this URL (according to Alexa.com)
", reqTotal int(4) unsigned not null" . // # of resource requests, etc.
", reqHtml int(4) unsigned not null" .
", reqJS int(4) unsigned not null" .
", reqCSS int(4) unsigned not null" .
", reqImg int(4) unsigned not null" .
", reqGif smallint unsigned not null" .
", reqJpg smallint unsigned not null" .
", reqPng smallint unsigned not null" .
", reqFont smallint unsigned not null" .
", reqFlash int(4) unsigned not null" .
", reqJson int(4) unsigned not null" .
", reqOther int(4) unsigned not null" .
", bytesTotal int(10) unsigned not null" . // # of bytes TRANSFERRED (so may be bigger when uncompressed)
", bytesHtml int(10) unsigned not null" .
", bytesJS int(10) unsigned not null" .
", bytesCSS int(10) unsigned not null" .
", bytesImg int(10) unsigned not null" .
", bytesGif int(10) unsigned not null" .
", bytesJpg int(10) unsigned not null" .
", bytesPng int(10) unsigned not null" .
", bytesFont int(10) unsigned not null" .
", bytesFlash int(10) unsigned not null" .
", bytesJson int(10) unsigned not null" .
", bytesOther int(10) unsigned not null" .
", bytesHtmlDoc mediumint unsigned not null" . // size of the main HTML document
", numDomains int(4) unsigned" . // # of unique domains across all requests
", maxDomainReqs smallint unsigned not null" . // # of requests on the most-used domain
", numRedirects smallint unsigned not null" .
", numErrors smallint unsigned not null" .
", numGlibs smallint unsigned not null" . // # of requests to Google Libraries API
", numHttps smallint unsigned not null" .
", numCompressed smallint unsigned not null" .
", numDomElements mediumint unsigned not null" .
", maxageNull smallint unsigned not null" . // # of responses without a max-age value in Cache-Control
", maxage0 smallint unsigned not null" . // # of responses with max-age=0
", maxage1 smallint unsigned not null" . // # of responses with 0 < max-age <= 1 day
", maxage30 smallint unsigned not null" . // # of responses with 1 < max-age <= 30 days
", maxage365 smallint unsigned not null" . // # of responses with 30 < max-age <= 365 days
", maxageMore smallint unsigned not null" . // # of responses with 365 < max-age
", gzipTotal int unsigned not null" . // # of bytes xferred for resources that COULD have been gzipped
", gzipSavings int unsigned not null" . // # of bytes that could have been saved if ALL gzippable resources were gzipped
", _connections int(4) unsigned not null" . // # of TCP connections created during page load
", _adult_site boolean not null" .
", avg_dom_depth int(4) unsigned not null" .
", document_height int(10) unsigned not null" .
", document_width int(10) unsigned not null" .
", localstorage_size int(10) unsigned not null" .
", sessionstorage_size int(10) unsigned not null" .
", num_iframes int(8) unsigned not null" .
", num_scripts int(8) unsigned not null" .
", doctype varchar(255) not null" .
", meta_viewport varchar(255) not null" .
", reqAudio int(4) unsigned not null" .
", reqVideo int(4) unsigned not null" .
", reqText int(4) unsigned not null" .
", reqXml int(4) unsigned not null" .
", reqWebp int(4) unsigned not null" .
", reqSvg int(4) unsigned not null" .
", bytesAudio int(10) unsigned not null" .
", bytesVideo int(10) unsigned not null" .
", bytesText int(10) unsigned not null" .
", bytesXml int(10) unsigned not null" .
", bytesWebp int(10) unsigned not null" .
", bytesSvg int(10) unsigned not null" .
", num_scripts_async int(8) unsigned" .
", num_scripts_sync int(8) unsigned" .
", usertiming int(8) unsigned" .
", primary key (pageid)" .
", index(urlhash)" .
", index(crawlid)" .
", unique key (label, urlShort)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
if ( ! tableExists($gRequestsTable) ) {
createRequestsTable($gRequestsTable);
}
// Create Status Table
if ( ! tableExists($gStatusTable) ) {
$command = "create table $gStatusTable (" .
"statusid int unsigned not null auto_increment" .
", crawlid int unsigned not null" . // crawlid joins with "crawls" table
", url text" .
", location varchar (32) not null" .
", archive varchar (32) not null" .
", label varchar (32) not null" .
", rank int(10) unsigned" .
", status varchar (32) not null" .
", attempts int(2) not null" . // how many times we've tried the URL so we can re-submit failures
", timeOfLastChange int(10) unsigned not null" .
", wptid varchar (64)" .
", wptRetCode varchar (8)" .
", medianRun int(4) unsigned" .
", startRender int(10) unsigned" .
", pagespeedScore int(4) unsigned" .
", primary key (statusid)" .
", index(statusid)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create Status Table
$crawlsTable = crawlsTable();
if ( ! tableExists( $crawlsTable ) ) {
$command = "create table $crawlsTable (" .
"crawlid int unsigned not null auto_increment" .
", label varchar (32) not null" .
", archive varchar (32) not null" .
", location varchar (32) not null" . // AKA "device" - the WPT location value
", notes varchar(64)" . // notes like "test" or "js off"
", video boolean not null" . // true = video was recorded
", docComplete boolean not null" . // true = stopped at onload; false = ran later
", fvonly boolean not null" . // true = first view only
", runs int(4) unsigned" . // the number of runs for each URL (we use the median)
", startedDateTime int(10) unsigned" .
", finishedDateTime int(10) unsigned" .
", timeOfLastChange int(10) unsigned not null" .
", passes int(2) unsigned" . // the number of passes thru the URLs - we typically do 2 (retry errors)
", minPageid int unsigned not null" .
", maxPageid int unsigned not null" .
", numUrls int unsigned" . // # of URLs submitted
", numErrors int unsigned" . // # of URLs that failed
", numPages int unsigned" . // # of URLs that resulted in valid pages
", numRequests int unsigned" . // # of requests for all the pages
", primary key (crawlid)" .
", index(label, archive, location)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
if ( ! tableExists($gStatsTable) ) {
$command = "create table $gStatsTable (" .
"label varchar (32) not null" . // "Oct 1 2011"
", crawlid int unsigned not null" . // crawlid joins with "crawls" table
", slice varchar (32) not null" . // "Top100", "Top1000", "intersection", or "All"
", device varchar (32) not null" . // "iphone43", "isim", or "IE8"
", version int(6) unsigned not null" . // the SVN version # so we can track what values to recompute
", numurls int unsigned" . // even "Top100" might only be 98 URLs
// averages across ALL SITES in the slice
", PageSpeed float unsigned" .
", SpeedIndex mediumint unsigned" . // WPT Speed Index score
", TTFB int(10) unsigned" .
", renderStart int(10) unsigned" .
", onLoad int(10) unsigned" .
", visualComplete int(10) unsigned" .
", fullyLoaded int(10) unsigned" .
", numDomains float unsigned" .
", maxDomainReqs smallint unsigned not null" .
", numDomElements mediumint unsigned not null" .
", reqTotal float unsigned" .
", reqHtml float unsigned" .
", reqJS float unsigned" .
", reqCSS float unsigned" .
", reqImg float unsigned" .
", reqGif float unsigned" .
", reqJpg float unsigned" .
", reqPng float unsigned" .
", reqFlash float unsigned" .
", reqFont float unsigned" .
", reqJson float unsigned" .
", reqOther float unsigned" .
", bytesTotal int(10) unsigned" .
", bytesHtml int(10) unsigned" .
", bytesJS int(10) unsigned" . // eg, average total bytes of JS downloaded per page
", bytesCSS int(10) unsigned" .
", bytesImg int(10) unsigned" .
", bytesGif int(10) unsigned" .
", bytesJpg int(10) unsigned" .
", bytesPng int(10) unsigned" .
", bytesFlash int(10) unsigned" .
", bytesFont int(10) unsigned" .
", bytesJson int(10) unsigned" .
", bytesOther int(10) unsigned" .
", bytesHtmlDoc mediumint unsigned" . // size of the main HTML document
", gzipTotal int unsigned not null" . // avg # of bytes xferred for resources that COULD have been gzipped
", gzipSavings int unsigned not null" . // avg # of bytes that could have been saved if ALL gzippable resources were gzipped
// % of sites with at least one of these conditions
", perRedirects int(4) unsigned" .
", perErrors int(4) unsigned" .
", perFlash int(4) unsigned" .
", perFonts int(4) unsigned" .
", perGlibs int(4) unsigned" .
", perCdn int(4) unsigned" .
/*
// % of sites that use this JS library
", perjQuery float unsigned" .
", perYUI float unsigned" .
", perDojo float unsigned" .
", perGA float unsigned" .
", perQuantcast float unsigned" .
", perAddThis float unsigned" .
", perFacebook float unsigned" .
", perGPlusOne float unsigned" .
", perTwitter float unsigned" .
", perShareThis float unsigned" .
*/
// % of requests
", perHttps int(4) unsigned" .
", perCompressed int(4) unsigned" .
// % of sites with max-age in this range
// "maxage30" === maxage > 1 day AND maxage <= 30 days
", maxageNull int(4) unsigned" .
", maxage0 int(4) unsigned" .
", maxage1 int(4) unsigned" .
", maxage30 int(4) unsigned" .
", maxage365 int(4) unsigned" .
", maxageMore int(4) unsigned" .
// correlation coefficients: top 5 correlations for onload and render
", onLoadccf1 varchar (32)" .
", onLoadccv1 float unsigned" .
", onLoadccf2 varchar (32)" .
", onLoadccv2 float unsigned" .
", onLoadccf3 varchar (32)" .
", onLoadccv3 float unsigned" .
", onLoadccf4 varchar (32)" .
", onLoadccv4 float unsigned" .
", onLoadccf5 varchar (32)" .
", onLoadccv5 float unsigned" .
", renderStartccf1 varchar (32)" .
", renderStartccv1 float unsigned" .
", renderStartccf2 varchar (32)" .
", renderStartccv2 float unsigned" .
", renderStartccf3 varchar (32)" .
", renderStartccv3 float unsigned" .
", renderStartccf4 varchar (32)" .
", renderStartccv4 float unsigned" .
", renderStartccf5 varchar (32)" .
", renderStartccv5 float unsigned" .
// columns added later
", _connections int(4) unsigned not null" .
", avg_dom_depth int(4) unsigned not null" .
", document_height int(10) unsigned not null" .
", document_width int(10) unsigned not null" .
", localstorage_size int(10) unsigned not null" .
", sessionstorage_size int(10) unsigned not null" .
", num_iframes int(8) unsigned not null" .
", num_scripts int(8) unsigned not null" .
", reqAudio float unsigned" .
", bytesAudio int(10) unsigned" .
", reqVideo float unsigned" .
", bytesVideo int(10) unsigned" .
", reqText float unsigned" .
", bytesText int(10) unsigned" .
", reqXml float unsigned" .
", bytesXml int(10) unsigned" .
", reqWebp float unsigned" .
", bytesWebp int(10) unsigned" .
", reqSvg float unsigned" .
", bytesSvg int(10) unsigned" .
", num_scripts_sync int(8) unsigned not null" .
", num_scripts_async int(8) unsigned not null" .
", usertiming int(8) unsigned not null" .
", SpeedIndexccf1 varchar (32)" .
", SpeedIndexccv1 float unsigned" .
", SpeedIndexccf2 varchar (32)" .
", SpeedIndexccv2 float unsigned" .
", SpeedIndexccf3 varchar (32)" .
", SpeedIndexccv3 float unsigned" .
", SpeedIndexccf4 varchar (32)" .
", SpeedIndexccv4 float unsigned" .
", SpeedIndexccf5 varchar (32)" .
", SpeedIndexccv5 float unsigned" .
", primary key (label, slice, device)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create Urls Table
if ( ! tableExists($gUrlsTable) ) {
$command = "create table $gUrlsTable (" .
"timeAdded int(10) unsigned not null" . // epoch time when this URL was added to the list
", urlhash int(8) unsigned" . // hash for faster searching - use getUrlhashCond()
", urlOrig text character set binary" . // use "binary" so it's case *sensitive*
", urlFixed text" .
", rank int(10) unsigned" .
", ranktmp int(10) unsigned" . // use this while we're updating the rank every night
", other boolean not null" . // is this URL from some other source and we should crawl it EVERY time
", optout boolean not null" . // did the website owner ask us NOT to crawl their site
", primary key (urlOrig(255))" .
", index(urlhash)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create CDF table.
if ( ! tableExists($gCdfTable) ) {
$command = "create table $gCdfTable (" .
"crawlid int unsigned not null" . // crawlid joins with "crawls" table
", slice varchar (32) not null" . // "Top100", "Top1000", "intersection", or "All"
", stat varchar (32) not null"; // the stat for this CDF (eg, PageSpeed, or bytesTotal)
// add columns for all percentages 1-100
for ( $i = 1; $i <= 100; $i++ ) {
$command .= ", percentile$i int unsigned not null";
}
$command .= ", primary key (crawlid, slice, stat)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create Urls Change Table
// If someone chooses to have their site removed, we queue the request here.
if ( ! tableExists($gUrlsChangeTable) ) {
$command = "create table $gUrlsChangeTable (" .
"url text character set binary" . // use "binary" so it's case *sensitive*
", action varchar(16)" . // "add", "remove" for now
", createDate int(10) unsigned not null" .
", primary key (url(255), action)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
// Create Settings Table
if ( ! tableExists($gSettingsTable) ) {
$command = "create table $gSettingsTable (" .
"setting varchar (64)" .
", val varchar (255)" .
", primary key (setting)" .
") ENGINE MyISAM;";
doSimpleCommand($command);
}
}
function createRequestsTable($tablename, $aMergeTables=null) {
global $ghReqHeaders, $ghRespHeaders;
$sColumns = "";
$aColumns = array_values($ghReqHeaders);
sort($aColumns);
for ( $i = 0; $i < count($aColumns); $i++ ) {
$column = $aColumns[$i];
$sColumns .= ", $column varchar (255)";
}
$aColumns = array_values($ghRespHeaders);
sort($aColumns);
for ( $i = 0; $i < count($aColumns); $i++ ) {
$column = $aColumns[$i];
$sColumns .= ", $column varchar (255)";
}
$command = "create table $tablename (" .
"requestid int unsigned not null auto_increment" .
", pageid int unsigned not null" .
", startedDateTime int(10) unsigned" .
", time int(10) unsigned" .
", method varchar (32)" .
", url text" .
", urlShort varchar (255)" .
", redirectUrl text" .
", firstReq tinyint(1) not null" .
", firstHtml tinyint(1) not null" .
// req
", reqHttpVersion varchar (32)" .
", reqHeadersSize int(10) unsigned" .
", reqBodySize int(10) unsigned" .
", reqCookieLen int(10) unsigned not null".
", reqOtherHeaders text" .
// response
", status int(10) unsigned" .
", respHttpVersion varchar (32)" .
", respHeadersSize int(10) unsigned" .
", respBodySize int(10) unsigned" .
", respSize int(10) unsigned" .
", respCookieLen int(10) unsigned not null".
", expAge int unsigned not null" . // number of seconds until response expires
", mimeType varchar(255)" . // slightly normalized version of resp_content_type - ~1000 distinct values
", respOtherHeaders text" .
// headers
$sColumns .
// columns added later
", _cdn_provider varchar(64)" .
", _gzip_save int(10) unsigned not null" .
", crawlid int unsigned not null" .
", type varchar (32)" . // pretty mime type
", ext varchar (8)" . // file extension, eg, .html
", format varchar (8)" . // file format - svg, flash, etc.
", primary key (requestid)" .
", index(pageid)" .
//TODO ", index(crawlid)" .
", unique key (startedDateTime, pageid, urlShort) ) " .
( $aMergeTables ?
"ENGINE=MERGE UNION=(" . implode(",", $aMergeTables) . ") INSERT_METHOD=NO;" :
"ENGINE=MyISAM;" );
doSimpleCommand($command);
}
////////////////////////////////////////////////////////////////////////////////
//
// SCHEMA CHANGES
//
// Let's try something new.
//
// We *used to* make manual changes to the tables using
// the MySQL commandline and document those in the comments for the createTables
// function. But that makes it hard for other people to apply those changes, and
// to apply changes programmatically.
//
// The *new* approach will be to write a function to apply the changes. It would
// also be nice if there was a way to intelligently apply multiple changes as
// needed - but I need to think about how to make the code reflexive.
////////////////////////////////////////////////////////////////////////////////
// === PHASE 1: Add columns for new db schema.
// DONE 1. Add "crawlid" column to pages*
// 2. Add "crawlid" column to requests*
// NO - use "device" to determine this 3. Add "devicetype" column to "crawls". Possible values are "desktop" or "mobile".
// 4. Add "browser" column to "crawls". Possible values are "IE" and "iphone".
// DONE 5. Add "notes" column to crawls for things like "JS disabled" or "faster network".
// 6. Add "crawltype" column to crawls. Possible values are "public", "hidden", "test", "ongoing". Only "public" crawls are visible through the UI.
// DONE 7. Add "crawlid" column to stats
// 8. Remove "label" indexes for "pages*". Create "crawlid" indexes.
//
// === PHASE 2: Add columns for new stats.
// 1. pages.connections (available in old HAR files)
// 2. pages.localStorage (newly added by Pat, bug 379)
// 3. pages.viewport (newly added by Pat)
// 4. pages.docWidth, pages.docHeight (newly added by Pat, bug 380)
// 5. pages.domDepth (newly added by Pat, bug 385)
// 6. pages.doctype (newly added by Pat)
// 7. requests.cdn (available in old HAR files)
// 8. requests.imgWidth, imgHeight, imgDomWidth, imgDomHeight (newly added by Pat, bug 381)
//
// === PHASE 3: Remove unnecessary columns after code is updated to use new columns.
// 1. Remove "label" and "archive" columns from pages.
// 2. Remove the "archive" column from crawls.
// 3. Remove the "archive" column from status.
// 4. Remove "label" from stats
function db_upgrade_mergeTables($table) {
db_addColumn("crawlid", $table, "int unsigned not null", "label");
return;
/*
$aPages = array("pagesdev", "pagesmobile", "pages");
$aLocations = array("IE8", "iphone4", "IE8");
for ( $i = 0; $i < count($aPages), $i++ ) {
$pagestable = $aPages[$i];
$location = $aLocations[$i];
db_pages_addCrawlidColumn($pagestable);
db_pages_setCrawlid($pagestable, $location);
}
*/
// Loop through all the individual requests tables.
// In some cases we're going to have to import this from CSV.
// In the case of mobile we'll have to create the merge tables.
$query = "select crawlid, label, location from crawls where archive = 'All' order by crawlid asc;";
$result = doQuery($query);
while ($row = mysql_fetch_assoc($result)) {
$crawlid = $row['crawlid'];
$label = $row['label'];
$location = $row['location'];
$requeststable = "requests_$crawlid";
tprint("=== $requeststable - $label");
$bDropTable = false;
if ( ! tableExists($requeststable) ) {
importCrawl2($label, "All", $location, true, false);
$bDropTable = true;
}
db_requests_addCrawlidColumn($requeststable);
db_requests_setCrawlid($requeststable);
if ( $bDropTable ) {
tprint("dropping table $requeststable");
dropTable($requeststable);
}
tprint("done $requeststable - $label\n");
tprint("CVSNO - exiting!"); exit(); //CVSNO
}
mysql_free_result($result);
}
function db_pages_addCrawlidColumn($table) {
return db_addColumn("crawlid", $table, "int unsigned not null", "label");
}
// Populate the crawlid column with the appropriate values.
function db_pages_setCrawlid($table, $location) {
if ( ! columnExists("crawlid", $table) ) {
dprint("WARNING: Bailing - column 'crawlid' does not exist in table '$table'.");
return false;
}
// The "pages" table has a "label" column. Use that to determine the crawlid value.
$cmd = "UPDATE $table as p, crawls as c SET p.crawlid = c.crawlid WHERE p.label = c.label and c.location = '$location';";
doSimpleCommand($cmd);
return true;
}
function db_requests_addCrawlidColumn($table) {
return db_addColumn("crawlid", $table, "int unsigned not null", "pageid");
}
// Populate the crawlid column with the appropriate values.
function db_requests_setCrawlid($table) {
if ( ! columnExists("crawlid", $table) ) {
dprint("WARNING: Bailing - column 'crawlid' does not exist in table '$table'.");
return false;
}
// The new "requests" tables have the crawlid in the name - just suck it out and set it.
$crawlid = substr($table, strlen("requests_"));
$cmd = "UPDATE $table SET crawlid = $crawlid;";
doSimpleCommand($cmd);
return true;
}
function db_addColumn($column, $table, $def, $columnAfter="") {
if ( db_columnMissing($column, $table) ) {
$cmd = "alter table $table add column $column $def" . ( $columnAfter ? " after $columnAfter" : "" );
doSimpleCommand($cmd);
}
return columnExists($column, $table);
}
function db_dropColumn($column, $table) {
if ( columnExists($column, $table) ) {
$cmd = "alter table $table drop column $column;";
doSimpleCommand($cmd);
}
else {
dprint("WARNING: Column '$column' does not exist for table '$table'. No need to remove it.");
}
return ! columnExists($column, $table);
}
// Helper function to check that
function db_columnMissing($column, $table) {
if ( ! tableExists($table) ) {
dprint("ERROR: Table '$table' doesn't exist.");
return false;
}
if ( columnExists($column, $table) ) {
dprint("WARNING: Bailing - column '$column' already exists in table '$table'.");
return false;
}
return true;
}
?>