Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
818 lines (698 sloc) 27.3 KB
<?php
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
require_once("utils.inc");
require_once("dbapi.inc");
$gaCdfStats = array(
"TTFB",
"renderStart",
"onContentLoaded",
"onLoad",
"fullyLoaded",
"visualComplete",
"SpeedIndex",
"reqTotal",
"reqHtml",
"reqJS",
"reqCSS",
"reqImg",
"reqGif",
"reqJpg",
"reqPng",
"reqFlash",
"reqFont",
"reqJson",
"reqOther",
"reqAudio",
"reqVideo",
"reqText",
"reqXml",
"reqWebp",
"reqSvg",
"bytesTotal",
"bytesHtml",
"bytesJS",
"bytesCSS",
"bytesImg",
"bytesGif",
"bytesJpg",
"bytesPng",
"bytesFlash",
"bytesFont",
"bytesJson",
"bytesOther",
"bytesHtmlDoc",
"bytesAudio",
"bytesVideo",
"bytesText",
"bytesXml",
"bytesWebp",
"bytesSvg",
"numDomains",
"maxDomainReqs",
"numRedirects",
"numErrors",
"numGlibs",
"numHttps",
"numCompressed",
"numDomElements",
"gzipTotal",
"gzipSavings",
"_connections",
"maxage0",
"avg_dom_depth",
"document_height",
"document_width",
"localstorage_size",
"sessionstorage_size",
"num_iframes",
"num_scripts",
"num_scripts_async",
"num_scripts_sync",
"usertiming"
);
function addStats($label, $slice, $device, $hStats=null, $crawlid=null) {
if ( getStatsData($label, $slice, $device) ) {
return true;
}
if ( ! $hStats ) {
$hStats = computeStats($label, $slice, $device);
}
return addStatsData($label, $slice, $device, $hStats, $crawlid);
}
function replaceStats($label, $slice=null, $device=null, $hStats=null, $crawlid=null) {
$device = ( $device ? $device : curDevice() );
if ( $slice ) {
if ( ! $hStats ) {
$hStats = computeStats($label, $slice, $device);
}
removeStats($label, $slice, $device);
addStatsData($label, $slice, $device, $hStats, $crawlid);
}
else {
// If $slice is NOT provided, do ALL slices (eg, "All", "Top1000", "Top100").
$aSlices = sliceNames(false);
foreach ( $aSlices as $slice ) {
$hStats = computeStats($label, $slice, $device);
removeStats($label, $slice, $device);
addStatsData($label, $slice, $device, $hStats, $crawlid);
}
}
}
function getStats($label, $slice, $device, $url=NULL) {
if ( $url ) {
return getStatsDataForUrl($label, $slice, $device, $url);
}
$hStats = getStatsData($label, $slice, $device);
if ( ! $hStats ) {
// We do NOT want to do this because it might get called in the middle of a run.
//addStats($label, $slice, $device);
//$hStats = getStatsData($label, $slice, $device);
}
return $hStats;
}
function getTrends($slice, $device, $url=NULL) {
if ( $url ) {
return getTrendsDataForUrl($url, $device);
}
else {
return getTrendsData($slice, $device);
}
}
// If $bUpdate = false then only compute stats if there are NO stats for the run (ie, the entire row is missing).
function computeMissingStats($device, $bVerbose = false, $bUpdate = false, $label = null) {
$aDevices = ( $device ? array($device) : allDevices() );
$aLabels = ( $label ? array($label) : archiveLabels(null, false, "n/j/y", true) );
$aSlices = sliceNames();
foreach ( $aDevices as $device ) {
for ( $i = count($aLabels)-1; $i >= 0; $i-- ) {
// do labels in reverse chrono order so newest are ready first
$label = $aLabels[$i];
foreach ( $aSlices as $slice ) {
if ( $bUpdate ) {
if ( $bVerbose ) { lprint("updating $label $slice $device..."); }
updateStats($label, $slice, $device);
if ( $bVerbose ) { lprint("DONE"); }
}
else if ( ! getStatsData($label, $slice, $device) ) {
if ( $bVerbose ) { lprint("adding $label $slice $device..."); }
addStats($label, $slice, $device);
if ( $bVerbose ) { lprint("DONE"); }
}
}
}
}
}
// TODO - from dbapi.inc
////////////////////////////////////////////////////////////////////////////////
//
// STATS
//
////////////////////////////////////////////////////////////////////////////////
// List of fields (including mysql massaging) for selecting data from the "pages" table and saving to the "stats" table.
$gaTrendsCompute = array(
"ROUND(AVG(onLoad)) as onLoad",
"ROUND(AVG(TTFB)) as TTFB",
"ROUND(AVG(renderStart)) as renderStart",
"ROUND(AVG(visualComplete)) as visualComplete",
"ROUND(AVG(fullyLoaded)) as fullyLoaded",
"ROUND(AVG(SpeedIndex)) as SpeedIndex",
"ROUND(AVG(bytesHtmlDoc)) as bytesHtmlDoc",
"ROUND(AVG(gzipTotal)) as gzipTotal",
"ROUND(AVG(gzipSavings)) as gzipSavings",
"ROUND(AVG(numDomElements)) as numDomElements",
"ROUND(AVG(numDomains)) as numDomains",
"ROUND(AVG(maxDomainReqs)) as maxDomainReqs",
"ROUND(AVG(reqTotal), 1) as reqTotal",
"ROUND(AVG(reqHtml), 1) as reqHtml",
"ROUND(AVG(reqJS), 1) as reqJS",
"ROUND(AVG(reqCSS), 1) as reqCSS",
"ROUND(AVG(reqImg), 1) as reqImg",
"ROUND(AVG(reqFlash), 1) as reqFlash",
//"ROUND(AVG(reqJson), 1) as reqJson", // too hard to identify based on mimeType
"ROUND(AVG(reqOther), 1) as reqOther",
"ROUND(AVG(reqFont), 1) as reqFont",
"ROUND(AVG(reqGif), 1) as reqGif",
"ROUND(AVG(reqJpg), 1) as reqJpg",
"ROUND(AVG(reqPng), 1) as reqPng",
"ROUND(AVG(reqAudio), 1) as reqAudio",
"ROUND(AVG(reqVideo), 1) as reqVideo",
"ROUND(AVG(reqText), 1) as reqText",
"ROUND(AVG(reqXml), 1) as reqXml",
"ROUND(AVG(reqWebp), 1) as reqWebp",
"ROUND(AVG(reqSvg), 1) as reqSvg",
"ROUND(AVG(bytesTotal)) as bytesTotal",
"ROUND(AVG(bytesHtml)) as bytesHtml",
"ROUND(AVG(bytesJS)) as bytesJS",
"ROUND(AVG(bytesCSS)) as bytesCSS",
"ROUND(AVG(bytesImg)) as bytesImg",
"ROUND(AVG(bytesFlash)) as bytesFlash",
//"ROUND(AVG(bytesJson)) as bytesJson", // too hard to identify based on mimeType
"ROUND(AVG(bytesOther)) as bytesOther",
"ROUND(AVG(bytesFont)) as bytesFont",
"ROUND(AVG(bytesGif)) as bytesGif",
"ROUND(AVG(bytesJpg)) as bytesJpg",
"ROUND(AVG(bytesPng)) as bytesPng",
"ROUND(AVG(bytesAudio)) as bytesAudio",
"ROUND(AVG(bytesVideo)) as bytesVideo",
"ROUND(AVG(bytesText)) as bytesText",
"ROUND(AVG(bytesXml)) as bytesXml",
"ROUND(AVG(bytesWebp)) as bytesWebp",
"ROUND(AVG(bytesSvg)) as bytesSvg",
// columns added later
"ROUND(AVG(_connections)) as _connections",
"ROUND(AVG(avg_dom_depth)) as avg_dom_depth",
"ROUND(AVG(document_height)) as document_height",
"ROUND(AVG(document_width)) as document_width",
"ROUND(AVG(localstorage_size)) as localstorage_size",
"ROUND(AVG(sessionstorage_size)) as sessionstorage_size",
"ROUND(AVG(num_iframes)) as num_iframes",
"ROUND(AVG(num_scripts)) as num_scripts",
"ROUND(AVG(num_scripts_async)) as num_scripts_async",
"ROUND(AVG(num_scripts_sync)) as num_scripts_sync",
"ROUND(AVG(usertiming)) as usertiming"
);
// List of fields (including mysql massaging) to extract from the "stats" table to show as trend charts.
// This list is used for labels AND for individual websites.
$gaTrendsGet = array(
"onLoad",
"TTFB",
"renderStart",
"visualComplete",
"fullyLoaded",
"SpeedIndex",
"ROUND(bytesHtmlDoc/1024) as bytesHtmlDoc",
// gzipTotal?
// gzipSavings?
"numDomElements",
"numDomains",
"maxDomainReqs",
"maxage0",
"_connections",
"avg_dom_depth",
"num_iframes",
"num_scripts",
"num_scripts_async",
"num_scripts_sync",
"usertiming",
"ROUND(reqTotal) as reqTotal", // if there are A LOT of requests, do ROUND
"ROUND(reqHtml, 1) as reqHtml", // if there are A FEW requests, show a float
"ROUND(reqJS) as reqJS",
"ROUND(reqCSS, 1) as reqCSS",
"ROUND(reqImg) as reqImg",
"ROUND(reqFlash, 1) as reqFlash",
"ROUND(reqFont, 1) as reqFont",
"ROUND(reqOther, 1) as reqOther",
"ROUND(bytesTotal/1024) as bytesTotal",
"ROUND(bytesHtml/1024) as bytesHtml",
"ROUND(bytesJS/1024) as bytesJS",
"ROUND(bytesCSS/1024) as bytesCSS",
"ROUND(bytesImg/1024) as bytesImg",
"ROUND(bytesFlash/1024) as bytesFlash",
"ROUND(bytesFont/1024) as bytesFont",
"ROUND(bytesOther/1024) as bytesOther"
);
// List of fields (including mysql massaging) to extract from the "stats" table to show as trend charts.
// This list is used for labels only (NOT websites);
$gaTrendsGetLabel = array(
"perFlash",
"perFonts",
"perGlibs",
"perErrors",
"perHttps",
"perCompressed",
"perRedirects",
"perCdn"
);
// return a hash of values for a single stats record
function addStatsData($label, $slice, $device, $hTuples, $crawlid=null) {
$hTuples["label"] = $label;
$hTuples["slice"] = $slice;
$hTuples["device"] = $device;
if ( $crawlid ) {
$hTuples["crawlid"] = $crawlid;
}
$statstable = statsTable($label, $slice, $device);
$cmd = "replace into $statstable set " . hashImplode(", ", "=", $hTuples) . ";";
doSimpleCommand($cmd);
return true;
}
// return a hash of values for a single stats record
function getStatsData($label, $slice, $device) {
$statstable = statsTable($label, $slice, $device);
$query = "select * from $statstable where label='$label' and slice='$slice' and " . deviceWhere($device) . ";";
$row = doRowQuery($query);
return $row;
}
// return a hash of values for a single CDF record
function getCdfData($label, $slice, $device) {
global $gCdfTable;
$crawl = getCrawl($label, null, $device);
$query = "select * from $gCdfTable where crawlid={$crawl['crawlid']} and slice='$slice';";
$result2 = doQuery($query);
$hCdf = array();
while ( $row2 = mysql_fetch_array($result2, MYSQL_NUM) ) {
// This is rickety. We assume the columns are returned in a certain order.
$stat = $row2[2];
$hCdf["cdf_$stat"] = implode(",", array_slice($row2, 3));
}
mysql_free_result($result2);
return $hCdf;
}
// return a hash of values for a single stats record
function getStatsDataForUrl($label, $slice, $device, $url) {
$pagesTable = pagesTable($label, $slice, $device);
// Get the info from the pages table.
$query = "select * from $pagesTable where label='$label' and url='$url';";
$hStats = doRowQuery($query);
// Add other derived info.
$pageidCond = "pageid=" . $hStats['pageid'];
$totalRequests = $hStats['reqTotal'];
// https
$hStats['perHttps'] = round(100*$hStats['numHttps']/$totalRequests);
// max-age caching - convert absolute count to percentage
$hStats['maxageNull'] = round(100 * $hStats['maxageNull']/$totalRequests);
$hStats['maxage0'] = round(100 * $hStats['maxage0'] / $totalRequests);
$hStats['maxage1'] = round(100 * $hStats['maxage1'] / $totalRequests);
$hStats['maxage30'] = round(100 * $hStats['maxage30'] / $totalRequests);
$hStats['maxage365'] = round(100 * $hStats['maxage365'] / $totalRequests);
$hStats['maxageMore'] = round(100 * $hStats['maxageMore'] / $totalRequests);
return $hStats;
}
// Returns a hash where the key is the label (run name, eg, "Oct 1 2011").
// The value for each key is another hash of all the trends data fields.
function getTrendsData($slice, $device) {
global $gaTrendsGet, $gaTrendsGetLabel, $gCdfTable, $gCrawlsTable, $gPercentile;
$statsTable = statsTable(NULL, $slice, $device);
// TODO - This hardcoded list of DB fields must be synced with the list in trends.inc.
$query = "select label" .
", numurls" .
", " . implode(", ", $gaTrendsGet) .
", " . implode(", ", $gaTrendsGetLabel) .
" from $statsTable where slice='$slice' and " . deviceWhere($device) . ";";
$result = doQuery($query);
$hTrends = array();
while ( $row = mysql_fetch_assoc($result) ) {
$hTrends[$row['label']] = $row;
}
mysql_free_result($result);
// add CDF data
if ( $gPercentile ) {
$query = "select crawls.crawlid as crawlid, label, stat, percentile$gPercentile as value from $gCdfTable as cdf, $gCrawlsTable as crawls where cdf.crawlid=crawls.crawlid and slice='$slice' and location='$device';";
$result = doQuery($query);
while ( $row = mysql_fetch_assoc($result) ) {
// We're adding keys like "reqTotal50" to $hTrends:
$label = $row['label'];
$stat = $row['stat'];
$value = $row['value'];
// We need the units to match between these CDF stats and the stats in the "pages" table,
// so we have to convert bytes to KB here.
$hTrends[$label]["$stat$gPercentile"] = ( 0 === strpos($stat, "bytes") ? round($value/1024) : $value );
}
mysql_free_result($result);
}
return $hTrends;
}
// Returns a hash where the key is the label (run name, eg, "Oct 1 2011").
// The value for each key is another hash of all the trends data fields.
// WARNING: This is slow and will get slower as we get more data!
// It searches through EVERY RUN!
// ideas:
// - Add a "urlhash" field to the pages table (same as urls table) - right now we're searching on "url".
// - Cache trending data per URL - that's a big table.
function getTrendsDataForUrl($url, $device) {
global $gaTrendsGet;
$pagesTable = pagesTable(NULL, NULL, $device);
$query = "select label" .
", count(*) as numurls" .
", " . implode(", ", $gaTrendsGet) .
" from $pagesTable where " . getUrlhashCond($url) . " and url = '$url' group by label;";
$result = doQuery($query);
$hTrends = array();
while ( $row = mysql_fetch_assoc($result) ) {
$hTrends[$row['label']] = $row;
}
mysql_free_result($result);
return $hTrends;
}
// return a hash of values for a single stats record
function removeStats($label, $slice=NULL, $device=NULL) {
$aWhere = array();
if ( $label ) {
$aWhere[] = "label='$label'";
}
if ( $slice ) {
$aWhere[] = "slice='$slice'";
}
if ( $device ) {
$aWhere[] = "device='$device'";
}
$sWhere = implode(" and ", $aWhere);
if ( ! $sWhere ) {
tprint("ERROR: need to specificy paraeters to removeStats.\n");
return NULL;
}
$statstable = statsTable($label, $slice, $device);
$cmd = "delete from $statstable where $sWhere" . ";";
doSimpleCommand($cmd);
return true;
}
// Fill in any missing stats - This is complex given the ease of computing SOME stats
// while other stats are VERY time consuming to compute.
// return a hash of stats
function updateStats($label, $slice, $device) {
// Get any existing stats (potentially old with gaps, possibly missing altogether).
$hStats = getStatsData($label, $slice, $device);
if ( ! $hStats ) {
// No stats! Re-compute EVERYTHING!
$hStats = computeStats($label, $slice, $device);
}
else {
$sliceCond = sliceCond($label, $slice, $device); // this might be expensive - compute it once
$newStats = computeTrendStats($label, $slice, $device, $sliceCond);
// TODO - We don't try to update these for now.
//$hStats += computeCorrelations($label, $slice, $device, $sliceCond, "onLoad");
//$hStats += computeCorrelations($label, $slice, $device, $sliceCond, "renderStart");
//$hStats += computeCorrelations($label, $slice, $device, $sliceCond, "SpeedIndex");
computeOther($label, $slice, $device, $sliceCond, $hStats);
computeCdfForCrawl($label);
$hStats = array_merge($hStats, $newStats); // $newStats will overwrite $hStats for duplicate keys
}
return addStatsData($label, $slice, $device, $hStats);
}
// return a hash of stats
// (FYI - we NEVER enter here with $slice = "url")
function computeStats($label, $slice, $device) {
$hStats = array();
$sliceCond = sliceCond($label, $slice, $device); // this might be expensive - compute it once
$hStats += computeTrendStats($label, $slice, $device, $sliceCond);
$hStats += computeCorrelations($label, $slice, $device, $sliceCond, "onLoad");
$hStats += computeCorrelations($label, $slice, $device, $sliceCond, "renderStart");
$hStats += computeCorrelations($label, $slice, $device, $sliceCond, "SpeedIndex");
$hStats += computeOther($label, $slice, $device, $sliceCond);
computeCdfForCrawl($label);
return $hStats;
}
function computeTrendStats($label, $slice, $device, $sliceCond) {
global $gArchive, $gaTrendsCompute;
$aFields = $gaTrendsCompute;
$pagesTable = pagesTable($label, $slice, $device);
$query = "select count(*) as numurls, " .
implode(", ", $gaTrendsCompute) .
" from $pagesTable where archive = '$gArchive' and label='$label' and $sliceCond;";
$row = doRowQuery($query);
return $row;
}
function computeCorrelations($label, $slice, $device, $sliceCond, $var1) {
$pagesTable = pagesTable($label, $slice, $device);
// TODO - make this more flexible
$hCC = array(); // hash key is the CC *VALUE*
$aVars = array("num_scripts_async", "num_scripts_sync", "num_scripts", "(num_scripts_sync+reqCss)", "(num_scripts_async+reqCss)", "(num_scripts+reqCss)", "(reqJS+reqCss)", "reqTotal", "reqHtml", "reqJS", "reqCSS", "reqImg", "reqGif", "reqJpg", "reqPng", "reqFont", "reqOther", "bytesTotal", "bytesHtml", "bytesJS", "bytesCSS", "bytesImg", "bytesGif", "bytesJpg", "bytesPng", "bytesFont", "bytesOther", "bytesHtmlDoc", "numDomains", "maxDomainReqs", "numRedirects", "numErrors", "numHttps", "numCompressed", "numDomElements", "maxage0");
foreach ($aVars as $var2) {
// from http://www.freeopenbook.com/mysqlcookbook/mysqlckbk-chp-13-sect-6.html
$cmd = "SELECT @n := COUNT($var1) AS n, @sumX := SUM($var2) AS 'sumX', @sumXX := SUM($var2*$var2) AS 'sumXX', @sumY := SUM($var1) AS 'sumY', @sumYY := SUM($var1*$var1) AS 'sumYY', @sumXY := SUM($var2*$var1) AS 'sumXY' FROM $pagesTable where $sliceCond and $var2 is not null and $var2 > 0;";
$row = doRowQuery($cmd);
$n = $row['n'];
if ( $n ) {
$sumX = $row['sumX'];
$sumXX = $row['sumXX'];
$sumY = $row['sumY'];
$sumYY = $row['sumYY'];
$sumXY = $row['sumXY'];
$denominator = sqrt( (($n*$sumXX) - ($sumX*$sumX)) * (($n*$sumYY) - ($sumY*$sumY)) );
if ( $denominator ) { // avoid divide by zero errors
$cc = (($n*$sumXY) - ($sumX*$sumY)) / $denominator;
// I want to sort the results by correlation coefficient ($cc),
// so I use $cc as the hash key. But, $cc is not unique
// (it's possible for two variables to have the same $cc).
// So the value for each hash entry is an array of variable name(s).
if ( ! array_key_exists("$cc", $hCC) ) {
$hCC["$cc"] = array();
}
array_push($hCC["$cc"], $var2);
}
}
}
// Take the CC values(!) in descending order and return the top 5 results.
$aCC = array_keys($hCC);
rsort($aCC, SORT_NUMERIC);
$iRows = 0;
$hTuples = array();
foreach($aCC as $cc) {
$prettyCC = round($cc*100)/100;
foreach($hCC[$cc] as $var2) {
$iRows++;
$hTuples[$var1 . "ccf$iRows"] = $var2; // "ccf" == Correlation Coefficient Field (db column)
$hTuples[$var1 . "ccv$iRows"] = $prettyCC; // "ccv" == Correlation Coefficient Value
if ( 5 <= $iRows ) {
break;
}
}
if ( 5 <= $iRows ) {
break;
}
}
return $hTuples;
}
// "Special" stats that often require a more complex query against the requests table.
function computeOther($label, $slice, $device, $sliceCond, &$hTuples = array()) {
$pagesTable = pagesTable($label, $slice, $device);
// total page & request count so we can do percentages
$totalPages = doSimpleQuery("select count(*) from $pagesTable where $sliceCond;");
$totalRequests = doSimpleQuery("select sum(reqTotal) from $pagesTable where $sliceCond;");
if ( ! $totalPages ) {
return $hTuples;
}
// redirects
if ( !array_key_exists('perRedirects', $hTuples) || null === $hTuples['perRedirects'] ) {
$num = doSimpleQuery("select count(*) from $pagesTable where $sliceCond and numRedirects > 0;");
$hTuples['perRedirects'] = round(100*$num/$totalPages);
}
// errors
if ( !array_key_exists('perErrors', $hTuples) || null === $hTuples['perErrors'] ) {
$num = doSimpleQuery("select count(*) from $pagesTable where $sliceCond and numErrors > 0;");
$hTuples['perErrors'] = round(100*$num/$totalPages);
}
// flash usage
if ( !array_key_exists('perFlash', $hTuples) || null === $hTuples['perFlash'] ) {
$num = doSimpleQuery("select count(*) from $pagesTable where $sliceCond and reqFlash > 0;");
$hTuples['perFlash'] = round(100*$num/$totalPages);
}
// custom fonts
if ( !array_key_exists('perFonts', $hTuples) || null === $hTuples['perFonts'] ) {
$num = doSimpleQuery("select count(*) from $pagesTable where $sliceCond and reqFont > 0;");
$hTuples['perFonts'] = round(100*$num/$totalPages);
}
// Google Ajax Libraries
if ( !array_key_exists('perGlibs', $hTuples) || null === $hTuples['perGlibs'] ) {
$num = doSimpleQuery("select count(*) from $pagesTable where $sliceCond and numGlibs > 0;");
$hTuples['perGlibs'] = round(100*$num/$totalPages);
}
// https
if ( !array_key_exists('perHttps', $hTuples) || null === $hTuples['perHttps'] ) {
$num = doSimpleQuery("select sum(numHttps) from $pagesTable where $sliceCond;");
$hTuples['perHttps'] = round(100*$num/$totalRequests);
}
// max-age caching - percent of requests in each histogram bin
if ( !array_key_exists('maxageNull', $hTuples) || null === $hTuples['maxageNull'] ) {
$row = doRowQuery("select sum(maxageNull) as maxageNull, sum(maxage0) as maxage0, sum(maxage1) as maxage1, sum(maxage30) as maxage30, sum(maxage365) as maxage365, sum(maxageMore) as maxageMore from $pagesTable where $sliceCond;");
$hTuples['maxageNull'] = round(100 * $row['maxageNull'] /$totalRequests);
$hTuples['maxage0'] = round(100 * $row['maxage0'] / $totalRequests);
$hTuples['maxage1'] = round(100 * $row['maxage1'] / $totalRequests);
$hTuples['maxage30'] = round(100 * $row['maxage30'] / $totalRequests);
$hTuples['maxage365'] = round(100 * $row['maxage365'] / $totalRequests);
$hTuples['maxageMore'] = round(100 * $row['maxageMore'] / $totalRequests);
}
// gzip
if ( !array_key_exists('perCompressed', $hTuples) || null === $hTuples['perCompressed'] ) {
// This could be improved by including compressed fonts, excluding compressed binaries (eg images), and excluding files under 1K.
$query = "select sum(numCompressed) as compressed, sum(reqHtml) + sum(reqJS) + sum(reqCSS) + sum(reqJson) as candidates from $pagesTable where $sliceCond;";
$row = doRowQuery($query);
$hTuples['perCompressed'] = round(100 * $row['compressed'] / $row['candidates']);
}
// CDN
if ( !array_key_exists('perCdn', $hTuples) || null === $hTuples['perCdn'] ) {
$num = doSimpleQuery("select count(*) from $pagesTable where $sliceCond and cdn is not null;");
$hTuples['perCdn'] = round(100*$num/$totalPages);
}
return $hTuples;
}
// Helper function to fix a crawl's expAge values for all requests.
function updateCrawlExpAge($label) {
global $gRequestsTable;
$crawl = getCrawl($label);
if ( $crawl ) {
$query = "select requestid, expAge, resp_cache_control, resp_expires, resp_date, startedDateTime from $gRequestsTable where pageid >= {$crawl['minPageid']} and pageid <= {$crawl['maxPageid']};";
$result = doQuery($query);
while ($row = mysql_fetch_assoc($result)) {
$newExpAge = computeRequestExpAge($row['resp_cache_control'], $row['resp_expires'], $row['resp_date'], $row['startedDateTime']);
if ( $newExpAge != $row['expAge'] ) {
$cmd = "update $gRequestsTable set expAge = $newExpAge where requestid = {$row['requestid']};";
doSimpleCommand($cmd);
}
}
mysql_free_result($result);
}
}
// Helper function to fix a request's expAge value
function computeRequestExpAge($cc, $resp_expires, $resp_date, $startedDateTime) {
$expAge = 0;
if ( $cc &&
( FALSE !== stripos($cc, "must-revalidate") || FALSE !== stripos($cc, "no-cache") || FALSE !== stripos($cc, "no-store") ) ) {
// These directives dictate the response can NOT be cached.
$expAge = 0;
}
else if ( $cc && FALSE !== ($posMaxage = stripos($cc, "max-age=")) ) {
$expAge = intval(substr($cc, $posMaxage+8));
}
else if ( $resp_expires ) {
// According to RFC 2616 ( http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.2.4 ):
// freshness_lifetime = expires_value - date_value
// If the Date: response header is present, we use that.
// Otherwise we fall back to $startedDateTime which is based on the client so might suffer from clock skew.
$startEpoch = ( $resp_date ? strtotime($resp_date) : $startedDateTime );
$expAge = strtotime($resp_expires) - $startEpoch;
}
return $expAge;
}
// Helper function to fix a crawl's maxage* values.
function updateCrawlMaxage($label) {
global $gPagesTable;
$result = doQuery("select pageid from $gPagesTable where label='$label';");
while ($row = mysql_fetch_assoc($result)) {
updatePageMaxage($row['pageid']);
}
mysql_free_result($result);
}
// Helper function to fix a page's maxage* values.
function updatePageMaxage($pageid) {
global $gPagesTable, $gRequestsTable;
$result = doQuery("select expAge from $gRequestsTable where pageid = $pageid;");
$maxageNull = $maxage0 = $maxage1 = $maxage30 = $maxage365 = $maxageMore = 0;
while ($row = mysql_fetch_assoc($result)) {
// count expiration windows
$expAge = $row['expAge'];
$daySecs = 24*60*60;
if ( NULL === $expAge ) {
$maxageNull++;
}
else if ( 0 === intval($expAge) ) {
$maxage0++;
}
else if ( $expAge <= (1 * $daySecs) ) {
$maxage1++;
}
else if ( $expAge <= (30 * $daySecs) ) {
$maxage30++;
}
else if ( $expAge <= (365 * $daySecs) ) {
$maxage365++;
}
else {
$maxageMore++;
}
}
mysql_free_result($result);
$cmd = "UPDATE $gPagesTable SET " .
" maxageNull = $maxageNull" .
", maxage0 = $maxage0" .
", maxage1 = $maxage1" .
", maxage30 = $maxage30" .
", maxage365 = $maxage365" .
", maxageMore = $maxageMore" .
" where pageid = $pageid;";
doSimpleCommand($cmd);
}
function computeCdfForCrawl($label) {
$crawl = getCrawl($label);
$aSlices = array("All", "Top1000", "Top100");
foreach( $aSlices as $slice ) {
computeCdf($crawl, $slice);
}
}
// Update the "cdf" table for the desired CDF stats.
function computeCdf($crawl, $slice) {
global $gaCdfStats, $gCdfTable, $gPagesTable;
$crawlid = $crawl['crawlid'];
$label = $crawl['label'];
$device = $crawl['location'];
$sliceCond = sliceCond($label, $slice, $device);
$query = "select count(*) as num from $gPagesTable where $sliceCond;";
$numUrls = doSimpleQuery($query);
if ( 0 == $numUrls ) {
echo "ERROR: no URLs for query: $query\n";
return;
}
foreach ( $gaCdfStats as $stat ) {
$query = "select $stat from $gPagesTable where $sliceCond order by $stat asc;";
$result = doQuery($query);
$nRows = mysql_num_rows($result);
if ( $nRows != $numUrls ) {
echo "WARNING: Row count differs for \"$stat\" for slice \"$slice\": should be $numUrls but only $nRows rows found.\n $query\n\n";
}
$aTuples = array();
for ( $i = 1; $i <= 100; $i++ ) {
// skip to the Nth row in the results that corresponds to this percentile
// TODO - if N is between rows we should really take an average of the two adjacent values
$limit = ceil($i * $nRows / 100) - 1;
mysql_data_seek($result, $limit);
if ( $row = mysql_fetch_array($result) ) {
$val = ( $row[0] ? $row[0] : "0" );
array_push($aTuples, "percentile$i = $val");
}
}
mysql_free_result($result);
$cmd = "REPLACE INTO $gCdfTable SET crawlid=$crawlid, slice='$slice', stat='$stat', " . implode(", ", $aTuples) . ";";
doSimpleCommand($cmd);
}
}
?>