#!/usr/bin/perl -w
########################################################################
#
# analyze-traffic.pl
#
# Written by Jeffrey M. Vinocur
# This work is hereby placed in the public domain by its author.
#
########################################################################
#
# Usage:
#
# 1. Add an entry in $pathetc/newsfeeds like the one below, and issue
#    `ctlinnd reload newsfeeds traffic` (you can change the path to
#    whatever you like).
#
#            analyze!\
#                    :*\
#                    :Tf,WgsbmnN:/usr/local/news/log/traffic
#
#    You may find it useful to restrict the articles being logged,
#    either by modifying the wildmat pattern, or by using the /exclude
#    notation to indicate articles that have passed through some
#    servers should not be included.
#
#    Also, if $USE_ALL_GROUPS (see below) is false, you can leave out
#    the N flag, which may eliminate some parsing errors.
#
# 2. Wait for some data.
#
# 3. Run analyze-traffic.pl on the logged data (you can pass the
#    filename as an argument or feed the data on standard input).
#    You probably want to pipe it into a file, `less`, or `tail` as the
#    output is a line for every group that has received an article
#    according to the input data.
#
#    There are some options hardcoded into the script below, under
#    "Constants" -- check for customization, if you like.
#
# 4. Be sure to comment out the newsfeeds entry when done, or set
#    up some sort of log rotation, or INN will eventually fill up your
#    disk...
#
########################################################################
#
# Implementation notes and known bugs:
#
# - We try (if $USE_ALL_GROUPS is set, below) to count crossposted
#   articles towards each listed group (even ones not carried on the
#   server!), but since some articles have funky Newsgroups headers,
#   that can backfire. So parsing can fail, which usually results in
#   the relevant line being skipped, but occasionally can cause Perl to
#   issue warnings (and perhaps produce funny things in the output).
#
#   A workaround would be to repeat e.g. the Message-ID at the end of
#   the input format (i.e. WgsbmnNm), and then the script could read as
#   many lines as necessary until that ad hoc end-of-record marker
#   appeared. I haven't found a need for this yet, though.
#
# - The input format is a sequence of lines, each containing a number of
#   space-separated fields. Check newsfeeds(5) for what the semantics
#   are, but an example line (wrapped), for reference, looks like:
#
#       rec.aviation.military [space]
#       news-out.maxwell.syr.edu [space]
#       2796 [space]
#       <3Jvua.104184$My6.1642017@twister.tampabay.rr.com> [space]
#       @030247454E45524C31000016AD3100000004@ [space]
#       rec.aviation.military,rec.travel.usa-canada, [no space here]
#       sci.electronics.design,sci.econ,sci.environment
#
# - The output format is a sequence of lines, one for each newsgroup,
#   with three tab-separated fields. They are sorted by either the
#   second or third field, depending on $SORT_BY_SIZE, below. The first
#   field is the name of the newsgroup. The second is the total number
#   of articles appearing in that newsgroup followed by, in parentheses,
#   the short name of the peer (see about $TLD_REGEX below) responsible
#   for the most articles and the percentage it made up. The third is
#   the total number of kilobytes of (accepted) traffic in that
#   newsgroup, followed similarly by the peer responsible for the most
#   traffic in that group. It looks something like this:
#
#       news.lists.filters 1057 arts (63% syr) 7105.9 KB (36% cox)
#
#   The short names are made by taking the last component of the
#   (dot-separated) peer name that doesn't match /$TLD_REGEX/. The idea
#   is that, for example, "isc.org" would be listed as "isc", and
#   "demon.co.uk" would be listed as "demon". Adjust $TLD_REGEX as
#   needed to trim the top-level domains in your part of the world.
#
#   If your peers have very long short names, the output may look
#   somewhat funny.
#   Similar things can happen with newsgroup names, so
#   those longer than $FIELD1_WIDTH will be truncated to fit. (You can
#   set $FIELD1_WIDTH to '' to skip this truncation, in which case the
#   first column will not be space-padded and the output will look a bit
#   ragged.)
#
########################################################################
#
# Constants:

use strict;
use warnings;

my $USE_ALL_GROUPS = 1;  # if 0, use only group article is stored under
my $SORT_BY_SIZE = 1;    # if 0, sort output by number of articles
my $FIELD1_WIDTH = 30;   # maximum length of newsgroup name, '' for none

my $TLD_REGEX = '^(?:com|net|org|edu|gov|mil|ac|co|uk|au|ca|de)$';
                         # feel free to add any others as needed

########################################################################

# %stats maps a newsgroup name to
#   { count => articles, bytes => total size,
#     peers => { peer name => { count => ..., bytes => ... } } }
my %stats;

# Pass 1: tally every logged article, read from the files named on the
# command line or from standard input.
while (my $line = <>) {
    my ($group, $peer, $bytes, $id, $token, @Newsgroups) = split ' ', $line;

    # Bad input line: too few fields for the selected mode.
    next unless ($USE_ALL_GROUPS ? @Newsgroups : $token);

    # Bad input line: a mis-parsed record (e.g. the continuation of a
    # folded Newsgroups header) puts something other than a byte count
    # in the size column; counting it would only add garbage.
    next unless $bytes =~ /\A\d+\z/;

    # The Newsgroups field may have been split on embedded whitespace, so
    # re-split every piece on commas; drop the empty names produced by
    # leading or doubled commas.
    my @groups = grep { length } map { split /\s*,\s*/ } @Newsgroups;

    # Credit each listed group, or just the storage group when crosspost
    # counting is disabled or no usable group name was found.
    my @credited = ($USE_ALL_GROUPS && @groups) ? @groups : ($group);

    for my $name (@credited) {
        my $s = $stats{$name} ||= { count => 0, bytes => 0, peers => {} };

        $s->{count}++;
        $s->{bytes} += $bytes;
        $s->{peers}{$peer}{count}++;
        $s->{peers}{$peer}{bytes} += $bytes;
    }
}

# Pass 2: one report line per group, in ascending order of the selected
# field, with the group name as a tie-break so output is reproducible.
my $f = $SORT_BY_SIZE ? 'bytes' : 'count';

for my $name (sort { $stats{$a}{$f} <=> $stats{$b}{$f} or $a cmp $b }
              keys %stats) {
    my $s = $stats{$name};

    my ($topcount, $topcountwho) = max('count', $s->{peers});
    my ($topbytes, $topbyteswho) = max('bytes', $s->{peers});

    $topcountwho = trim($topcountwho);
    $topbyteswho = trim($topbyteswho);

    # count is at least 1 for any group present in %stats, but the byte
    # total can legitimately be 0, so guard that division.
    my $countf = int(100 * $topcount / $s->{count});
    my $bytesf = $s->{bytes} ? int(100 * $topbytes / $s->{bytes}) : 0;

    # Kilobytes truncated (not rounded) to one decimal place.
    my $kb = 0.1 * int($s->{bytes} * 10 / 1024);

    my $ng = $FIELD1_WIDTH eq '' ? $name : substr($name, 0, $FIELD1_WIDTH);

    # With $FIELD1_WIDTH set to '' the first conversion is "%-s", i.e.
    # no padding.
    printf "%-${FIELD1_WIDTH}s\t%d arts (%d%% %s)\t%s KB (%d%% %s)\n",
        $ng,
        $s->{count}, $countf, $topcountwho,
        $kb,         $bytesf, $topbyteswho;
}

1;

# trim($peer)
#
# Returns the short name of a peer: the last dot-separated component of
# $peer that does not match /$TLD_REGEX/. If every component matches
# (or $peer is undefined or empty), $peer is returned unchanged.
sub trim {
    my ($peer) = @_;
    return $peer unless defined $peer && length $peer;

    my @parts = split /\./, $peer;

    # Walk from the right, discarding TLD-like components.
    while (defined(my $part = pop @parts)) {
        return $part unless $part =~ /$TLD_REGEX/;
    }

    return $peer;
}

# max($field, $listref)
#
# $listref is a reference to a hash of peer name => { $field => number }.
# Returns the list ($value, $peer) for the peer with the largest $field.
# Ties go to the peer whose name sorts first, so the result does not
# depend on hash ordering. A peer is always returned when the hash is
# non-empty, even if every value is 0; for an empty hash the result is
# (0, undef).
sub max {
    my ($field, $listref) = @_;

    my $x = 0;
    my $who;

    for my $peer (sort keys %{ $listref || {} }) {
        my $value = $listref->{$peer}{$field} || 0;

        if (!defined $who || $value > $x) {
            $x   = $value;
            $who = $peer;
        }
    }

    return ($x, $who);
}