In [68]:
import pandas as pd
import re
from collections import defaultdict, Counter

In [30]:
access_log = 'access.log'

In [31]:
with open(access_log, 'r') as log:
    data = log.read()

In [32]:
log_pattern = re.compile(r'(\d+\.\d+\.\d+\.\d+) - - \[(.*?)\] "(\w+) .*" (\d+) .* "(.*?)"$')

In [33]:
log_entries = []

In [34]:
for line in data.strip().split('\n'):
    match = log_pattern.match(line)
    if match:
        ip_address, time, http_method, http_status, user_agent = match.groups()
        log_entries.append({
            'ip address': ip_address,
            'time': time,
            'http method': http_method,
            'http status': http_status,
            'user agent': user_agent
        })

In [35]:
df = pd.DataFrame(log_entries)

In [36]:
df.head()

Unnamed: 0,ip address,time,http method,http status,user agent
0,1.202.218.8,20/Jun/2012:19:05:12 +0200,GET,404,"\""Mozilla/5.0"
1,208.115.113.91,20/Jun/2012:19:20:16 +0200,GET,200,Mozilla/5.0 (compatible; Ezooms/1.0; ezooms.bo...
2,123.125.71.20,20/Jun/2012:19:30:40 +0200,GET,200,Mozilla/5.0 (compatible; Baiduspider/2.0; +htt...
3,220.181.108.101,20/Jun/2012:19:31:01 +0200,GET,200,Mozilla/5.0 (compatible; Baiduspider/2.0; +htt...
4,123.125.68.79,20/Jun/2012:19:53:24 +0200,GET,200,Mozilla/5.0 (compatible; Baiduspider/2.0; +htt...


In [37]:
df['time'] = pd.to_datetime(df['time'], format='%d/%b/%Y:%H:%M:%S %z')
df['date'] = df['time'].dt.date
daily_unique_users = df.groupby('date')['ip address'].nunique().reset_index()
daily_unique_users.columns = ['date', 'unique users']
daily_unique_users

Unnamed: 0,date,unique users
0,2012-06-20,21
1,2012-06-21,69
2,2012-06-22,68
3,2012-06-23,83
4,2012-06-24,78
5,2012-06-25,73
6,2012-06-26,90
7,2012-06-27,73
8,2012-06-28,88
9,2012-06-29,93


In [38]:
user_agent_ranking = df['user agent'].value_counts().reset_index()
user_agent_ranking.columns = ['user agent', 'request count']
user_agent_ranking

Unnamed: 0,user agent,request count
0,Mozilla/5.0 (compatible; Baiduspider/2.0; +htt...,556
1,Mozilla/5.0 (compatible; YandexBot/3.0; +http:...,276
2,Mozilla/5.0 (compatible; MJ12bot/v1.4.3; http:...,234
3,"\""Mozilla/5.0",130
4,Mozilla/5.0 (compatible; Googlebot/2.1; +http:...,128
...,...,...
96,Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US...,1
97,findlinks/2.1.5 (+http://wortschatz.uni-leipzi...,1
98,SAMSUNG-SGH-E250/1.0 Profile/MIDP-2.0 Configur...,1
99,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,1


In [39]:
os = {'Windows': 0, 'Linux': 0, 'MacOS': 0, 'Android': 0, 'iOS': 0, 'Unknown': 0}
for ua in df['user agent']:
    ua_lower = ua.lower()
    if 'windows' in ua_lower:
        os['Windows'] += 1
    elif 'android' in ua_lower:
        os['Android'] += 1
    elif 'linux' in ua_lower:
        os['Linux'] += 1
    elif 'iphone' in ua_lower or 'ipad' in ua_lower:
        os['iOS'] += 1
    elif 'mac os' in ua_lower or 'macintosh' in ua_lower:
        os['MacOS'] += 1
    else:
        os['Unknown'] += 1

In [40]:
os

{'Windows': 241,
 'Linux': 11,
 'MacOS': 3,
 'Android': 8,
 'iOS': 15,
 'Unknown': 1830}

In [67]:
bots = defaultdict(int)
for ua in df['user agent']:
    ua_ = ''.join(set(ua.split())).lower()
    if 'crawl' in ua_ or 'wotbox' in ua_:
        bots[ua.split()[0]] += 1
    elif ('bot' in ua_ or 'spider' in ua_) and len(ua.split()) >= 3:
        bots[ua.split()[2]] += 1
    if 'wotbox' in ua_:
        print(ua)

Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)
Wotbox/2.0 (bot@wotbox.com; htt

In [65]:
bots

defaultdict(int,
            {'Ezooms/1.0;': 69,
             'Baiduspider/2.0;': 556,
             'YandexBot/3.0;': 276,
             'MJ12bot/v1.4.3;': 234,
             'AhrefsBot/3.0;': 2,
             'Googlebot/2.1;': 128,
             'Aboundex/0.2': 11,
             'Blekkobot;': 15,
             'Wotbox/2.0': 21,
             'Mozilla': 14,
             'spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)': 8,
             'bingbot/2.0;': 30,
             'Exabot/3.0;': 11,
             '7.0;': 2,
             'U;': 9,
             'MJ12bot/v1.4.2;': 68,
             'OpenindexSpider/Nutch-1.5-dev;': 2,
             'YandexImages/3.0;': 3,
             'aiHitBot/1.1;': 26,
             'Configuration/CLDC-1.1': 1,
             'discobot/2.0;': 5,
             'AhrefsBot/3.1;': 39,
             'Spider/Nutch-1.4': 4,
             'AcoonBot/4.11.1;': 2,
             'BacklinkCrawler': 16})

In [73]:
ip_requests = Counter(df['ip address'])
ip_requests.most_common()

[('95.108.151.244', 132),
 ('1.202.218.8', 130),
 ('178.154.210.252', 72),
 ('95.108.150.235', 72),
 ('176.31.247.216', 62),
 ('208.115.113.91', 60),
 ('89.123.3.221', 58),
 ('66.249.72.65', 56),
 ('77.222.128.221', 26),
 ('81.144.138.34', 21),
 ('72.14.199.244', 21),
 ('66.249.72.235', 21),
 ('184.154.48.82', 20),
 ('74.111.11.192', 20),
 ('83.149.126.98', 18),
 ('207.210.234.226', 16),
 ('66.249.72.4', 16),
 ('46.165.197.151', 16),
 ('108.59.8.80', 16),
 ('46.4.100.231', 16),
 ('31.11.220.254', 16),
 ('77.75.77.11', 15),
 ('66.249.72.250', 13),
 ('173.236.21.106', 12),
 ('199.58.86.211', 12),
 ('199.58.86.209', 12),
 ('108.59.8.70', 12),
 ('173.192.34.95', 11),
 ('220.181.124.140', 11),
 ('193.47.80.48', 11),
 ('89.139.13.160', 11),
 ('123.125.71.40', 10),
 ('199.87.252.66', 10),
 ('123.125.71.53', 10),
 ('173.242.125.191', 10),
 ('62.212.73.211', 10),
 ('208.115.111.75', 9),
 ('220.181.108.101', 8),
 ('109.67.210.45', 8),
 ('123.125.71.59', 8),
 ('123.125.71.34', 8),
 ('220.181.108.