In [1]:
import os
import sys
import re
from time import process_time
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from apachelogs import LogParser

In [2]:
# Directory containing the access-log files to analyse.
data_dir = Path('/Users/kaipak/datasets/pds/pds_logs/report_service/logs/final/img/img-pdsimage-http/')
# Column order matches the list returned by parse_line().
column_names = ['ip', 'identd', 'userid', 'date', 'time', 'timezone', 'request', 'status', 'size', 'referer', 'user_agent']
# sorted() returns a new list; list.sort() sorts in place and returns None,
# which would leave log_files bound to None.
log_files = sorted(data_dir / f for f in os.listdir(data_dir))
# Empty frame that the parsing cells append their rows to.
df_logs = pd.DataFrame(columns=column_names)
# Parser for lines in the Apache "combined" access-log layout.
parser = LogParser("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"")

In [3]:
# Empty frame with the log schema.
df_http_logs = pd.DataFrame(columns=column_names)
# Every entry in data_dir, in sorted path order, capped at the first 31
# files -- roughly one month of logs, enough to work with for now.
log_files = sorted(data_dir / name for name in os.listdir(data_dir))[:31]

In [4]:
def parse_line(line):
    """Convert one raw log line into a flat list of its fields.

    The line is parsed with the module-level ``parser``. The returned list
    holds eleven items in this order: remote host, remote logname, remote
    user, request date, request time, request tzinfo, request line, final
    status, bytes sent, Referer header, User-Agent header.
    """
    entry = parser.parse(line)
    stamp = entry.request_time
    headers = entry.headers_in
    return [
        entry.remote_host,
        entry.remote_logname,
        entry.remote_user,
        stamp.date(),
        stamp.time(),
        stamp.tzinfo,
        entry.request_line,
        entry.final_status,
        entry.bytes_sent,
        headers["Referer"],
        headers["User-Agent"],
    ]

In [None]:
%%time
# Iterate through logs to create dataframe
for file in log_files:
    log_file = open(file)
    log_entries = []
    lines = log_file.readlines()
    
    count = 0
    tick = process_time()
    for line in lines:
        parsed = parser.parse(line)
        datetime = parsed.request_time
        parsed_line = parse_line(line)
        log_entries.append(parsed_line)
    df_logs = pd.concat([df_logs, pd.DataFrame(log_entries, columns=column_names)])
    print(f"completed processing file {file}")

In [5]:
%%time

for file in log_files:
    log_file = open(log_files[0])
    lines = log_file.readlines()
    log_entries = list(map(parse_line, lines))
    df_logs = pd.concat([df_logs, pd.DataFrame(log_entries, columns=column_names)])
    print(f"Completed processing file {file}")

Completed processing file /Users/kaipak/datasets/pds/pds_logs/report_service/logs/final/img/img-pdsimage-http/ssl_access_2020-12-01.log
Completed processing file /Users/kaipak/datasets/pds/pds_logs/report_service/logs/final/img/img-pdsimage-http/ssl_access_2020-12-02.log
Completed processing file /Users/kaipak/datasets/pds/pds_logs/report_service/logs/final/img/img-pdsimage-http/ssl_access_2020-12-03.log
Completed processing file /Users/kaipak/datasets/pds/pds_logs/report_service/logs/final/img/img-pdsimage-http/ssl_access_2020-12-04.log
Completed processing file /Users/kaipak/datasets/pds/pds_logs/report_service/logs/final/img/img-pdsimage-http/ssl_access_2020-12-05.log
Completed processing file /Users/kaipak/datasets/pds/pds_logs/report_service/logs/final/img/img-pdsimage-http/ssl_access_2020-12-06.log
Completed processing file /Users/kaipak/datasets/pds/pds_logs/report_service/logs/final/img/img-pdsimage-http/ssl_access_2020-12-07.log
Completed processing file /Users/kaipak/datasets

In [6]:
# Sanity check: (row count, column count) of the accumulated log frame.
df_logs.shape

(2442025, 11)

In [8]:
# Per-column summary (count / unique / top / freq) of the parsed log fields.
df_logs.describe()

Unnamed: 0,ip,identd,userid,date,time,timezone,request,status,size,referer,user_agent
count,2442025,0.0,0.0,2442025,2442025,2442025,2442025,2442025,1745393.0,136493,2427672
unique,3077,0.0,0.0,2,49648,1,70811,8,13933.0,749,1573
top,64.62.202.73,,,2020-12-01,19:50:16,UTC-07:00,GET /favicon.ico HTTP/1.1,200,5200.0,https://www.google.com/,Mozilla/5.0 (compatible; bingbot/2.0; +http://...
freq,520087,,,2024765,372,2442025,14260,1668513,100409.0,79918,660517
