# 📊 Calgary HTTP Log Analysis
This notebook contains the solution to the take-home assessment for analyzing the Calgary HTTP web server logs using Python.

In [None]:
# 📥 Step 1: Load and Read the .gz File
import gzip
# Read the whole compressed log into memory as text, one list entry per line
# (line endings are kept). Bytes that are not valid UTF-8 are dropped silently.
log_lines = list()
with gzip.open("calgary_access_log.gz", mode='rt', encoding='utf-8', errors='ignore') as log_file:
    log_lines = list(log_file)
print("Total lines:", len(log_lines))

In [None]:
# 📦 Step 2: Parse Logs
import re
from datetime import datetime
import pandas as pd

# One access-log line: host, two ignored fields, [timestamp], the quoted request,
# the status code and the byte count. Only GET/POST requests that carry a third
# token after the path match; every other line is skipped by the loop below.
log_pattern = re.compile(
    r'(?P<host>\S+) \S+ \S+ \[(?P<timestamp>.*?)\] "(?:GET|POST) (?P<filename>\S+) \S+" (?P<status>\d{3}) (?P<bytes>\S+)'
)

parsed_logs = []
for line in log_lines:
    match = log_pattern.match(line)
    if match is None:
        continue
    data = match.groupdict()
    try:
        logged_at = datetime.strptime(data['timestamp'], "%d/%b/%Y:%H:%M:%S %z")
    except ValueError:
        # strptime raises ValueError when the text does not fit the format;
        # such records are skipped. Anything else is allowed to surface.
        continue
    # Keep the wall-clock time as logged but drop the UTC offset. Records logged
    # under different offsets (e.g. either side of a daylight-saving change)
    # would otherwise leave the column as object dtype, and the `.dt` accessor
    # used by the later cells only works on a datetime column. The original
    # text, offset included, is still available in the 'timestamp' column.
    data['datetime'] = logged_at.replace(tzinfo=None)
    # A non-numeric size field (such as "-") is counted as 0 bytes.
    data['bytes'] = int(data['bytes']) if data['bytes'].isdigit() else 0
    data['status'] = int(data['status'])
    parsed_logs.append(data)

# One row per parsed record; columns: host, timestamp, filename, status, bytes, datetime.
df = pd.DataFrame(parsed_logs)
df.head()

## ✅ Analysis Questions

In [None]:
# Q1: Total number of log records (one DataFrame row per parsed record)
df.shape[0]

In [None]:
# Q2: Number of distinct hosts (missing values are not counted)
len(df['host'].dropna().unique())

In [None]:
# Q3: Date-wise unique filename counts
# The formatted 'date' column is kept because the Q7 cell groups by it.
df['date'] = df['datetime'].dt.strftime('%d-%b-%Y')
# Group on the calendar day itself rather than on the 'dd-Mon-YYYY' text:
# sorting the text orders keys by day-of-month first (for example
# '01-Aug-1995' would come before '02-Jan-1995'). Keys are formatted afterwards
# so the result is the same mapping, listed chronologically.
unique_files_per_day = df.groupby(df['datetime'].dt.normalize())['filename'].nunique()
{day.strftime('%d-%b-%Y'): int(count) for day, count in unique_files_per_day.items()}

In [None]:
# Q4: How many records have HTTP status 404
df['status'].eq(404).sum()

In [None]:
# Q5: Top 15 filenames with 404 responses
df_404 = df[df['status'] == 404]
# Series.items() is a lazy iterator, so leaving it as the last expression only
# displays an iterator object. list() materializes the (filename, count) pairs,
# most frequent first.
list(df_404['filename'].value_counts().head(15).items())

In [None]:
# Q6: Top 15 extensions with 404 responses
def _file_extension(path):
    """Return the text after the last '.' of the final path segment, or 'none'.

    Only the part after the last '/' is inspected, so a dot inside a directory
    name (as in '/dir.v1/readme') is not mistaken for an extension separator.
    """
    basename = path.rsplit('/', 1)[-1]
    return basename.rsplit('.', 1)[-1] if '.' in basename else 'none'


# assign() builds a new frame instead of writing a column into df_404, which is
# a filtered selection of df (writing into it triggers SettingWithCopyWarning).
df_404 = df_404.assign(extension=df_404['filename'].map(_file_extension))
# list() materializes the lazy (extension, count) pairs so the cell displays them.
list(df_404['extension'].value_counts().head(15).items())

In [None]:
# Q7: Bytes transferred on each day of July 1995
# Uses the 'date' column that the Q3 cell adds to df.
request_time = df['datetime']
in_july_1995 = (request_time.dt.year == 1995) & (request_time.dt.month == 7)
july_df = df.loc[in_july_1995]
july_df.groupby('date')['bytes'].sum().to_dict()

In [None]:
# Q8: Number of requests per hour of day, keyed by hour in ascending order
df['hour'] = df['datetime'].dt.hour
df['hour'].value_counts(sort=False).sort_index().to_dict()

In [None]:
# Q9: Top 10 most requested filenames
# Series.items() is a lazy iterator; list() materializes the (filename, count)
# pairs so the cell displays them, most requested first.
list(df['filename'].value_counts().head(10).items())

In [None]:
# Q10: How often each HTTP status code occurs, most frequent first
status_counts = df['status'].value_counts()
status_counts.to_dict()