In [2]:

import re
from datetime import datetime
import pandas as pd
import seaborn as sns

log_pattern = re.compile(
    r'^(?P<host>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] "(?P<request>[^"]*)" (?P<status>\d{3}) (?P<bytes>\S+)$'
)

parsed_logs = []

with open("calgary_access_log", "r", encoding="latin1") as f:
    for line in f:
        match = log_pattern.match(line)
        if not match:
            continue  # skip malformed lines

        data = match.groupdict()

        # Handle request line
        try:
            method, filename, protocol = data["request"].split()
        except ValueError:
            filename = ""  # malformed request
        data["filename"] = filename

        # Convert fields
        try:
            data["status"] = int(data["status"])
        except:
            data["status"] = None

        try:
            data["bytes"] = int(data["bytes"]) if data["bytes"].isdigit() else 0
        except:
            data["bytes"] = 0

        try:
            data["datetime"] = datetime.strptime(data["timestamp"], "%d/%b/%Y:%H:%M:%S %z")
        except:
            continue  # skip if timestamp parsing fails

        parsed_logs.append(data)

# Load to DataFrame
df = pd.DataFrame(parsed_logs)
df["datetime"] = pd.to_datetime(df["datetime"],utc=True)

# Extract file extension
df["extension"] = df["filename"].str.extract(r'\.([a-zA-Z0-9]+)$')

# Extract date string
df["date_str"] = df["datetime"].dt.strftime('%d-%b-%Y')

# Extract hour used for 8th question
df["hour"] = df["datetime"].dt.hour

# Data Cleaning

# checking datatypes
print(df.info())

# handling duplicates
df.drop_duplicates()

# handling null values

df.dropna()

df.describe()




FileNotFoundError: [Errno 2] No such file or directory: 'calgary_access_log'

In [11]:
def total_log_records() -> int:
    
    totalrecords = len(df)
    return totalrecords  


answer1 = total_log_records()
print("Answer 1:")
print(answer1)

Answer 1:
722270


In [13]:
def unique_host_count() -> int:
    
    count = df["host"].nunique()
    return count  


answer2 = unique_host_count()
print("Answer 2:")
print(answer2)

Answer 2:
2


In [12]:
def datewise_unique_filename_counts() -> dict[str, int]:
    
    result = df.groupby("date_str")["filename"].nunique().to_dict()
    return result  


answer3 = datewise_unique_filename_counts()
print("Answer 3:")
print(answer3)

Answer 3:
{'01-Apr-1995': 407, '01-Aug-1995': 664, '01-Dec-1994': 244, '01-Feb-1995': 571, '01-Jan-1995': 82, '01-Jul-1995': 406, '01-Jun-1995': 569, '01-Mar-1995': 520, '01-May-1995': 433, '01-Nov-1994': 437, '01-Oct-1995': 584, '01-Sep-1995': 446, '02-Apr-1995': 395, '02-Aug-1995': 786, '02-Dec-1994': 357, '02-Feb-1995': 620, '02-Jan-1995': 128, '02-Jul-1995': 363, '02-Jun-1995': 547, '02-Mar-1995': 677, '02-May-1995': 695, '02-Nov-1994': 423, '02-Oct-1995': 840, '02-Sep-1995': 307, '03-Apr-1995': 814, '03-Aug-1995': 660, '03-Dec-1994': 152, '03-Feb-1995': 567, '03-Jan-1995': 258, '03-Jul-1995': 461, '03-Jun-1995': 398, '03-Mar-1995': 503, '03-May-1995': 566, '03-Nov-1994': 451, '03-Oct-1995': 834, '03-Sep-1995': 208, '04-Apr-1995': 863, '04-Aug-1995': 740, '04-Dec-1994': 212, '04-Feb-1995': 451, '04-Jan-1995': 314, '04-Jul-1995': 492, '04-Jun-1995': 324, '04-Mar-1995': 365, '04-May-1995': 701, '04-Nov-1994': 417, '04-Oct-1995': 911, '04-Sep-1995': 310, '05-Apr-1995': 830, '05-Aug-19

In [46]:
def count_404_errors() -> int:
    
    
    df404 = df[df["status"]==404]
    count = len(df404)
    return count  


answer4 = count_404_errors()
print("Answer 4:")
print(answer4)

Answer 4:
23517


In [27]:
def top_15_filenames_with_404() -> list[tuple[str, int]]:
    
   
    df404 = df[df["status"] == 404]
    result = df404.groupby("filename")["status"].count().to_dict()
    result1 = sorted(result.items(), key=lambda x: x[1], reverse=True)[:15]
    return result1 


answer5 = top_15_filenames_with_404()
print("Answer 5:")
print(answer5)

Answer 5:
[('index.html', 4694), ('4115.html', 902), ('1611.html', 649), ('5698.xbm', 585), ('710.txt', 408), ('2002.html', 258), ('2177.gif', 193), ('10695.ps', 161), ('6555.html', 153), ('487.gif', 152), ('151.html', 149), ('3414.gif', 148), ('40.html', 148), ('488.gif', 148), ('9678.gif', 142)]


In [26]:
def top_15_ext_with_404() -> list[tuple[str, int]]:
    

    df404 = df[df["status"] == 404]
    result = df404.groupby("extension")["status"].count().to_dict()
    result1 = sorted(result.items(), key=lambda x: x[1], reverse=True)[:15]
    return result1  


answer6 = top_15_ext_with_404()
print("Answer 6:")
print(answer6)

Answer 6:
[('html', 12142), ('gif', 7202), ('xbm', 824), ('ps', 754), ('jpg', 520), ('txt', 496), ('GIF', 135), ('htm', 107), ('cgi', 77), ('com', 45), ('Z', 41), ('dvi', 40), ('ca', 36), ('hmtl', 30), ('util', 29)]


In [50]:
def total_bandwidth_per_day() -> dict[str, int]:
    
    july_df = df[df['datetime'].dt.strftime('%b-%Y') == 'Jul-1995']
    df_bytes = july_df[july_df['bytes'] > 0]
    result = df_bytes.groupby("date_str")["bytes"].sum().to_dict()
    return result  


answer7 = total_bandwidth_per_day()
print("Answer 7:")
print(answer7)

Answer 7:
{'01-Jul-1995': 17002716, '02-Jul-1995': 7895368, '03-Jul-1995': 11741216, '04-Jul-1995': 25070338, '05-Jul-1995': 22468066, '06-Jul-1995': 20421399, '07-Jul-1995': 9566244, '08-Jul-1995': 5475250, '09-Jul-1995': 4312672, '10-Jul-1995': 13199230, '11-Jul-1995': 22699447, '12-Jul-1995': 17861622, '13-Jul-1995': 15964302, '14-Jul-1995': 16145982, '15-Jul-1995': 17902136, '16-Jul-1995': 8099828, '17-Jul-1995': 18428308, '18-Jul-1995': 17948048, '19-Jul-1995': 16170122, '20-Jul-1995': 25504932, '21-Jul-1995': 25944849, '22-Jul-1995': 6267473, '23-Jul-1995': 10131132, '24-Jul-1995': 20568058, '25-Jul-1995': 23300705, '26-Jul-1995': 26685445, '27-Jul-1995': 22954650, '28-Jul-1995': 37455984, '29-Jul-1995': 16293639, '30-Jul-1995': 21157883, '31-Jul-1995': 29865303}


In [43]:
def hourly_request_distribution() -> dict[int, int]:
    
    result = df.groupby("hour")["request"].count().to_dict()

    return result  

answer8 = hourly_request_distribution()
print("Answer 8:")
print(answer8)

Answer 8:
{0: 39610, 1: 32685, 2: 30736, 3: 28168, 4: 26027, 5: 22853, 6: 19868, 7: 17079, 8: 13879, 9: 11438, 10: 10574, 11: 10432, 12: 12319, 13: 15191, 14: 22093, 15: 30921, 16: 38019, 17: 46323, 18: 45768, 19: 50058, 20: 51160, 21: 52919, 22: 50514, 23: 46202}


In [41]:
def top_10_most_requested_filenames() -> list[tuple[str, int]]:
    
    result = df.groupby("filename")["request"].count().to_dict()
    result1 = sorted(result.items(),key = lambda x :x[1],reverse=True)[:10]
    return result1  


answer9 = top_10_most_requested_filenames()
print("Answer 9:")
print(answer9)

Answer 9:
[('index.html', 139528), ('3.gif', 24006), ('2.gif', 23595), ('4.gif', 8018), ('244.gif', 5148), ('5.html', 5010), ('4097.gif', 4874), ('8870.jpg', 4492), ('6733.gif', 4278), ('8472.gif', 3843)]


In [38]:
def response_code_distribution() -> dict[int, int]:
    
    result = df.groupby("status").size().to_dict()
    print(result)
    return {}  


answer10 = response_code_distribution()
print("Answer 10:")
print(answer10)

{200: 568345, 302: 30295, 304: 97792, 400: 15, 401: 46, 403: 4741, 404: 23517, 500: 42, 501: 43}
Answer 10:
{}
