## Analyze Nginx logs

### Load data

In [34]:
import json

# Report file to analyze, resolved relative to the notebook's working directory
# (presumably a GoAccess JSON export, judging by the file name).
filename = "./goaccess_data.json"
# Pin the encoding: without it open() falls back to the platform locale, which
# can mis-decode non-ASCII request paths or user agents on some systems.
with open(filename, "r", encoding="utf-8") as f:
    data = json.load(f)

### Helper functions

In [35]:
from pandas import DataFrame


def extract_metric(item: dict, metric_name: str) -> float | None:
    return item.get(metric_name) if isinstance(item, dict) else None


def process_nested_columns(df: DataFrame, columns: list[str]) -> DataFrame:
    """Flatten nested metric columns into ``<col>_count`` / ``<col>_percent``.

    For every name in ``columns`` that exists in ``df``, the ``"count"`` and
    ``"percent"`` entries of each cell are extracted into two new columns and
    the original nested column is dropped. Names not present in ``df`` are
    skipped silently.

    Args:
        df: Frame whose listed columns hold dict cells; it is left unmodified.
        columns: Names of the nested columns to flatten.

    Returns:
        A new DataFrame with the flattened columns.
    """
    # Work on a copy: assigning the new columns directly onto ``df`` would
    # also add them to the caller's frame as a hidden side effect.
    result = df.copy()
    for col in columns:
        if col not in result.columns:
            continue
        nested = result[col]
        result[f"{col}_count"] = nested.apply(lambda x: extract_metric(x, "count"))
        result[f"{col}_percent"] = nested.apply(
            lambda x: extract_metric(x, "percent")
        )
        result = result.drop(columns=[col])

    return result

In [36]:
import plotly.express as px
import plotly.graph_objects as go

from plotly.graph_objects import Figure
from plotly.subplots import make_subplots


# Hex colour palette referenced by the chart cells and by style_figure.
colors = dict(
    primary="#2E86AB",
    secondary="#A23B72",
    accent="#F18F01",
    background="#ffffff",
    text="#2c3e50",
)


def style_figure(
    fig: Figure,
    title: str,
    legend: dict | None = None,
) -> Figure:
    """Apply the notebook's shared layout to ``fig`` and return it.

    Args:
        fig: Figure to restyle; it is updated in place.
        title: Text for the figure title.
        legend: Legend layout options. When omitted, a horizontal legend
            anchored above the top-right corner of the plot is used.

    Returns:
        The same ``fig`` object, to allow chaining.
    """
    # Build the default per call rather than in the signature, so no mutable
    # dict is shared between invocations.
    if legend is None:
        legend = dict(
            orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        )

    fig.update_layout(
        title=dict(
            text=title, font=dict(size=20, family="Arial", color=colors["text"])
        ),
        template="plotly_white",
        font=dict(family="Arial", color=colors["text"]),
        showlegend=True,
        hovermode="x unified",
        margin=dict(l=40, r=40, t=80, b=40),
        legend=legend,
    )
    return fig

### General stats

In [37]:
# Headline figures from the report's "general" section; absent keys print as None.
general = data.get("general", {})
# Scale the raw bandwidth value by 1024 * 1024 (presumably bytes -> MB).
bandwidth_mb = general.get("bandwidth", 0) / (1024 * 1024)

print(f"Start date: {general.get('start_date')}")
print(f"End date: {general.get('end_date')}")
print(f"Total requests: {general.get('total_requests')}")
print(f"Unique visitors: {general.get('unique_visitors')}")
print(f"Bandwidth: {bandwidth_mb:.2f} MB")

Start date: 27/Nov/2025
End date: 11/Dec/2025
Total requests: 17143
Unique visitors: 2908
Bandwidth: 226.24 MB


### Daily traffic (visitors & hits)

In [38]:
import pandas as pd

# Per-day records from the "visitors" panel of the report.
visitors_data = data.get("visitors", {}).get("data", [])

# Flatten the nested metrics, parse the YYYYMMDD day key into a real datetime
# column, and order the rows chronologically.
df_visitors = (
    process_nested_columns(
        pd.DataFrame(visitors_data), ["hits", "visitors", "bytes"]
    )
    .assign(date=lambda frame: pd.to_datetime(frame["data"], format="%Y%m%d"))
    .sort_values("date")
    .reset_index(drop=True)
)
df_visitors

Unnamed: 0,data,hits_count,hits_percent,visitors_count,visitors_percent,bytes_count,bytes_percent,date
0,20251127,517,3.02,158,5.43,17155886,7.23,2025-11-27
1,20251128,581,3.39,165,5.67,14354500,6.05,2025-11-28
2,20251129,318,1.85,139,4.78,5501433,2.32,2025-11-29
3,20251130,352,2.05,125,4.3,10040885,4.23,2025-11-30
4,20251201,539,3.14,143,4.92,16950220,7.14,2025-12-01
5,20251202,544,3.17,138,4.75,35994432,15.17,2025-12-02
6,20251203,753,4.39,159,5.47,31272115,13.18,2025-12-03
7,20251204,228,1.33,105,3.61,8686350,3.66,2025-12-04
8,20251205,452,2.64,195,6.71,10749709,4.53,2025-12-05
9,20251206,627,3.66,121,4.16,19365187,8.16,2025-12-06


In [39]:
# Hits as translucent bars on the primary y-axis.
hits_bars = go.Bar(
    x=df_visitors["date"],
    y=df_visitors["hits_count"],
    name="Total Hits",
    marker_color=colors["primary"],
    opacity=0.3,
)

# Unique visitors as a line on the secondary y-axis.
visitors_line = go.Scatter(
    x=df_visitors["date"],
    y=df_visitors["visitors_count"],
    name="Unique Visitors",
    mode="lines+markers",
    line=dict(color=colors["secondary"], width=3),
    marker=dict(size=6),
)

# A single subplot with two y-axes so both series share the date axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(hits_bars, secondary_y=False)
fig.add_trace(visitors_line, secondary_y=True)

# Only the secondary axis draws grid lines.
fig.update_yaxes(title_text="Hits", secondary_y=False, showgrid=False)
fig.update_yaxes(
    title_text="Visitors", secondary_y=True, showgrid=True, gridcolor="#eee"
)

style_figure(fig, "Daily traffic: Hits vs Visitors")
fig.show()

### Top visitor IPs

In [40]:
# Records from the "hosts" panel, with the nested "hits" metric flattened.
hosts_data = data.get("hosts", {}).get("data", [])
df_hosts = process_nested_columns(pd.DataFrame(hosts_data), ["hits"])

# Preview the first five rows, restricted to the columns of interest.
df_hosts.loc[:, ["data", "hits_count", "hits_percent"]].head(5)

Unnamed: 0,data,hits_count,hits_percent
0,45.148.10.246,5542,32.33
1,4.189.120.86,488,2.85
2,4.193.241.161,370,2.16
3,4.230.26.9,370,2.16
4,4.189.122.191,314,1.83


In [41]:
top_n = 10
# Take the first top_n rows, then order them ascending by hits for the
# horizontal bar chart.
df_plot = df_hosts.head(top_n).sort_values("hits_count", ascending=True)

bar_options = dict(
    x="hits_count",
    y="data",
    orientation="h",
    text="hits_count",
    color_discrete_sequence=[colors["primary"]],
)
fig = px.bar(df_plot, **bar_options)

# Show the hit count next to each bar; drop the redundant y-axis title.
fig.update_traces(texttemplate="%{text:.2s}", textposition="outside")
fig.update_layout(yaxis_title=None, xaxis_title="Hits")

style_figure(fig, f"Top {top_n} visitor IPs")
fig.show()

### Top 404s

In [42]:
# Records from the "not_found" panel, with the nested "hits" metric flattened.
not_found_data = data.get("not_found", {}).get("data", [])
df_404 = process_nested_columns(pd.DataFrame(not_found_data), ["hits"])

# Preview the first ten rows, restricted to the columns of interest.
df_404.loc[:, ["data", "hits_count", "hits_percent"]].head(10)

Unnamed: 0,data,hits_count,hits_percent
0,/assets/images/,23,0.13
1,/api/.env,21,0.12
2,/assets/as.php,11,0.06
3,/autoload_classmap/function.php,10,0.06
4,/aa.php,7,0.04
5,/abcd.php,7,0.04
6,/admin.php,7,0.04
7,/about.php,7,0.04
8,/ahax.php,7,0.04
9,/akcc.php,7,0.04


In [43]:
top_n = 10
# Take the first top_n rows, then order them ascending by hits for the
# horizontal bar chart.
df_plot = df_404.head(top_n).sort_values("hits_count", ascending=True)

bar_options = dict(
    x="hits_count",
    y="data",
    orientation="h",
    text="hits_count",
    color_discrete_sequence=[colors["primary"]],
)
fig = px.bar(df_plot, **bar_options)

# Show the hit count next to each bar; drop the redundant y-axis title.
fig.update_traces(texttemplate="%{text:.2s}", textposition="outside")
fig.update_layout(yaxis_title=None, xaxis_title="Hits")

style_figure(fig, f"Top {top_n} 404 error URLs")
fig.show()

### Top OS

In [44]:
# Records from the "os" panel, with the nested "hits" metric flattened.
os_data = data.get("os", {}).get("data", [])
df_os = process_nested_columns(pd.DataFrame(os_data), ["hits"])

# Show only the label and hit columns.
df_os.loc[:, ["data", "hits_count", "hits_percent"]]

Unnamed: 0,data,hits_count,hits_percent
0,Unknown,6542,38.16
1,Android,2984,17.41
2,Windows,2538,14.8
3,iOS,1979,11.54
4,Crawlers,1553,9.06
5,Linux,1041,6.07
6,macOS,482,2.81
7,Chrome OS,10,0.06
8,Others,9,0.05
9,BSD,3,0.02


In [51]:
# hits_percent is stored in percentage points (e.g. 38.16 means 38.16 %), so a
# 2 % cut-off is 2.0. The previous value of 0.02 meant 0.02 % and grouped nothing.
limit = 2.0
df_os_clean = df_os.copy()
# Coerce to numeric so the comparison below is well-defined; bad values become NaN.
df_os_clean["hits_percent"] = pd.to_numeric(
    df_os_clean["hits_percent"], errors="coerce"
)
# Fold every OS below the cut-off into a single "Other" slice.
df_os_clean.loc[df_os_clean["hits_percent"] < limit, "data"] = "Other"
df_os_grouped = df_os_clean.groupby("data", as_index=False).sum(numeric_only=True)

fig = px.pie(
    df_os_grouped,
    values="hits_count",
    names="data",
    hole=0.5,
    color_discrete_sequence=px.colors.qualitative.Prism,
)

fig.update_traces(textposition="inside", textinfo="percent+label")
style_figure(
    fig,
    "Operating system share",
    # Vertical legend anchored to the top-right instead of the default layout.
    legend=dict(orientation="v", yanchor="top", y=0.99, xanchor="right", x=0.99),
)
fig.show()

### Top browsers

In [46]:
# Records from the "browsers" panel, with the nested "hits" metric flattened.
browser_data = data.get("browsers", {}).get("data", [])
df_browser = process_nested_columns(pd.DataFrame(browser_data), ["hits"])

# Show only the label and hit columns.
df_browser.loc[:, ["data", "hits_count", "hits_percent"]]

Unnamed: 0,data,hits_count,hits_percent
0,Others,6103,35.6
1,Chrome,5433,31.69
2,Safari,1796,10.48
3,Crawlers,1547,9.02
4,Firefox,1257,7.33
5,Unknown,482,2.81
6,Edge,192,1.12
7,Opera,109,0.64
8,MSIE,53,0.31
9,Yandex.Brows,2,0.01


In [52]:
# hits_percent is stored in percentage points (e.g. 35.6 means 35.6 %), so a
# 2 % cut-off is 2.0. The previous value of 0.02 meant 0.02 %.
limit = 2.0  # 2%
df_browser_clean = df_browser.copy()
# Coerce to numeric so the comparison below is well-defined; bad values become NaN.
df_browser_clean["hits_percent"] = pd.to_numeric(
    df_browser_clean["hits_percent"], errors="coerce"
)
# Fold every browser below the cut-off into a single "Other" slice. This is a
# separate label from the report's own "Others" row, which stays as-is.
df_browser_clean.loc[df_browser_clean["hits_percent"] < limit, "data"] = "Other"
df_browser_grouped = df_browser_clean.groupby("data", as_index=False).sum(
    numeric_only=True
)

fig = px.pie(
    df_browser_grouped,
    values="hits_count",
    names="data",
    hole=0.5,
    color_discrete_sequence=px.colors.qualitative.Prism,
)

fig.update_traces(textposition="inside", textinfo="percent+label")
style_figure(
    fig,
    "Browser share",
    # Vertical legend anchored to the top-right instead of the default layout.
    legend=dict(orientation="v", yanchor="top", y=0.99, xanchor="right", x=0.99),
)
fig.show()

Drill down into the "Others" browser category, which often hides bots.

In [47]:
browser_items = data.get("browsers", {}).get("data", [])
# Find the "Others" entry; fall back to an entry with no sub-items when the
# report has none. .get() tolerates records that lack a "data" key.
others_browser = next(
    (item for item in browser_items if item.get("data") == "Others"), None
) or {"items": []}
others_items = others_browser.get("items") or []

if others_items:
    df_others = process_nested_columns(pd.DataFrame(others_items), ["hits"])
else:
    # An empty frame would have no columns at all, so the selection below
    # (and the follow-up chart) would raise KeyError. Create them explicitly.
    df_others = pd.DataFrame(columns=["data", "hits_count", "hits_percent"])

df_others[["data", "hits_count", "hits_percent"]]

Unnamed: 0,data,hits_count,hits_percent
0,Go-http-client/2.0,5548,32.36
1,Mozilla/5.0,313,1.83
2,Go-http-client/1.1,137,0.8
3,okhttp/5.3.0,50,0.29
4,python-requests/2.32.5,15,0.09
5,UCBrowser/11.0.5.850,10,0.06
6,curl/7.61.1,6,0.03
7,curl/7.29.0,6,0.03
8,python-requests/2.32.3,3,0.02
9,python-requests/2.22.0,3,0.02


In [55]:
# Ten entries with the most hits, highest first.
others_by_hits = df_others.sort_values("hits_count", ascending=False)
df_others_plot = others_by_hits.head(10)

# Bars are coloured by their own hit count on a continuous scale.
bar_options = dict(
    x="data",
    y="hits_count",
    color="hits_count",
    color_continuous_scale="Bluyl",
)
fig = px.bar(df_others_plot, **bar_options)

# Tilt the x-axis tick labels by -45 degrees.
fig.update_layout(xaxis_tickangle=-45)
style_figure(fig, "Breakdown of 'other' browsers (Potential bots)")
fig.show()