In [4]:
import json
import pandas as pd

# Parse a JSON-lines event log and quarantine everything that cannot be used:
#   * lines that are not a JSON object           -> bad_lines
#   * rows whose amount is missing / non-numeric -> invalid_amount
#   * rows whose ts is missing / unparseable     -> invalid_ts
# Rows that pass both checks end up in clean_df.
good = []
bad_lines = []

with open("events1.txt", "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            bad_lines.append({"line_num": line_num, "raw": line})
            continue

        # A bare scalar or array (e.g. `123`, `[1, 2]`) is syntactically valid
        # JSON but is not an event record; keep it out of the DataFrame input
        # and report it alongside the unparseable lines.
        if not isinstance(record, dict):
            bad_lines.append({"line_num": line_num, "raw": line})
            continue

        good.append(record)

df = pd.DataFrame(good)

# If no parsed record carried these fields (or nothing parsed at all), create
# them empty so the cleaning below still runs and the bad-line report is
# printed instead of the cell dying with a KeyError.
for col in ("amount", "ts"):
    if col not in df.columns:
        df[col] = None

# Clean amount (INVALID -> NaN)
df["amount"] = pd.to_numeric(df["amount"], errors="coerce")

# Parse timestamp safely (unparseable -> NaT)
df["ts"] = pd.to_datetime(df["ts"], errors="coerce", utc=True)

# Split clean vs invalid
invalid_amount = df[df["amount"].isna()]
invalid_ts = df[df["ts"].isna()]
clean_df = df.dropna(subset=["amount", "ts"])

print("Clean:")
print(clean_df)

print("\nBad JSON lines:")
print(pd.DataFrame(bad_lines))

print("\nInvalid amount rows:")
print(invalid_amount)

# clean_df also drops rows with a bad timestamp; surface them so they do not
# vanish silently. Printed only when there is something to report.
if not invalid_ts.empty:
    print("\nInvalid timestamp rows:")
    print(invalid_ts)


Clean:
   event_id user  amount                        ts
0         1    A   120.0 2024-01-01 10:00:00+00:00
2         3    C   300.0 2024-01-01 10:10:00+00:00

Bad JSON lines:
   line_num            raw
0         3  NOT_JSON_LINE

Invalid amount rows:
   event_id user  amount                        ts
1         2    B     NaN 2024-01-01 10:05:00+00:00


In [5]:
import json
import pandas as pd

# Load newline-delimited JSON records from the file, one object per line.
rows = []
with open("nested_events.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # json.loads raises JSONDecodeError on an empty string, so a blank
        # line (e.g. an extra empty line at the end of the file) must be
        # skipped rather than parsed.
        if not line:
            continue
        rows.append(json.loads(line))

df = pd.json_normalize(rows)  # flattens nested dicts into columns
print(df)


   event_id  amount  user.id user.name device.os device.ver
0         1     200       10       Ali       ios       17.1
1         2     150       11      Sara   android         14


In [11]:
import json
import pandas as pd

# Load newline-delimited JSON records whose nested fields vary from line to
# line, then force the result into a fixed column layout.
rows = []
with open("nested_drift.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # json.loads raises JSONDecodeError on an empty string, so blank
        # lines are skipped rather than parsed.
        if not line:
            continue
        rows.append(json.loads(line))

df = pd.json_normalize(rows)

# Enforce expected columns: reindex keeps exactly these columns in this order
# and creates any that are absent from the data, filled with NaN.
expected = ["event_id", "user.id", "user.name",  "device.os", "device.ver", "amount"]
df = df.reindex(columns=expected)

print(df)


   event_id  user.id user.name device.os device.ver  amount
0         1       10       Ali       ios        NaN     200
1         2       11       NaN       NaN        NaN     150
2         3       12      Noor   android         14     180


Scenario 4: The .txt contains one big JSON array (not one JSON object per line)

In [12]:
# Solution: json.load() then normalize

import json
import pandas as pd

# The file holds a single JSON document, so it is parsed in one call on the
# open handle instead of line by line.
with open("array.txt", encoding="utf-8") as array_file:
    data = json.load(array_file)

# Flatten nested objects into dotted column names before printing.
df = pd.json_normalize(data)
print(df)


   id name meta.city
0   1    A    Lahore
1   2    B   Karachi


Scenario 5: The .txt contains JSON lines, but the file also has garbage header/footer lines (super common)

In [14]:
# Solution: Extract only JSON lines
import json
import pandas as pd

# rows collects the parsed records; bad collects lines that start like a JSON
# object but fail to parse, together with their 1-based line number.
rows, bad = [], []

with open("report_dump.txt", encoding="utf-8") as dump:
    for line_no, raw_line in enumerate(dump, start=1):
        candidate = raw_line.strip()
        # Only lines beginning with an opening brace are treated as JSON;
        # everything else (header/footer text, blank lines) is ignored.
        if candidate.startswith("{"):
            try:
                record = json.loads(candidate)
            except json.JSONDecodeError:
                bad.append({"line_num": line_no, "raw": candidate})
            else:
                rows.append(record)

df = pd.DataFrame(rows)
print(df)
print("Bad JSON lines:", bad)


   id  value
0   1     10
1   2     20
Bad JSON lines: []
