# Instagram HTML → CSV Converter
**Important Note:** Most of the functions in this file were created with the help of ChatGPT.

The script expects a **messages** directory in the project root. You can copy this directory directly from Instagram's exported data.
Imagine you receive a .zip file from Instagram: extract it and copy the **your_instagram_activity/messages** directory into the root.

**Columns:** Chat Session, Message Date, Type (Incoming/Outgoing), Text

## Steps
1. Install deps if needed.
2. Run the **Converter Functions** cell.
3. Use **Diagnostics** to confirm messages found for a sample file.
4. Run the conversion.


In [None]:
# If BeautifulSoup is not installed, uncomment:
# !pip install beautifulsoup4
#!pip install dateparser

In [None]:
import re
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup

# Try to use multilingual parsing
try:
    import dateparser
except Exception:
    dateparser = None

# Accept EN + DE month names, ISO, and lowercase am/pm
#
# One compiled pattern with four alternatives wrapped in a single outer
# capturing group. Notes for maintainers:
#   * re.IGNORECASE applies to the whole pattern, so the explicit
#     "am|pm|AM|PM" spellings are redundant and the "[A-Z]{2,4}" timezone
#     token of the first alternative also matches lowercase letters.
#   * In the two EN alternatives the month is one of the listed
#     abbreviations immediately followed by whitespace.
#   * The ISO alternative contains an extra capturing group for the seconds;
#     use group(0) to get the full matched timestamp.
DATE_CANDIDATE_RE = re.compile(
    r"("

    # EN: Aug 11, 2025 5:19 am  /  Aug 11, 2025, 5:19 AM  (optional comma and seconds/TZ)
    r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}(?:,|\s)+\d{1,2}:\d{2}(?::\d{2})?\s*(?:am|pm|AM|PM)?(?:\s+[A-Z]{2,4})?"

    r"|"

    # EN alt: 11 Aug 2025 17:19  (optional seconds and am/pm)
    r"\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)\s+\d{4}\s+\d{1,2}:\d{2}(?::\d{2})?(?:\s*(?:am|pm|AM|PM))?"

    r"|"

    # DE long: 14. August 2024, 22:15  (optional TZ)
    # Full month names, or dotted abbreviations ("Aug."); "Mai" has no dot.
    r"\d{1,2}\.\s*(?:Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember"
    r"|Jan\.|Feb\.|Mär\.|Apr\.|Mai|Jun\.|Jul\.|Aug\.|Sep\.|Okt\.|Nov\.|Dez\.)\s+\d{4},?\s+\d{1,2}:\d{2}(?::\d{2})?(?:\s*(?:MESZ|MEZ|CEST|CET))?"

    r"|"

    # ISO-ish: 2025-08-11 05:19[:30] or 2025-08-11T05:19[:30][+02:00|Z]
    r"\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}(:\d{2})?(?:[+-]\d{2}:\d{2}|Z)?"

    r")",
    re.IGNORECASE
)

# strptime patterns that parse_any_date tries, in this order, as a fallback
# when dateparser is unavailable or cannot parse the text.
DATE_FORMATS = [
    "%b %d, %Y, %I:%M %p %Z",   # Aug 11, 2025, 5:19 AM UTC
    "%b %d, %Y, %I:%M %p",      # Aug 11, 2025, 5:19 AM
    "%b %d, %Y %I:%M %p",       # Aug 11, 2025 5:19 AM
    "%Y-%m-%d %H:%M:%S %Z",     # 2025-08-11 05:19:00 UTC
    "%Y-%m-%d %H:%M:%S",        # 2025-08-11 05:19:00
    "%d %b %Y %H:%M",           # 11 Aug 2025 17:19
    "%Y-%m-%dT%H:%M:%S%z",      # 2025-08-11T05:19:00+0200
    "%Y-%m-%dT%H:%M:%S",        # 2025-08-11T05:19:00
]

def parse_any_date(text: str):
    """Parse a timestamp string into a ``datetime``; return ``None`` on failure.

    Non-breaking spaces are replaced by regular spaces and surrounding
    spaces/commas are trimmed before parsing. Strategies are tried in order:

    1. ``dateparser`` (if it could be imported), restricted to English and
       German.
    2. Each ``strptime`` pattern in ``DATE_FORMATS``.
    3. An ISO-like timestamp found anywhere in the text, parsed with
       ``datetime.fromisoformat`` (a trailing ``Z`` is treated as ``+00:00``).

    Args:
        text: Raw timestamp text; ``None`` or an empty string is accepted.

    Returns:
        The parsed ``datetime`` or ``None`` if no strategy succeeds.
    """
    text = (text or "").replace("\xa0", " ").strip(" ,")
    if not text:
        # Nothing to parse: avoid pointless calls into the parsers.
        return None

    # First try dateparser (handles lowercase am/pm and multiple languages)
    if dateparser is not None:
        dt = dateparser.parse(text, languages=["en", "de"])
        if dt:
            return dt

    # Fallback formats. strptime signals a mismatch with ValueError; anything
    # else is a genuine bug and should not be hidden.
    for fmt in DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue

    # Try ISO slice
    m = re.search(r"\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}(:\d{2})?([+-]\d{2}:\d{2}|Z)?", text)
    if m:
        try:
            return datetime.fromisoformat(m.group(0).replace("Z", "+00:00"))
        except ValueError:
            # Shape matched but the values are invalid (e.g. month 13).
            pass
    return None

def clean_text(node):
    """Return the visible text of *node* as one whitespace-normalised line.

    ``<img>``, ``<video>`` and ``<audio>`` descendants are removed from the
    tree first (this modifies *node* in place), then all remaining text
    fragments are joined with single spaces. ``None`` yields ``""``.
    """
    if node is None:
        return ""

    # Media elements are dropped so that none of their content ends up in
    # the extracted text.
    media_elements = node.find_all(["img", "video", "audio"])
    for media in media_elements:
        media.decompose()

    fragments = [fragment.strip() for fragment in node.stripped_strings]
    joined = " ".join(fragments)
    collapsed = re.sub(r"\s+", " ", joined)
    return collapsed.strip()

# Remove a final line that consists solely of a timestamp from a message body
def strip_trailing_date_lines(text: str):
    """Drop the last non-blank line of *text* if it is nothing but a date.

    Lines are trimmed and blank lines discarded before the check, so the
    result is always re-joined from the trimmed, non-blank lines. Falsy input
    is returned unchanged; input with only blank lines yields ``""``.
    """
    if not text:
        return text

    kept = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)

    if not kept:
        return ""

    # Only the very last line is examined, and it must be a date in full.
    if DATE_CANDIDATE_RE.fullmatch(kept[-1]) is not None:
        del kept[-1]

    return "\n".join(kept).strip()

# Generic self-references that mark a message as sent by the account owner.
YOU_WORDS = {"you", "du", "ich", "me"}  # add more if needed


def guess_sender_type(sender, owners):
    """Classify a message as ``"Outgoing"`` or ``"Incoming"`` by its sender.

    The comparison ignores case and surrounding whitespace.

    Args:
        sender: Sender name as extracted from the message; may be ``None``.
        owners: Names of the account owner. May be an iterable of names, a
            single name given as a plain string, or ``None``.

    Returns:
        ``"Outgoing"`` if the sender is one of ``YOU_WORDS`` or one of the
        owner names, otherwise ``"Incoming"``.
    """
    s = (sender or "").strip().lower()
    if s in YOU_WORDS:
        return "Outgoing"

    if owners is None:
        owners = ()
    elif isinstance(owners, str):
        # A bare string would otherwise be iterated character by character
        # and never match a real name.
        owners = (owners,)

    owner_keys = {str(name).strip().lower() for name in owners if name}
    # A blank owner entry must not turn every message with an unknown
    # (empty) sender into an outgoing one.
    owner_keys.discard("")

    return "Outgoing" if s in owner_keys else "Incoming"

def _find_header_el(msg):
    """Return the first descendant of *msg* that looks like a message header.

    The CSS selectors are tried in a fixed priority order; ``None`` is
    returned when none of them matches.
    """
    header_selectors = ("header", "._a6-h", "._a6-i", ".message_header")
    # Lazy generator: stops querying as soon as one selector hits.
    hits = (msg.select_one(css) for css in header_selectors)
    return next((hit for hit in hits if hit), None)

def _find_body_el(msg):
    """Return the element holding the message text inside *msg*.

    The CSS selectors are tried in a fixed priority order. If none matches,
    *msg* itself is returned so the caller can use the whole block.
    """
    body_selectors = (".text", ".message_body", "div._a6-p", "blockquote", "p")
    # Lazy generator: stops querying as soon as one selector hits.
    hits = (msg.select_one(css) for css in body_selectors)
    return next((hit for hit in hits if hit), msg)

def _extract_sender_and_date(msg):
    """Extract sender name and timestamp from one message block.

    Returns:
        A tuple ``(sender, dt, ts_text)``: the cleaned sender name (``""`` if
        none was found), the parsed ``datetime`` (or ``None``), and the raw
        timestamp text that matched ``DATE_CANDIDATE_RE`` (or ``None``).
    """
    # Timestamp: scan the visible lines bottom-up and take the first date hit.
    visible_lines = [
        ln for ln in msg.get_text("\n", strip=True).splitlines() if ln.strip()
    ]
    date_hits = (DATE_CANDIDATE_RE.search(ln) for ln in reversed(visible_lines))
    ts_text = next((hit.group(0) for hit in date_hits if hit), None)

    # Sender: header text up to the timestamp if the header contains it,
    # otherwise up to the first comma ("Sender, Aug 11, 2025 5:19 am").
    sender = None
    header = _find_header_el(msg)
    header_text = clean_text(header) if header else ""
    if header_text:
        cut = header_text.find(ts_text) if ts_text else -1
        if cut != -1:
            sender = header_text[:cut]
        else:
            sender = header_text.partition(",")[0]

    # No usable header: fall back to the first bold element in the block.
    if not sender:
        bold = msg.find(["strong", "b"])
        bold_text = bold.get_text(strip=True) if bold else ""
        if bold_text:
            sender = bold_text

    # Trim separator characters left over around the name.
    sender = (sender or "").strip(" ,–-•").strip()

    dt = parse_any_date(ts_text) if ts_text else None
    return sender, dt, ts_text

def extract_messages_from_html(filepath, owner_names):
    """Parse one exported chat HTML file into a list of CSV-ready row dicts.

    Args:
        filepath: Path of the HTML file to read (decoded as UTF-8, undecodable
            bytes are ignored).
        owner_names: Names of the account owner, passed to
            ``guess_sender_type`` to tell outgoing from incoming messages.

    Returns:
        A list of dicts with the keys ``"Chat Session"``, ``"Message Date"``,
        ``"Type"`` and ``"Text"``. Blocks without any text are skipped.
        ``"Message Date"`` is formatted as ``YYYY-MM-DD HH:MM:SS`` when the
        timestamp could be parsed, otherwise it holds the raw timestamp text
        (or ``""`` if none was found).
    """
    html = Path(filepath).read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(html, "html.parser")

    def _extract_thread_title(soup, fp):
        """Title from <h1>, then <title>, then the folder or file name."""
        h1 = soup.find("h1")
        if h1 and h1.get_text(strip=True):
            return h1.get_text(strip=True)
        if soup.title and soup.title.string:
            return soup.title.get_text(strip=True)
        p = Path(fp)
        # inbox/<thread folder>/<file>.html -> use the thread folder name.
        if p.parent.parent.name.lower() == "inbox":
            return p.parent.name
        return p.stem

    def _drop_nested(candidates):
        """Keep only candidates that are not inside another candidate."""
        # Compare by identity: the candidates are nodes of the same tree.
        candidate_ids = {id(el) for el in candidates}
        return [
            el for el in candidates
            if not any(id(anc) in candidate_ids for anc in el.parents)
        ]

    thread_title = _extract_thread_title(soup, filepath)

    # candidate message blocks
    blocks = soup.find_all(["div", "li", "section"], class_=re.compile(r"\bmessage\b", re.I))
    if not blocks:
        # newer opaque classes
        opaque = soup.find_all("div", class_=re.compile(r"^_a6-(g|i)$"))
        # "_a6-i" is also one of the header classes used by _find_header_el.
        # A header nested in a message block would otherwise be processed as
        # a message of its own and emit an extra row containing only the
        # sender name, so nested candidates are dropped here.
        blocks = _drop_nested(opaque)
    if not blocks:
        text_or_body = re.compile("text|body", re.I)
        blocks = [
            blk
            for blk in soup.find_all(["div", "li", "section", "article"])
            if blk.find("p") or blk.find(class_=text_or_body)
        ]

    rows = []
    for msg in blocks:
        sender, dt, ts_text = _extract_sender_and_date(msg)

        body_el = _find_body_el(msg)
        body_text = clean_text(body_el)
        # Drop bottom date line if our body capture included it
        body_text = strip_trailing_date_lines(body_text)
        if body_el is msg and ts_text:
            # clean_text returns a single line, so when the whole block is
            # used as body the timestamp is never on a line of its own and
            # the line-based strip above cannot remove it. Cut the detected
            # timestamp off the end instead.
            ts_flat = re.sub(r"\s+", " ", ts_text).strip()
            if ts_flat and body_text.endswith(ts_flat):
                body_text = body_text[: -len(ts_flat)].rstrip()
        if not body_text:
            continue

        # Format as requested
        dt_str = (
            dt.strftime("%Y-%m-%d %H:%M:%S") if dt
            else (ts_text or "")
        )
        mtype = guess_sender_type(sender, owner_names)

        rows.append({
            "Chat Session": thread_title,
            "Message Date": dt_str,
            "Type": mtype,
            "Text": body_text,
        })
    return rows


## Diagnostics
Set `ROOT` and run `files = collect_html_files(ROOT)`, then try `debug_probe(files[0])` to check one sample.


In [None]:
# Example: list the HTML files under ROOT and probe the first one.
ROOT = './messages'
files = collect_html_files(ROOT)
print('Found', len(files), 'files')
print(files[:3])
if files:
    debug_probe(files[0])
else:
    # files[0] would raise IndexError here, e.g. when the messages directory
    # has not been copied into place yet.
    print('No HTML files found under', ROOT)

## Run Conversion
Fill in your paths and run.


In [None]:
# Input location and output file handed to run_conversion.
# NOTE(review): run_conversion is defined outside this excerpt; presumably it
# reads the export under INPUT_PATH and writes OUTPUT_CSV - confirm.
INPUT_PATH = './messages'
OUTPUT_CSV = 'insta_1.csv'
run_conversion(INPUT_PATH, OUTPUT_CSV)
# NOTE(review): this message is printed after run_conversion has already been
# called above, although its wording asks the user to call it.
print('Ready. Define INPUT_PATH/OUTPUT_CSV and call run_conversion(INPUT_PATH, OUTPUT_CSV)')