## **Laboratorio 6**
- Joaquín Campos - 22155
- Sofía García - 22210
- Julio García Salas - 22076

## **Inciso 1 y 2**

In [None]:
# Paso 1–2 (versión robusta): Cargar y normalizar tweets
# Soporta:
#  - JSON array / objeto / JSONL (una por línea)
#  - Texto plano: 1 tweet por línea (fallback), con extracción de @menciones, #hashtags, RT y reply
#
# Salidas:
#  - Imprime "RESUMEN DE CARGA (Paso 1–2)" al final
#  - Guarda 'tweets_raw_sample.csv' con 50 filas si hay datos
#  - Intenta guardar 'tweets_raw.parquet' si tienes pyarrow/fastparquet

from pathlib import Path
import json
import pandas as pd
import re
from typing import List, Dict, Any, Optional

# --- Localiza archivos en el directorio actual o en /mnt/data ---
def resolve_paths():
    """Return the path to use for each expected input file.

    For every expected filename the current directory is checked first and
    ``/mnt/data`` second. When the file is found in neither place the local
    (possibly non-existent) path is returned anyway, so the caller can
    report it as missing.

    Returns:
        A list of ``Path`` objects, one per expected file, in fixed order.
    """
    filenames = ("traficogt.txt", "tioberny.txt")
    mounted_dir = Path("/mnt/data")

    resolved = []
    for filename in filenames:
        here = Path(filename)
        mounted = mounted_dir / filename
        # Only switch to the mounted copy when the local one is absent and
        # the mounted one actually exists.
        use_mounted = not here.exists() and mounted.exists()
        resolved.append(mounted if use_mounted else here)
    return resolved

# Input files to load, resolved once (local directory first, /mnt/data second).
DATA_PATHS = resolve_paths()
# Output artifacts; both are relative paths, so they land in the working directory.
OUT_PARQUET = Path("tweets_raw.parquet")
OUT_SAMPLE_CSV = Path("tweets_raw_sample.csv")

# --- Utilidades de parseo ---
def read_any_json_whole(raw_stripped: str):
    """Parse the entire content as one JSON document.

    Args:
        raw_stripped: Full file content, already stripped of surrounding
            whitespace.

    Returns:
        A list of dict records when the document is a JSON array (non-dict
        items are dropped), a dict wrapping a list under ``"tweets"`` or
        ``"data"``, or a single dict (returned as a one-element list).
        ``None`` when the content is not valid JSON or is a JSON scalar.
    """
    try:
        parsed = json.loads(raw_stripped)
    except Exception:
        return None

    if isinstance(parsed, dict):
        # A wrapper object may carry the records under a well-known key;
        # "tweets" takes precedence over "data".
        for wrapper_key in ("tweets", "data"):
            candidate = parsed.get(wrapper_key)
            if isinstance(candidate, list):
                return [item for item in candidate if isinstance(item, dict)]
        return [parsed]

    if isinstance(parsed, list):
        return [item for item in parsed if isinstance(item, dict)]

    return None

def read_jsonl_lines(raw: str):
    """Parse the content line by line as JSONL.

    Each line is stripped and a trailing comma is removed before parsing.
    Blank lines, lines that are not valid JSON, and lines whose JSON value
    is not an object are skipped.

    Args:
        raw: Full file content.

    Returns:
        A tuple ``(records, n_json_ok)`` where ``records`` is the list of
        parsed dicts and ``n_json_ok`` is how many lines were accepted.
    """
    records = []
    for raw_line in raw.splitlines():
        candidate = raw_line.strip().rstrip(",")
        if candidate == "":
            continue
        try:
            parsed = json.loads(candidate)
        except Exception:
            continue
        if isinstance(parsed, dict):
            records.append(parsed)
    # Every accepted line contributes exactly one record.
    return records, len(records)

def extract_json_from_line(line: str):
    """Try to recover a ``{...}`` JSON object embedded in a line.

    The span from the first ``{`` to the last ``}`` is parsed with
    ``json.loads``.

    Args:
        line: A single line of text that is not necessarily pure JSON.

    Returns:
        The parsed dict, or ``None`` when no brace pair is found or the
        span between them is not a valid JSON object.
    """
    first_brace = line.find("{")
    last_brace = line.rfind("}")
    if first_brace == -1 or last_brace == -1 or last_brace <= first_brace:
        return None

    try:
        candidate = json.loads(line[first_brace:last_brace + 1])
    except Exception:
        return None
    return candidate if isinstance(candidate, dict) else None

def norm_username(u: Optional[str]) -> Optional[str]:
    """Normalize a user handle: trim, drop one leading ``@``, lowercase.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if not u:
        return u
    trimmed = u.strip()
    bare = trimmed[1:] if trimmed.startswith("@") else trimmed
    return bare.lower()

def extract_list_usernames(mentioned: Any) -> List[str]:
    """Collect normalized usernames from a list of mention entries.

    Entries may be plain strings or dicts; for dicts the first truthy value
    among ``username``, ``screen_name`` and ``name`` is used. Anything that
    is not a list yields an empty result, and entries that normalize to an
    empty value are dropped.
    """
    if not isinstance(mentioned, list):
        return []

    collected: List[str] = []
    for entry in mentioned:
        if isinstance(entry, str):
            collected.append(norm_username(entry))
        elif isinstance(entry, dict):
            handle = entry.get("username") or entry.get("screen_name") or entry.get("name")
            if handle:
                collected.append(norm_username(handle))
    return [name for name in collected if name]

def hashtags_to_list(h: Any) -> List[str]:
    """Convert a list of hashtag entries into lowercase tags without ``#``.

    String entries are used directly. Dict entries contribute their ``text``
    value (or ``tag`` when ``text`` is falsy) converted to ``str``; dicts
    with neither are skipped. Any non-list input yields an empty list.
    """
    if not isinstance(h, list):
        return []

    tags: List[str] = []
    for entry in h:
        if isinstance(entry, dict):
            label = entry.get("text") or entry.get("tag")
            if not label:
                continue
            entry = str(label)
        elif not isinstance(entry, str):
            continue
        tags.append(entry.lstrip("#").lower())
    return tags

def get_text(rec: Dict[str, Any]) -> Optional[str]:
    """Return the tweet text from the first usable text field.

    Fields are tried in the order ``rawContent``, ``full_text``, ``text``.
    A field is usable when it is a string that is not blank; the value is
    returned as stored (not stripped). Returns ``None`` when none qualifies.
    """
    candidates = (rec.get(key) for key in ("rawContent", "full_text", "text"))
    return next(
        (value for value in candidates if isinstance(value, str) and value.strip()),
        None,
    )

def get_user_obj(rec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Return the record's ``user`` value when it is a dict, else ``None``."""
    candidate = rec.get("user")
    if isinstance(candidate, dict):
        return candidate
    return None

def safe_int(x):
    """Convert ``x`` with ``int()``; return ``None`` if the conversion raises."""
    try:
        converted = int(x)
    except Exception:
        converted = None
    return converted

# Regexes used by the plain-text fallback.
# Handles are matched as 1-15 ASCII letters, digits or underscores.
MENTION_RE = re.compile(r"@([A-Za-z0-9_]{1,15})")
# Hashtags may contain accented / non-ASCII letters (e.g. Spanish "#Tráfico").
# For str patterns ``\w`` is Unicode-aware, so the whole tag is captured
# instead of being cut at the first non-ASCII character.
HASHTAG_RE = re.compile(r"#(\w+)")

def _first_not_none(rec: Dict[str, Any], *keys: str) -> Any:
    """Return the value of the first key in ``keys`` that is present and not None."""
    for key in keys:
        value = rec.get(key)
        if value is not None:
            return value
    return None


def _embedded_username(tweet: Any) -> Optional[str]:
    """Return the normalized author handle of an embedded tweet dict, else None."""
    if not isinstance(tweet, dict):
        return None
    user = tweet.get("user")
    # The "user" key may be present but null (or not a dict); treat as unknown.
    if not isinstance(user, dict):
        return None
    return norm_username(user.get("username"))


def rows_from_json(records: List[Dict[str, Any]], source_file: str) -> List[Dict[str, Any]]:
    """Normalize parsed tweet dicts into flat row dicts.

    Both camelCase fields (``rawContent``, ``mentionedUsers``, ``likeCount``
    ...) and snake_case fields (``full_text``, ``entities``,
    ``favorite_count`` ...) are read, with the camelCase name taking
    precedence when both are present.

    Args:
        records: Tweet records as parsed from JSON.
        source_file: Name of the file the records came from; stored on
            every row.

    Returns:
        One dict per record with the keys ``source_file``, ``tweet_id``,
        ``date``, ``lang``, ``username``, ``user_id``, ``text``,
        ``mentions``, ``hashtags``, ``is_retweet``, ``is_quote``,
        ``retweeted_user``, ``quoted_user``, ``reply_to_user``,
        ``in_reply_to_tweet_id``, ``like_count``, ``retweet_count``,
        ``reply_count``, ``quote_count``, ``view_count`` and ``raw_record``.
    """
    rows: List[Dict[str, Any]] = []
    for r in records:
        u = get_user_obj(r)
        uname = None
        uid = None
        if u:
            uname = u.get("username") or u.get("screen_name") or u.get("name")
            uid = u.get("id") or u.get("id_str")

        # Mentions: a top-level "mentionedUsers" key wins over "entities".
        mentions = []
        if "mentionedUsers" in r:
            mentions = extract_list_usernames(r.get("mentionedUsers"))
        elif "entities" in r and isinstance(r["entities"], dict):
            mentions = extract_list_usernames(r["entities"].get("user_mentions"))

        # Hashtags: same precedence as mentions.
        if "hashtags" in r:
            tags = hashtags_to_list(r.get("hashtags"))
        elif "entities" in r and isinstance(r["entities"], dict):
            tags = hashtags_to_list(r["entities"].get("hashtags"))
        else:
            tags = []

        # Retweet / quote: flagged whenever the embedded tweet is not None.
        rt = r.get("retweetedTweet")
        qt = r.get("quotedTweet")
        is_rt = rt is not None
        is_qt = qt is not None
        rt_user = _embedded_username(rt)
        qt_user = _embedded_username(qt)

        # Reply target: nested user object first, flat screen name second.
        in_reply_to_user = r.get("inReplyToUser")
        reply_to_username = None
        if isinstance(in_reply_to_user, dict):
            reply_to_username = norm_username(in_reply_to_user.get("username"))
        if not reply_to_username and r.get("in_reply_to_screen_name"):
            reply_to_username = norm_username(r.get("in_reply_to_screen_name"))

        # Metrics: pick the first key that is not None rather than using
        # ``a or b``, so a genuine count of 0 is kept instead of becoming None.
        like_count = safe_int(_first_not_none(r, "likeCount", "favorite_count"))
        rt_count = safe_int(_first_not_none(r, "retweetCount", "retweet_count"))
        reply_count = safe_int(_first_not_none(r, "replyCount", "reply_count"))
        quote_count = safe_int(_first_not_none(r, "quoteCount", "quote_count"))
        view_count = safe_int(_first_not_none(r, "viewCount", "views"))

        # Date: missing values are set to NaT explicitly instead of relying
        # on how ``pd.to_datetime`` treats None; unparseable values become NaT.
        date_raw = r.get("date") or r.get("created_at")
        if date_raw is None:
            date_parsed = pd.NaT
        else:
            try:
                date_parsed = pd.to_datetime(date_raw)
            except Exception:
                date_parsed = pd.NaT

        rows.append({
            "source_file": source_file,
            "tweet_id": r.get("id") or r.get("id_str"),
            "date": date_parsed,
            "lang": r.get("lang"),
            "username": norm_username(uname),
            "user_id": uid,
            "text": get_text(r),
            "mentions": mentions,
            "hashtags": tags,
            "is_retweet": bool(is_rt),
            "is_quote": bool(is_qt),
            "retweeted_user": rt_user,
            "quoted_user": qt_user,
            "reply_to_user": reply_to_username,
            "in_reply_to_tweet_id": r.get("inReplyToTweetId") or r.get("in_reply_to_status_id_str") or r.get("in_reply_to_status_id"),
            "like_count": like_count,
            "retweet_count": rt_count,
            "reply_count": reply_count,
            "quote_count": quote_count,
            "view_count": view_count,
            "raw_record": r,
        })
    return rows

def rows_from_plaintext(lines: List[str], source_file: str) -> List[Dict[str, Any]]:
    """Build normalized rows from plain text, treating each line as one tweet.

    Mentions, hashtags, the retweet flag and the reply target are derived
    from the text alone; fields that cannot be known from plain text are
    left as ``None`` (``pd.NaT`` for the date). Blank lines are skipped but
    still advance the line counter used for the synthetic ``tweet_id``.

    Args:
        lines: Raw text lines.
        source_file: Name of the originating file; stored on every row and
            used as the prefix of the synthetic id.

    Returns:
        One row dict per non-blank line.
    """
    rt_pattern = r"^\s*RT\s+@([A-Za-z0-9_]{1,15})\b"
    reply_pattern = r"^\s*@([A-Za-z0-9_]{1,15})\b"

    out = []
    line_no = 0
    for raw_line in lines:
        line_no += 1
        text = raw_line.strip()
        if text == "":
            continue

        # Typical retweet shape: "RT @user: ..."
        rt_match = re.match(rt_pattern, text)
        # A tweet that opens with "@user" is treated as a reply to that user.
        reply_match = re.match(reply_pattern, text)

        record = {
            "source_file": source_file,
            "tweet_id": f"{source_file}:{line_no}",  # synthetic id from the line number
            "date": pd.NaT,
            "lang": None,
            "username": None,        # author is unknown in plain text
            "user_id": None,
            "text": text,
            "mentions": list(map(str.lower, MENTION_RE.findall(text))),
            "hashtags": list(map(str.lower, HASHTAG_RE.findall(text))),
            "is_retweet": rt_match is not None,
            "is_quote": False,       # not detectable from text alone
            "retweeted_user": rt_match.group(1).lower() if rt_match else None,
            "quoted_user": None,
            "reply_to_user": reply_match.group(1).lower() if reply_match else None,
            "in_reply_to_tweet_id": None,
            "like_count": None,
            "retweet_count": None,
            "reply_count": None,
            "quote_count": None,
            "view_count": None,
            "raw_record": {"_raw_line": text},
        }
        out.append(record)
    return out

# --- Load every input file ---
def _read_text_detect_bom(path: Path) -> str:
    """Read ``path`` as text, honoring a UTF-16 or UTF-8 byte-order mark.

    Without this, a BOM-prefixed file decoded as plain UTF-8 either loses
    bytes (UTF-16) or keeps a leading U+FEFF (UTF-8 BOM), and in both cases
    JSON parsing of the content fails. Files without a BOM are decoded as
    UTF-8 with undecodable bytes ignored, as before.
    """
    data = path.read_bytes()
    if data.startswith((b"\xff\xfe", b"\xfe\xff")):
        return data.decode("utf-16", errors="ignore")
    if data.startswith(b"\xef\xbb\xbf"):
        return data.decode("utf-8-sig", errors="ignore")
    return data.decode("utf-8", errors="ignore")


all_rows: List[Dict[str, Any]] = []
summary = []
for p in DATA_PATHS:
    status = "NO_FILE"
    n_lines = 0
    n_json_whole = 0
    n_jsonl = 0
    n_json_inline = 0
    used_plaintext = 0

    if p.exists():
        status = "OK"
        raw = _read_text_detect_bom(p)
        lines = raw.splitlines()
        n_lines = len(lines)

        # 1) Try the whole content as a single JSON document.
        recs = read_any_json_whole(raw.strip())
        if isinstance(recs, list) and recs:
            all_rows.extend(rows_from_json(recs, p.name))
            n_json_whole = len(recs)
        else:
            # 2) Try JSONL (one JSON object per line).
            recs_jsonl, ok = read_jsonl_lines(raw)
            if ok > 0:
                all_rows.extend(rows_from_json(recs_jsonl, p.name))
                n_jsonl = ok
            else:
                # 3) Per line: embedded JSON object, otherwise plain text.
                temp_rows = []
                for line_no, line in enumerate(lines, start=1):
                    obj = extract_json_from_line(line)
                    if obj is not None:
                        n_json_inline += 1
                        temp_rows.extend(rows_from_json([obj], p.name))
                        continue

                    plain_rows = rows_from_plaintext([line], p.name)
                    # The helper numbers lines within the list it receives,
                    # which is always a single line here; stamp the real
                    # file line number so synthetic ids stay unique.
                    for row in plain_rows:
                        row["tweet_id"] = f"{p.name}:{line_no}"
                    temp_rows.extend(plain_rows)
                    # Count rows actually produced (blank lines yield none).
                    used_plaintext += len(plain_rows)
                all_rows.extend(temp_rows)

    summary.append({
        "file": p.name,
        "status": status,
        "n_lines": n_lines,
        "n_json_whole": n_json_whole,
        "n_jsonl": n_jsonl,
        "n_json_inline": n_json_inline,
        "used_plaintext": used_plaintext,
    })

# --- Final DataFrame and outputs for the following steps ---
df = pd.DataFrame(all_rows)
if not df.empty:
    # Sort chronologically when a date column exists; undated rows go last.
    if "date" in df.columns:
        df = df.sort_values("date", na_position="last").reset_index(drop=True)

    # to_csv creates (or truncates) the file itself, so no pre-touch is needed.
    df.head(50).to_csv(OUT_SAMPLE_CSV, index=False)

    # Parquet is best-effort. Distinguish "no engine installed" from other
    # failures so the summary does not blame a missing package when the
    # real problem is, e.g., a column that cannot be serialized.
    try:
        df.to_parquet(OUT_PARQUET, index=False)
    except ImportError:
        parquet_path = "(No se guardó Parquet: instala 'pyarrow' o 'fastparquet')"
    except Exception as exc:
        parquet_path = f"(No se guardó Parquet: {type(exc).__name__}: {exc})"
    else:
        parquet_path = str(OUT_PARQUET.resolve())
else:
    parquet_path = "(DataFrame vacío)"

# --- Summary report ---
print("=== RESUMEN DE CARGA (Paso 1–2) ===")
print("Archivos buscados:")
for p in DATA_PATHS:
    print(" -", p)

# One fixed-width line per input file with the per-strategy counters.
for s in summary:
    report_line = (
        f"- {s['file']:15s} | estado={s['status']:7s} | líneas={s['n_lines']:5d} | "
        f"JSON_global={s['n_json_whole']:5d} | JSONL={s['n_jsonl']:5d} | "
        f"JSON_inline={s['n_json_inline']:5d} | texto_plano={s['used_plaintext']:5d}"
    )
    print(report_line)

print(f"\nTotal de filas normalizadas: {len(df):,}")
if df.empty:
    print("Nota: DataFrame vacío. Si tus archivos tienen un formato distinto, compárteme 10–15 líneas de ejemplo.")
else:
    print("Columnas:", list(df.columns))
    print(f"Muestra CSV (50 filas): {OUT_SAMPLE_CSV.resolve()}")
    print(f"Parquet: {parquet_path}")


=== RESUMEN DE CARGA (Paso 1–2) ===
Archivos buscados (existentes marcados como OK más abajo):
 - traficogt.txt
 - tioberny.txt
- traficogt.txt   | estado=OK      | líneas=11209 | objetos_JSON=    0
- tioberny.txt    | estado=OK      | líneas=10039 | objetos_JSON=    0

Total de filas normalizadas: 0
Nota: No se generaron archivos de salida porque no se detectaron objetos JSON válidos.
