In [6]:
# =========================================================
# SPLIT MULTILINGÜE ES + PT
# =========================================================

import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Seed passed to the stratified splits and the final shuffles so that
# re-running the cell reproduces the same partitions.
SEED = 42

# ---------------------------
# 0) Output directory
# ---------------------------
OUT_DIR = "/content/data"   # standard location
os.makedirs(OUT_DIR, exist_ok=True)

# ---------------------------
# 1) Load ES data (comma-separated CSV)
# ---------------------------
# Leftover index columns from earlier exports are discarded when present;
# errors="ignore" makes the drop a no-op for columns that do not exist.
es = pd.read_csv("/content/train.csv").drop(
    columns=["Unnamed: 0", "index"], errors="ignore"
)

# Keep the four columns used downstream, then only the Spanish rows.
es = es[["stars", "review_title", "review_body", "language"]]
es = es.loc[es["language"].eq("es")].copy()

print("ES:", es.shape)

# ---------------------------
# 2) Load PT data (tab-separated file despite the .csv extension)
# ---------------------------
# NOTE(review): the file is decoded as latin1; if it was actually written as
# UTF-8 the accented characters would be silently garbled - confirm.
# NOTE(review): unlike the ES frame, rows are not filtered by language here;
# the language code is only lower-cased - confirm every row is "pt".
pt = (
    pd.read_csv(
        "/content/reviews_consolidado_perfecto_v2.csv",
        sep="\t",
        encoding="latin1",
        low_memory=False,
    )
    # Same four columns, in the same order, as the ES frame.
    .loc[:, ["stars", "review_title", "review_body", "language"]]
    # Normalise the language code to lower case.
    .assign(language=lambda frame: frame["language"].str.lower())
)

print("PT:", pt.shape)

# ---------------------------
# 3) Merge ES + PT
# ---------------------------
# ---------------------------
# 4) Define the label
# ---------------------------
# Stack both corpora under a fresh 0..n-1 index and cast the star rating to
# int so it can be compared against integer literals when labelling.
df = pd.concat([es, pt], ignore_index=True).astype({"stars": int})

def stars_to_sentiment(s):
    """Map a 1-5 star rating to its sentiment label.

    Args:
        s: Star rating; expected to be one of 1, 2, 3, 4 or 5.

    Returns:
        "Negativo" for 1-2 stars, "Neutro" for 3 stars and "Positivo" for
        4-5 stars.

    Raises:
        ValueError: If ``s`` is not a rating between 1 and 5. Previously any
            such value (0, 6, NaN, ...) fell through and was silently
            labelled "Positivo", corrupting the target column.
    """
    if s in (1, 2):
        return "Negativo"
    if s == 3:
        return "Neutro"
    if s in (4, 5):
        return "Positivo"
    raise ValueError(f"stars must be a rating from 1 to 5, got {s!r}")

# Derive the sentiment label from the star rating of each review.
df["sentiment"] = df["stars"].map(stars_to_sentiment)

# ---------------------------
# 5) Build the raw text
# ---------------------------
# Missing titles/bodies become empty strings so concatenation never yields NaN.
for text_col in ("review_title", "review_body"):
    df[text_col] = df[text_col].fillna("").astype(str)

title_clean = df["review_title"].str.strip()
body_clean = df["review_body"].str.strip()

# "<title>. <body>", trimmed at both ends.
# NOTE(review): when the title is empty the text starts with ". " and that
# separator also counts toward the minimum-length filter below.
df["text_raw"] = (title_clean + ". " + body_clean).str.strip()

# Discard reviews whose combined text is shorter than 5 characters.
has_min_length = df["text_raw"].str.len().ge(5)
df = df.loc[has_min_length].copy()

# ---------------------------
# 6) Stratum (language + sentiment)
# ---------------------------
# One key per (language, sentiment) pair, e.g. "es||Neutro".
df["strata"] = df["language"] + "||" + df["sentiment"]

# ---------------------------
# 7) Split 80 / 10 / 10
# ---------------------------
def _stratified_split(frame, holdout_fraction):
    """Split ``frame`` by its "strata" column, holding out ``holdout_fraction``."""
    return train_test_split(
        frame,
        test_size=holdout_fraction,
        random_state=SEED,
        stratify=frame["strata"],
    )


# First carve out 20% of the rows, then halve that part into validation/test.
train_df, temp_df = _stratified_split(df, 0.20)
val_df, test_df = _stratified_split(temp_df, 0.50)

# ---------------------------
# 8) Save the files
# ---------------------------
# Columns written to disk (the helper "strata" column is left out).
KEEP_COLS = [
    "text_raw",
    "stars",
    "sentiment",
    "language",
    "review_title",
    "review_body"
]


def _shuffle_for_export(frame):
    """Return the exported columns of ``frame`` in shuffled order with a fresh index."""
    return frame[KEEP_COLS].sample(frac=1, random_state=SEED).reset_index(drop=True)


train_out, val_out, test_out = (
    _shuffle_for_export(part) for part in (train_df, val_df, test_df)
)

train_path, val_path, test_path = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("train_es_pt.csv", "validation_es_pt.csv", "test_es_pt.csv")
)

# Write each split without the index column.
for out_frame, out_path in (
    (train_out, train_path),
    (val_out, val_path),
    (test_out, test_path),
):
    out_frame.to_csv(out_path, index=False)

# ---------------------------
# 9) Sanity check
# ---------------------------
def report(df, name):
    """Print a short summary of one split.

    Prints, in order: the split name with the frame's shape, the row count
    per language, the overall sentiment proportions and the sentiment
    proportions within each language (both rounded to 3 decimals).

    Args:
        df: Frame with "language" and "sentiment" columns.
        name: Label printed in the header line.
    """
    print(f"\n{name}: {df.shape}")

    rows_per_language = df["language"].value_counts()
    print(rows_per_language)

    sentiment_share = df["sentiment"].value_counts(normalize=True)
    print(sentiment_share.round(3))

    # normalize="index" makes every language row sum to 1.
    share_within_language = pd.crosstab(
        df["language"], df["sentiment"], normalize="index"
    )
    print(share_within_language.round(3))

# Summarise every split, then list where each file was written.
for split_frame, split_label in (
    (train_out, "TRAIN"),
    (val_out, "VALIDATION"),
    (test_out, "TEST"),
):
    report(split_frame, split_label)

print("\nArchivos generados correctamente:")
for written_path in (train_path, val_path, test_path):
    print(written_path)

ES: (200000, 4)
PT: (230882, 4)

TRAIN: (344657, 6)
language
pt    184657
es    160000
Name: count, dtype: int64
sentiment
Positivo    0.585
Negativo    0.277
Neutro      0.138
Name: proportion, dtype: float64
sentiment  Negativo  Neutro  Positivo
language                             
es            0.400   0.200     0.400
pt            0.171   0.084     0.745

VALIDATION: (43082, 6)
language
pt    23082
es    20000
Name: count, dtype: int64
sentiment
Positivo    0.585
Negativo    0.277
Neutro      0.138
Name: proportion, dtype: float64
sentiment  Negativo  Neutro  Positivo
language                             
es            0.400   0.200     0.400
pt            0.171   0.084     0.745

TEST: (43083, 6)
language
pt    23083
es    20000
Name: count, dtype: int64
sentiment
Positivo    0.585
Negativo    0.277
Neutro      0.138
Name: proportion, dtype: float64
sentiment  Negativo  Neutro  Positivo
language                             
es            0.400   0.200     0.400
pt            0.17