In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor

# ===== 1) Load the data =====
# All three CSVs live in the same Kaggle input directory.
DATA_DIR = "/kaggle/input/imdb-rotten-tomatoes"
train_path, test_path, sub_path = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# train: labelled rows; test: rows to predict; sub: submission template.
train, test, sub = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path, sub_path)
)

# ===== 2) Auto-detect the target column =====
# The target is expected to be the single column present in train but absent
# from test. Walking train.columns (rather than taking a set difference) keeps
# the candidate order deterministic, so the error message below is stable.
test_columns = set(test.columns)
target_candidates = [col for col in train.columns if col not in test_columns]

if len(target_candidates) == 1:
    target = target_candidates[0]
    print("Detected target:", target)
elif sub.shape[1] >= 2:
    # Ambiguous column difference: fall back to the second column of the
    # sample submission as the presumed target name.
    target = sub.columns[1]
    print("[Warn] 無法用欄位差唯一決定 target，改用 sample_submission 第2欄推定:", target)
else:
    raise ValueError(f"找不到唯一 target 候選：{target_candidates}，請檢查 train/test 欄位")

# The fallback guess is not guaranteed to exist in train; fail here with a
# clear message instead of an opaque KeyError from drop() below.
if target not in train.columns:
    raise ValueError(f"推定的 target 欄位 {target!r} 不存在於 train，請檢查 train/test 欄位")

# Feature frame and label series used by every later step.
X = train.drop(columns=[target])
y = train[target]

# ===== 3) Decide the task: classification vs regression =====
# Rule: a non-numeric target, or a numeric target with few distinct values
# (<= 20, all of them integer-valued), is treated as classification.
# Everything else is treated as regression.
unique_vals = y.dropna().unique()
is_numeric = pd.api.types.is_numeric_dtype(y)

if not is_numeric:
    task = "classification"
elif len(unique_vals) <= 20 and np.all(np.mod(unique_vals, 1) == 0):
    # mod 1 == 0 for every distinct value means the labels are whole numbers.
    task = "classification"
else:
    task = "regression"

print("Task:", task, "| y dtype:", y.dtype, "| unique:", len(unique_vals))

# ===== 4) Split columns by type: text / categorical / numeric =====
text_cols = []
cat_cols = []
num_cols = []

for c in X.columns:
    column = X[c]
    # Treat both object columns and pandas' dedicated string dtypes as
    # string-like; checking only `dtype == "object"` would send string-dtype
    # columns to num_cols, where the median imputer cannot handle them.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        # Free text vs. category is decided by the average whitespace-separated
        # word count over the first 200 non-null values: an average of at
        # least 5 words means text, anything shorter is a category.
        sample = column.dropna().astype(str).head(200)
        avg_words = sample.apply(lambda s: len(s.split())).mean() if len(sample) else 0
        if avg_words >= 5:
            text_cols.append(c)
        else:
            cat_cols.append(c)
    else:
        num_cols.append(c)

print("num_cols:", len(num_cols), "cat_cols:", len(cat_cols), "text_cols:", len(text_cols))
if text_cols:
    print("text_cols example:", text_cols[:5])

# ===== 5) Preprocessors: numeric imputation + categorical one-hot + text TF-IDF =====
# Numeric columns: fill missing values with the column median.
numeric_steps = [("imputer", SimpleImputer(strategy="median"))]
numeric_tf = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_tf = Pipeline(steps=categorical_steps)

# Text: merge multiple text columns into a single string per row before TF-IDF
# (simple and effective).
def join_text_columns(df):
    """Join every column of *df* row-wise into one space-separated string.

    The function works on the columns it is given instead of reading a module
    global, so it does not depend on state outside the frame passed in.

    Args:
        df: DataFrame holding only the text columns to merge. Missing values
            are replaced by empty strings before joining.

    Returns:
        A Series with one joined string per input row, indexed like *df*.
    """
    as_text = df.fillna("").astype(str)
    return as_text.agg(" ".join, axis=1)

from sklearn.preprocessing import FunctionTransformer

# Text pipeline: collapse the selected text columns into one string per row,
# then vectorise with TF-IDF over word 1- to 3-grams.
# join_text_columns is passed directly instead of through a lambda wrapper:
# the wrapper added nothing, and lambdas cannot be pickled, which would
# prevent saving the fitted pipeline.
text_tf = Pipeline(steps=[
    ("join", FunctionTransformer(join_text_columns, validate=False)),
    ("tfidf", TfidfVectorizer(
        max_features=50000,   # cap on vocabulary size
        ngram_range=(1, 3),
        min_df=2,             # drop terms appearing in fewer than 2 documents
    )),
])

# Register a transformer only for column groups that are non-empty.
candidate_transformers = [
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols),
    ("txt", text_tf, text_cols),
]
transformers = [spec for spec in candidate_transformers if spec[2]]

# Columns not listed in any group are dropped.
preprocess = ColumnTransformer(
    transformers=transformers,
    remainder="drop",
    sparse_threshold=0.5,
)

# ===== 6) Model: one robust baseline each for classification / regression =====
# Logistic regression for classification; Ridge for (text) regression.
model = (
    LogisticRegression(max_iter=3000, n_jobs=-1)
    if task == "classification"
    else Ridge(alpha=2.0)
)

# Full pipeline: preprocessing followed by the chosen estimator.
pipeline_steps = [("preprocess", preprocess), ("model", model)]
clf = Pipeline(steps=pipeline_steps)

# ===== 7) Local hold-out validation =====
# 80/20 split; stratify on the label only for classification with more than
# one distinct class.
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if task == "classification" and len(unique_vals) > 1 else None
)

clf.fit(X_tr, y_tr)
pred_va = clf.predict(X_va)

if task == "classification":
    acc = accuracy_score(y_va, pred_va)
    f1 = f1_score(y_va, pred_va, average="macro")
    print(f"Validation Accuracy: {acc:.4f} | Macro-F1: {f1:.4f}")
else:
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated and later removed from scikit-learn,
    # so relying on it raises TypeError on current releases.
    rmse = float(np.sqrt(mean_squared_error(y_va, pred_va)))
    mae  = mean_absolute_error(y_va, pred_va)
    print(f"Validation RMSE: {rmse:.4f} | MAE: {mae:.4f}")

# ===== 8) Refit on all data + write the submission =====
clf.fit(X, y)

# Decide whether the submission should contain probabilities or class labels,
# based on what the sample submission's target column looks like.
need_proba = False
if task == "classification":
    sub_col = sub.columns[1]
    # Coerce the sample target column to numbers; unparseable values become NaN.
    sub_vals_num = pd.to_numeric(sub[sub_col], errors='coerce')

    # 1) The column is float-typed, or every value parses as a number.
    looks_numeric = pd.api.types.is_float_dtype(sub[sub_col]) or sub_vals_num.notna().all()

    # 2) All parsed values lie within [0, 1] (bounds included) and at least
    #    one value parsed successfully.
    looks_like_prob = sub_vals_num.dropna().between(0, 1, inclusive="both").all() and sub_vals_num.notna().any()

    # 3) The fitted estimator exposes predict_proba.
    fitted_model = clf.named_steps["model"]
    supports_proba = hasattr(fitted_model, "predict_proba")

    # 4) Only a binary problem has a single meaningful probability column to
    #    submit; for multiclass, the last column alone would be wrong, so fall
    #    back to class labels.
    is_binary = len(getattr(fitted_model, "classes_", ())) == 2

    need_proba = bool(supports_proba and is_binary and looks_numeric and looks_like_prob)

if need_proba:
    # Last column = probability of the second (last) entry in classes_.
    test_pred = clf.predict_proba(test)[:, -1]
else:
    test_pred = clf.predict(test)

# Reuse the sample submission layout and overwrite its second column.
submission = sub.copy()
submission[submission.columns[1]] = test_pred
submission.to_csv("submission.csv", index=False)

print("Saved: submission.csv")
submission.head()


Detected target: sentiment
Task: classification | y dtype: object | unique: 2
num_cols: 0 cat_cols: 0 text_cols: 1
text_cols example: ['review']
Validation Accuracy: 0.9008 | Macro-F1: 0.9007
Saved: submission.csv


Unnamed: 0,id,sentiment
0,0,negative
1,1,positive
2,2,negative
3,3,negative
4,4,positive
