In [1]:
import pandas as pd
import numpy as np

# Basic models and tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# XGBoost and LightGBM
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# VotingClassifier
from sklearn.ensemble import VotingClassifier

# ========== 1. Load data ==========

df = pd.read_csv('food_partial_labeled.csv')
# Columns: "food"(string), "category"(string), remaining columns are numeric features

# ========== 2. Split labeled/unlabeled data ==========

df_labeled = df.dropna(subset=["category"]).copy()   # labeled data
df_unlabeled = df[df["category"].isna()].copy()      # unlabeled data

# ========== 3. Define feature columns ==========

num_cols = df.columns.difference(["food", "category"])
features = list(num_cols)

# ========== 4. Prepare X and y ==========

X_labeled = df_labeled[features].copy()
y_labeled = df_labeled["category"].copy()

# Fill missing values (using mean)
X_labeled = X_labeled.fillna(X_labeled.mean())

# Encode labels ("meats", "vegetables", "carbs")
le = LabelEncoder()
y_labeled_encoded = le.fit_transform(y_labeled)

# ========== 5. Train-validation split and scaling ==========

X_train, X_val, y_train, y_val = train_test_split(
    X_labeled, y_labeled_encoded, 
    test_size=0.2, random_state=42, 
    stratify=y_labeled_encoded
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# ========== 6. Define four base models ==========

clf_lr = LogisticRegression(random_state=42)
clf_svc = SVC(probability=True, random_state=42)
clf_xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
clf_lgb = LGBMClassifier(random_state=42)

# ========== 7. Build VotingClassifier (soft voting) ==========

voting_clf = VotingClassifier(
    estimators=[
        ("lr", clf_lr),
        ("svc", clf_svc),
        ("xgb", clf_xgb),
        ("lgb", clf_lgb)
    ],
    voting="soft"
)

voting_clf.fit(X_train_scaled, y_train)

# Evaluate on validation set
val_accuracy = voting_clf.score(X_val_scaled, y_val)
print("Validation Accuracy:", val_accuracy)

# ========== 8. Predict unlabeled data and mark uncertain predictions as "others" ==========

X_unlabeled = df_unlabeled[features].copy()
X_unlabeled = X_unlabeled.fillna(X_unlabeled.mean())
X_unlabeled_scaled = scaler.transform(X_unlabeled)

proba_unlabeled = voting_clf.predict_proba(X_unlabeled_scaled)

threshold = 0.9  # Adjust threshold if necessary
pred_labels = []
for row_prob in proba_unlabeled:
    max_prob = np.max(row_prob)
    class_idx = np.argmax(row_prob)
    if max_prob < threshold:
        pred_labels.append("others")
    else:
        pred_labels.append(le.inverse_transform([class_idx])[0])

df_unlabeled["category_pred"] = pred_labels

# ========== 9. Post-processing: Change "meats" predictions containing dairy keywords to "others" ==========

dairy_keywords = ["cheese", "cream", "milk"]

for i in df_unlabeled.index:
    predicted_cat = df_unlabeled.at[i, "category_pred"]
    food_name = str(df_unlabeled.at[i, "food"]).lower()

    if predicted_cat == "meats" and any(kw in food_name for kw in dairy_keywords):
        df_unlabeled.at[i, "category_pred"] = "others"

# ========== 10. Display statistics ==========

print("Unlabeled data prediction distribution:")
print(df_unlabeled["category_pred"].value_counts())

print("\nDetailed statistics by category:")
for cat_name, count in df_unlabeled["category_pred"].value_counts().items():
    print(f"{cat_name}: {count} samples")

# Print foods predicted as vegetables
print("\nFoods predicted as vegetables:")
vegetables_records = df_unlabeled[df_unlabeled["category_pred"] == "vegetables"]
if not vegetables_records.empty:
    print(f"Total {len(vegetables_records)} samples:")
    for food in vegetables_records["food"]:
        print(f"- {food}")
else:
    print("No samples predicted as vegetables")

# Print foods predicted as meats
print("\nFoods predicted as meats:")
meats_records = df_unlabeled[df_unlabeled["category_pred"] == "meats"]
if not meats_records.empty:
    print(f"Total {len(meats_records)} samples:")
    for food in meats_records["food"]:
        print(f"- {food}")
else:
    print("No samples predicted as meats")

# Print foods predicted as carbs
print("\nFoods predicted as carbs:")
carbs_records = df_unlabeled[df_unlabeled["category_pred"] == "carbs"]
if not carbs_records.empty:
    print(f"Total {len(carbs_records)} samples:")
    for food in carbs_records["food"]:
        print(f"- {food}")
else:
    print("No samples predicted as carbs")

# ========== (Optional) Save results ==========

df_unlabeled.to_csv("food_labeled.csv", index=False)


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 506
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 34
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Validation Accuracy: 1.0
Unlabeled data prediction distribution:
category_pred
others        1793
meats          258
carbs          164
vegetables     120
Name: count, dtype: int64

每个类别的详细统计：
others: 1793 个样本
meats: 258 个样本
carbs: 164 个样本
vegetables: 120 个样本

所有预测为vegetables类别的食物：
共找到 120 个vegetables样本：
- broccoli raab raw
- napa cabbage cooked
- kelp
- shallots raw
- red lettuce
- romanesco raw
- lemongrass citronella
- celery cooked
- serrano pepper
- sweet potato leaves cooked
- c

