In [54]:
!pip install scikit-learn==1.3.2

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting scikit-learn==1.3.2
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/d0/0b/26ad95cf0b747be967b15fb71a06f5ac67aba0fd2f9cd174de6edefc4674/scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m134.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.0
    Uninstalling scikit-learn-1.2.0:
      Successfully uninstalled scikit-learn-1.2.0
Successfully installed scikit-learn-1.3.2


In [55]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import pandas as pd


In [56]:
# Load the three spreadsheets and stack them into a single frame.
low_calories = pd.read_excel('Low_Calorie_Foods.xlsx')
medium_calories = pd.read_excel('Medium_Calorie_Foods.xlsx')
high_calories = pd.read_excel('High_Calorie_Foods.xlsx')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it,
# each file keeps its own row labels, so the same label would appear up to three
# times and label-based lookups (.loc) would return several rows.
data1 = pd.concat([low_calories, medium_calories, high_calories], ignore_index=True)

In [57]:
# Add the classification target by binning 'Calories_numeric' into three levels:
#   0 -> [0, 100], 1 -> (100, 300], 2 -> (300, inf)
# include_lowest=True closes the first bin on the left; with the default
# (left-open) bins a value of exactly 0 would fall outside every bin and be
# labelled NaN. Negative or missing values still map to NaN.
data1['Calorie_Level'] = pd.cut(
    data1['Calories_numeric'],
    bins=[0, 100, 300, np.inf],
    labels=[0, 1, 2],
    include_lowest=True,
)

In [58]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

# 特征工程
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data1['Food'])

In [59]:
# 将数据集分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, data1['Calorie_Level'], test_size=0.2, random_state=42)

In [60]:
# Hyperparameter tuning: exhaustive search over 3 * 4 * 3 = 36 random-forest
# configurations, each scored with 5-fold cross-validation on the training set.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
# n_jobs=-1 runs the 180 independent fits on all available cores. The estimator
# is seeded (random_state=42), so the selected parameters do not depend on the
# degree of parallelism.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [61]:
# Best model: the estimator GridSearchCV refit on the full training set using
# the best-scoring parameter combination (refit is left at its default here).
best_model = grid_search.best_estimator_

In [62]:
# Re-evaluate the tuned model on the held-out test set.
predictions = best_model.predict(X_test)

# Compute the metrics first, then report them together.
test_accuracy = accuracy_score(y_test, predictions)
test_confusion = confusion_matrix(y_test, predictions)

print("Best parameters:", grid_search.best_params_)
print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)

Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.49557522123893805
Confusion Matrix:
 [[15 28  0]
 [ 5 28  8]
 [ 5 11 13]]


In [None]:
!pip install joblib

In [63]:
from joblib import dump
# Persist the tuned classifier and the fitted TF-IDF vectorizer. Both files are
# needed at inference time: new text must go through this same vectorizer
# before it is passed to the model.
dump(value=best_model, filename='random_forest_model.joblib')
dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']