In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "SHORT VERSION OF FOOD NUTRITUION.xlsx"  # Ensure this file is uploaded to Colab
# Open the workbook through a context manager so the underlying file handle
# is closed as soon as the sheet has been read into memory.
with pd.ExcelFile(file_path) as xls:
    df = pd.read_excel(xls, sheet_name="Sheet1")

# Data Cleaning
print("Cleaning and preprocessing data...")
# Replace missing text with empty strings so the string operations below
# never see NaN.
for text_column in ("categories_en", "traces_en"):
    df[text_column] = df[text_column].fillna("")

# Define expanded gluten and dairy keywords
allergen_keywords = [
    "gluten", "wheat", "barley", "rye", "dairy", "milk", "cheese", "butter",
    "casein", "lactose", "yogurt", "cream", "custard", "bread", "cracker", "pasta",
    "skimmed milk", "whole milk", "milk solids", "buttermilk", "margarine", "sour cream",
    "ghee", "cheddar", "mozzarella", "parmesan", "ricotta", "cottage cheese", "feta",
    "spelt", "oats", "couscous", "semolina", "farro", "malt", "seitan", "durum",
]

# One regex alternation covering every keyword. Matching is a case-insensitive
# substring search, so a keyword also hits inside longer words
# (e.g. "butter" inside "butternut").
pattern = "|".join(allergen_keywords)
category_hits = df["categories_en"].str.contains(pattern, case=False, na=False)
trace_hits = df["traces_en"].str.contains(pattern, case=False, na=False)

# Label is 1 when either text column mentions any keyword, otherwise 0.
df["allergen_label"] = (category_hits | trace_hits).astype(int)

# Define features and labels
# Target: the keyword-derived 0/1 label. Feature: both text columns joined
# with a single space into one string per row.
y = df["allergen_label"]
X = df["categories_en"] + " " + df["traces_en"]

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Create SVM and Random Forest models
# probability=True is needed because soft voting averages the estimators'
# predicted class probabilities. SVC's probability calibration shuffles the
# data internally, so it is seeded like the forest to keep runs reproducible.
svm_model = SVC(kernel="rbf", C=5.0, gamma="scale", probability=True, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Create an ensemble model using VotingClassifier
# Text is vectorised as TF-IDF over character n-grams of length 1-5 taken
# inside word boundaries, limited to 10000 features. `stop_words` is
# intentionally not passed: scikit-learn only applies it when
# analyzer == "word" and otherwise ignores it with a warning.
ensemble_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 5), max_features=10000)),
    ("classifier", VotingClassifier(estimators=[("svm", svm_model), ("rf", rf_model)], voting="soft")),
])

# Train ensemble model
print("Training ensemble model (SVM + Random Forest) and evaluating accuracy...")
# Pipeline.fit returns the fitted pipeline, so fitting on the training split
# and predicting the held-out split can be chained.
y_pred = ensemble_pipeline.fit(X_train, y_train).predict(X_test)

# Fraction of held-out rows whose predicted label equals the keyword label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Ensemble Model Accuracy: {accuracy:.4f}")

# Filter allergen-free foods using the ensemble model
# Predict on every row (training and test rows alike) using the same combined
# text that was used to build the features.
all_rows_text = df["categories_en"] + " " + df["traces_en"]
df["allergen_prediction"] = ensemble_pipeline.predict(all_rows_text)

# Keep only the rows predicted as class 0 and drop the helper prediction
# column from the exported copy (it stays on df itself).
predicted_allergen_free = df["allergen_prediction"] == 0
filtered_df = df.loc[predicted_allergen_free].drop(columns=["allergen_prediction"])

# Save filtered dataset
filtered_file_path = "filtered_allergy_free_foods_ensemble.csv"
filtered_df.to_csv(filtered_file_path, index=False)
print(f"Filtered dataset saved as {filtered_file_path}. Upload and download it from Google Colab!")


Cleaning and preprocessing data...
Training ensemble model (SVM + Random Forest) and evaluating accuracy...




Ensemble Model Accuracy: 0.9926
Filtered dataset saved as filtered_allergy_free_foods_ensemble.csv. Upload and download it from Google Colab!
