In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import RandomOverSampler

# Step 1: Load and preprocess dataset
df = pd.read_csv("../data/recipes.csv")

# Clean text data (example cleaning function)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    return text

df['cleaned_ingredients'] = df['ingredients'].apply(clean_text)

# Vectorize the ingredients
vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
X = vectorizer.fit_transform(df['cleaned_ingredients'])
y = df['recipe_name']

# Step 2: Filter out classes with fewer than 2 samples
min_samples = 2  # Minimum samples required in a class
class_counts = pd.Series(y).value_counts()
valid_classes = class_counts[class_counts >= min_samples].index
mask = y.isin(valid_classes)

# Apply the mask to X and y
y = y[mask]
X = X[mask.to_numpy()]  # Convert mask to numpy for sparse matrix compatibility

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Resampling with error handling
def resample_with_error_handling(X_train, y_train):
    """
    Resample the dataset using SMOTEENN with custom configurations. If errors are encountered,
    problematic classes will be dynamically removed.
    """
    smote = SMOTE(k_neighbors=1, random_state=42)  # Use k_neighbors=1 to handle minority classes
    enn = EditedNearestNeighbours(n_neighbors=1)  # Use n_neighbors=1 for the ENN step
    smote_enn = SMOTEENN(smote=smote, enn=enn, random_state=42)

    while True:
        try:
            # Attempt to resample
            X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
            return X_resampled, y_resampled
        except ValueError as e:
            # Handle the ValueError caused by rare classes
            if "Expected n_neighbors <= n_samples_fit" in str(e):
                print("Rare classes detected. Removing problematic classes...")
                # Identify and remove classes with fewer than 2 samples
                class_counts = pd.Series(y_train).value_counts()
                valid_classes = class_counts[class_counts >= 2].index
                mask = y_train.isin(valid_classes)

                # Update X_train and y_train
                y_train = y_train[mask]
                X_train = X_train[mask.to_numpy()]
            else:
                # Raise other unexpected errors
                raise e

# Apply resampling with error handling
X_resampled, y_resampled = resample_with_error_handling(X_train, y_train)

# Step 4: Print summary of resampling
print("Data resampling with SMOTEENN complete!")
print(f"Original training dataset size: {len(y_train)}")
print(f"Resampled training dataset size: {len(y_resampled)}")
print(f"Number of unique classes after resampling: {len(pd.Series(y_resampled).value_counts())}")

# Continue with the rest of your model training pipeline...

Rare classes detected. Removing problematic classes...
Data resampling with SMOTEENN complete!
Original training dataset size: 152
Resampled training dataset size: 260
Number of unique classes after resampling: 52
