# Random Forest Model

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))  # Convert string representation to list
X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))

# Feature Engineering
X['name_length'] = X['name_tokens'].apply(len)
X['ingredient_count'] = X['ingredient_tokens'].apply(len)
X['steps_length'] = X['steps_tokens'].apply(len)

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# Random Forest Classifier for Calorie Level Prediction
rf_calorie = RandomForestClassifier(random_state=42, n_jobs=-1)  # n_jobs=-1 for parallel processing

# Fit the model
rf_calorie.fit(X_train[['name_length', 'ingredient_count', 'steps_length']], y_calorie_train)

# Evaluate the model for Calorie Level Prediction
y_calorie_pred = rf_calorie.predict(X_test[['name_length', 'ingredient_count', 'steps_length']])
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction:", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction:\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction:\n", confusion_matrix(y_calorie_test, y_calorie_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))  # Convert string representation to list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['steps_tokens'] = X['steps_t

Accuracy for Calorie Level Prediction: 0.39048607410316105

Classification Report for Calorie Level Prediction:
               precision    recall  f1-score   support

           0       0.43      0.53      0.48     13934
           1       0.38      0.37      0.38     12785
           2       0.29      0.20      0.24      8934

    accuracy                           0.39     35653
   macro avg       0.37      0.37      0.36     35653
weighted avg       0.38      0.39      0.38     35653


Confusion Matrix for Calorie Level Prediction:
 [[7344 4582 2008]
 [5680 4755 2350]
 [3904 3207 1823]]


In [15]:
# Feature Engineering for Nutritional Cluster Prediction
X['techniques_count'] = X['techniques'].apply(lambda x: eval(x)).apply(len)
X['ingredient_ids_count'] = X['ingredient_ids'].apply(lambda x: eval(x)).apply(len)

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# Random Forest Classifier for Nutritional Cluster Prediction
rf_cluster = RandomForestClassifier(random_state=42, n_jobs=-1)

# Fit the model
rf_cluster.fit(X_train[['name_length', 'ingredient_count', 'steps_length', 'techniques_count', 'ingredient_ids_count']],
               y_cluster_train)

# Evaluate the model for Nutritional Cluster Prediction
y_cluster_pred = rf_cluster.predict(X_test[['name_length', 'ingredient_count', 'steps_length', 'techniques_count', 'ingredient_ids_count']])
accuracy_cluster = accuracy_score(y_cluster_test, y_cluster_pred)

print("Accuracy for Nutritional Cluster Prediction:", accuracy_cluster)
print("\nClassification Report for Nutritional Cluster Prediction:\n", classification_report(y_cluster_test, y_cluster_pred))
print("\nConfusion Matrix for Nutritional Cluster Prediction:\n", confusion_matrix(y_cluster_test, y_cluster_pred))


Accuracy for Nutritional Cluster Prediction: 0.3035929655288475

Classification Report for Nutritional Cluster Prediction:
               precision    recall  f1-score   support

           0       0.11      0.05      0.06      3266
           1       0.28      0.29      0.28      8934
           2       0.12      0.05      0.07      3430
           3       0.36      0.48      0.41     10504
           4       0.29      0.30      0.30      9519

    accuracy                           0.30     35653
   macro avg       0.23      0.23      0.23     35653
weighted avg       0.28      0.30      0.29     35653


Confusion Matrix for Nutritional Cluster Prediction:
 [[ 148  841  161 1264  852]
 [ 372 2558  432 3037 2535]
 [ 160  937  178 1191  964]
 [ 311 2222  335 5066 2570]
 [ 324 2588  364 3369 2874]]


# SVM model 


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))

# Feature Engineering
X['name_length'] = X['name_tokens'].apply(len)
X['ingredient_count'] = X['ingredient_tokens'].apply(len)
X['steps_length'] = X['steps_tokens'].apply(len)

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# SVM Classifier for Calorie Level Prediction (Linear Kernel)
svm_calorie = SVC(kernel='linear', C=1.0, random_state=42)

# Fit the model
svm_calorie.fit(X_train[['name_length', 'ingredient_count', 'steps_length']], y_calorie_train)

# Evaluate the model for Calorie Level Prediction
y_calorie_pred = svm_calorie.predict(X_test[['name_length', 'ingredient_count', 'steps_length']])
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction (Linear SVM):", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction (Linear SVM):\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction (Linear SVM):\n", confusion_matrix(y_calorie_test, y_calorie_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))
A value 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
def tokenize_and_preprocess(column):
    return column.apply(lambda x: ' '.join(eval(x)) if isinstance(x, str) else '')

preprocessor_text = ColumnTransformer(
    transformers=[
        ('name_tokens', CountVectorizer(), 'name_tokens'),
        ('ingredient_tokens', CountVectorizer(), 'ingredient_tokens'),
        ('steps_tokens', CountVectorizer(), 'steps_tokens'),
    ])

# Custom transformer to convert 'other_features' to numeric
def convert_to_numeric(data):
    return pd.DataFrame(data, columns=['techniques', 'ingredient_ids']).applymap(lambda x: eval(x)[0])

preprocessor_other = FunctionTransformer(convert_to_numeric, validate=False)

# Combine the two transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('text', preprocessor_text, X.columns[:3]),
        ('other', preprocessor_other, X.columns[3:])
    ])

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# SVM Classifier for Calorie Level Prediction (Linear Kernel)
classifier = SVC(kernel='linear', C=1.0, random_state=42)

# Example of using a different classifier (Random Forest)
# classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

# Fit the model
pipeline.fit(X_train, y_calorie_train)

# Evaluate the model for Calorie Level Prediction
y_calorie_pred = pipeline.predict(X_test)
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction:", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction:\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction:\n", confusion_matrix(y_calorie_test, y_calorie_pred))


  return pd.DataFrame(data, columns=['techniques', 'ingredient_ids']).applymap(lambda x: eval(x)[0])


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))  # Convert string representation to list
X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))

# Feature Engineering
X['name_length'] = X['name_tokens'].apply(len)
X['ingredient_count'] = X['ingredient_tokens'].apply(len)
X['steps_length'] = X['steps_tokens'].apply(len)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['name_length', 'ingredient_count', 'steps_length']
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# SVM for Calorie Level Prediction
svm_calorie = SVC(random_state=42)

# Fit the model
svm_calorie.fit(X_train[numerical_features], y_calorie_train)

# Evaluate the model for Calorie Level Prediction
y_calorie_pred = svm_calorie.predict(X_test[numerical_features])
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction:", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction:\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction:\n", confusion_matrix(y_calorie_test, y_calorie_pred))

# SVM for Nutritional Cluster Prediction
svm_cluster = SVC(random_state=42)

# Fit the model
svm_cluster.fit(X_train[numerical_features], y_cluster_train)

# Evaluate the model for Nutritional Cluster Prediction
y_cluster_pred = svm_cluster.predict(X_test[numerical_features])
accuracy_cluster = accuracy_score(y_cluster_test, y_cluster_pred)

print("\nAccuracy for Nutritional Cluster Prediction:", accuracy_cluster)
print("\nClassification Report for Nutritional Cluster Prediction:\n", classification_report(y_cluster_test, y_cluster_pred))
print("\nConfusion Matrix for Nutritional Cluster Prediction:\n", confusion_matrix(y_cluster_test, y_cluster_pred))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))  # Convert string representation to list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['steps_tokens'] = X['steps_t

Accuracy for Calorie Level Prediction: 0.43516674613637

Classification Report for Calorie Level Prediction:
               precision    recall  f1-score   support

           0       0.46      0.67      0.55     13934
           1       0.40      0.45      0.42     12785
           2       0.44      0.05      0.08      8934

    accuracy                           0.44     35653
   macro avg       0.43      0.39      0.35     35653
weighted avg       0.43      0.44      0.39     35653


Confusion Matrix for Calorie Level Prediction:
 [[9404 4383  147]
 [6709 5701  375]
 [4333 4191  410]]

Accuracy for Nutritional Cluster Prediction: 0.3552015258183042

Classification Report for Nutritional Cluster Prediction:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      3266
           1       0.32      0.32      0.32      8934
           2       0.00      0.00      0.00      3430
           3       0.38      0.66      0.48     10504
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler  # Import oversampling library

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))

# Feature Engineering
X['name_length'] = X['name_tokens'].apply(len)
X['ingredient_count'] = X['ingredient_tokens'].apply(len)
X['steps_length'] = X['steps_tokens'].apply(len)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['name_length', 'ingredient_count', 'steps_length']
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Check for class imbalance and apply oversampling to the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_calorie_train_resampled = oversampler.fit_resample(X_train[numerical_features], y_calorie_train)

# SVM for Calorie Level Prediction with hyperparameter tuning
svm_calorie = SVC(C=1.0, kernel='rbf', random_state=42)  # Add hyperparameters for tuning

# Fit the model using resampled data
svm_calorie.fit(X_train_resampled, y_calorie_train_resampled)

# Evaluate the model for Calorie Level Prediction using cross-validation
cv_accuracy_calorie = cross_val_score(svm_calorie, X_train_resampled, y_calorie_train_resampled, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy for Calorie Level Prediction:", cv_accuracy_calorie.mean())

# Evaluate the model on the test set
y_calorie_pred = svm_calorie.predict(X_test[numerical_features])
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction:", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction:\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction:\n", confusion_matrix(y_calorie_test, y_calorie_pred))

# SVM for Nutritional Cluster Prediction with hyperparameter tuning
svm_cluster = SVC(C=1.0, kernel='rbf', random_state=42)  # Add hyperparameters for tuning

# Fit the model
svm_cluster.fit(X_train_resampled, y_cluster_train)  # Use resampled data for training

# Evaluate the model for Nutritional Cluster Prediction using cross-validation
cv_accuracy_cluster = cross_val_score(svm_cluster, X_train_resampled, y_cluster_train, cv=5, scoring='accuracy')
print("\nCross-Validation Accuracy for Nutritional Cluster Prediction:", cv_accuracy_cluster.mean())

# Evaluate the model on the test set
y_cluster_pred = svm_cluster.predict(X_test[numerical_features])
accuracy_cluster = accuracy_score(y_cluster_test, y_cluster_pred)

print("\nAccuracy for Nutritional Cluster Prediction:", accuracy_cluster)
print("\nClassification Report for Nutritional Cluster Prediction:\n", classification_report(y_cluster_test, y_cluster_pred))
print("\nConfusion Matrix for Nutritional Cluster Prediction:\n", confusion_matrix(y_cluster_test, y_cluster_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))
A value 

Cross-Validation Accuracy for Calorie Level Prediction: 0.40380166771272297
Accuracy for Calorie Level Prediction: 0.410933161304799

Classification Report for Calorie Level Prediction:
               precision    recall  f1-score   support

           0       0.48      0.54      0.51     13934
           1       0.39      0.30      0.34     12785
           2       0.32      0.36      0.34      8934

    accuracy                           0.41     35653
   macro avg       0.40      0.40      0.40     35653
weighted avg       0.41      0.41      0.41     35653


Confusion Matrix for Calorie Level Prediction:
 [[7585 3544 2805]
 [5033 3842 3910]
 [3166 2544 3224]]


ValueError: Found input variables with inconsistent numbers of samples: [167295, 142612]

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler  # Import oversampling library

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))

# Feature Engineering
X['name_length'] = X['name_tokens'].apply(len)
X['ingredient_count'] = X['ingredient_tokens'].apply(len)
X['steps_length'] = X['steps_tokens'].apply(len)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['name_length', 'ingredient_count', 'steps_length']
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# Check for class imbalance and apply oversampling to the training set for both predictions
oversampler_calorie = RandomOverSampler(random_state=42)
X_train_resampled_calorie, y_calorie_train_resampled = oversampler_calorie.fit_resample(
    X_train[numerical_features], y_calorie_train
)

oversampler_cluster = RandomOverSampler(random_state=42)
X_train_resampled_cluster, y_cluster_train_resampled = oversampler_cluster.fit_resample(
    X_train[numerical_features], y_cluster_train
)

# SVM for Calorie Level Prediction with hyperparameter tuning
svm_calorie = SVC(C=1.0, kernel='rbf', random_state=42)  # Add hyperparameters for tuning

# Fit the model using resampled data
svm_calorie.fit(X_train_resampled_calorie, y_calorie_train_resampled)

# Evaluate the model for Calorie Level Prediction using cross-validation
cv_accuracy_calorie = cross_val_score(
    svm_calorie, X_train_resampled_calorie, y_calorie_train_resampled, cv=5, scoring='accuracy'
)
print("Cross-Validation Accuracy for Calorie Level Prediction:", cv_accuracy_calorie.mean())

# Evaluate the model on the test set
y_calorie_pred = svm_calorie.predict(X_test[numerical_features])
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction:", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction:\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction:\n", confusion_matrix(y_calorie_test, y_calorie_pred))

# SVM for Nutritional Cluster Prediction with hyperparameter tuning
svm_cluster = SVC(C=1.0, kernel='rbf', random_state=42)  # Add hyperparameters for tuning

# Fit the model using resampled data for nutritional cluster prediction
svm_cluster.fit(X_train_resampled_cluster, y_cluster_train_resampled)

# Evaluate the model for Nutritional Cluster Prediction using cross-validation
cv_accuracy_cluster = cross_val_score(
    svm_cluster, X_train_resampled_cluster, y_cluster_train_resampled, cv=5, scoring='accuracy'
)
print("\nCross-Validation Accuracy for Nutritional Cluster Prediction:", cv_accuracy_cluster.mean())

# Evaluate the model on the test set
y_cluster_pred = svm_cluster.predict(X_test[numerical_features])
accuracy_cluster = accuracy_score(y_cluster_test, y_cluster_pred)

print("\nAccuracy for Nutritional Cluster Prediction:", accuracy_cluster)
print("\nClassification Report for Nutritional Cluster Prediction:\n", classification_report(y_cluster_test, y_cluster_pred))
print("\nConfusion Matrix for Nutritional Cluster Prediction:\n", confusion_matrix(y_cluster_test, y_cluster_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))
A value 

Cross-Validation Accuracy for Calorie Level Prediction: 0.40380166771272297
Accuracy for Calorie Level Prediction: 0.410933161304799

Classification Report for Calorie Level Prediction:
               precision    recall  f1-score   support

           0       0.48      0.54      0.51     13934
           1       0.39      0.30      0.34     12785
           2       0.32      0.36      0.34      8934

    accuracy                           0.41     35653
   macro avg       0.40      0.40      0.40     35653
weighted avg       0.41      0.41      0.41     35653


Confusion Matrix for Calorie Level Prediction:
 [[7585 3544 2805]
 [5033 3842 3910]
 [3166 2544 3224]]


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler  # Import oversampling library

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))

# Feature Engineering
X['name_length'] = X['name_tokens'].apply(len)
X['ingredient_count'] = X['ingredient_tokens'].apply(len)
X['steps_length'] = X['steps_tokens'].apply(len)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['name_length', 'ingredient_count', 'steps_length']
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# Check for class imbalance and apply oversampling to the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_calorie_train_resampled = oversampler.fit_resample(
    X_train[numerical_features], y_calorie_train
)
y_cluster_train_resampled = oversampler.fit_resample(y_cluster_train)

# SVM Hyperparameter Tuning using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_calorie = SVC(random_state=42)
grid_search_calorie = GridSearchCV(svm_calorie, param_grid, cv=5, scoring='accuracy')
grid_search_calorie.fit(X_train_resampled, y_calorie_train_resampled)

# Get the best estimator
best_svm_calorie = grid_search_calorie.best_estimator_

# Evaluate the model for Calorie Level Prediction using cross-validation
cv_accuracy_calorie = cross_val_score(
    best_svm_calorie, X_train_resampled, y_calorie_train_resampled, cv=5, scoring='accuracy'
)
print("Cross-Validation Accuracy for Calorie Level Prediction:", cv_accuracy_calorie.mean())

# Evaluate the model on the test set
y_calorie_pred = best_svm_calorie.predict(X_test[numerical_features])
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction:", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction:\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction:\n", confusion_matrix(y_calorie_test, y_calorie_pred))

# SVM Hyperparameter Tuning for Nutritional Cluster Prediction using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_cluster = SVC(random_state=42)
grid_search_cluster = GridSearchCV(svm_cluster, param_grid, cv=5, scoring='accuracy')
grid_search_cluster.fit(X_train_resampled, y_cluster_train_resampled)

# Get the best estimator
best_svm_cluster = grid_search_cluster.best_estimator_

# Evaluate the model for Nutritional Cluster Prediction using cross-validation
cv_accuracy_cluster = cross_val_score(
    best_svm_cluster, X_train_resampled, y_cluster_train_resampled, cv=5, scoring='accuracy'
)
print("\nCross-Validation Accuracy for Nutritional Cluster Prediction:", cv_accuracy_cluster.mean())

# Evaluate the model on the test set
y_cluster_pred = best_svm_cluster.predict(X_test[numerical_features])
accuracy_cluster = accuracy_score(y_cluster_test, y_cluster_pred)

print("\nAccuracy for Nutritional Cluster Prediction:", accuracy_cluster)
print("\nClassification Report for Nutritional Cluster Prediction:\n", classification_report(y_cluster_test, y_cluster_pred))
print("\nConfusion Matrix for Nutritional Cluster Prediction:\n", confusion_matrix(y_cluster_test, y_cluster_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))
A value 

TypeError: BaseSampler.fit_resample() missing 1 required positional argument: 'y'

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler  # Import oversampling library

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))

# Feature Engineering
X['name_length'] = X['name_tokens'].apply(len)
X['ingredient_count'] = X['ingredient_tokens'].apply(len)
X['steps_length'] = X['steps_tokens'].apply(len)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['name_length', 'ingredient_count', 'steps_length']
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# Check for class imbalance and apply oversampling to the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_calorie_train_resampled = oversampler.fit_resample(
    X_train[numerical_features], y_calorie_train
)
X_train_resampled, y_cluster_train_resampled = oversampler.fit_resample(
    X_train_resampled, y_cluster_train
)

# SVM Hyperparameter Tuning using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_calorie = SVC(random_state=42)
grid_search_calorie = GridSearchCV(svm_calorie, param_grid, cv=5, scoring='accuracy')
grid_search_calorie.fit(X_train_resampled, y_calorie_train_resampled)

# Get the best estimator
best_svm_calorie = grid_search_calorie.best_estimator_

# Evaluate the model for Calorie Level Prediction using cross-validation
cv_accuracy_calorie = cross_val_score(
    best_svm_calorie, X_train_resampled, y_calorie_train_resampled, cv=5, scoring='accuracy'
)
print("Cross-Validation Accuracy for Calorie Level Prediction:", cv_accuracy_calorie.mean())

# Evaluate the model on the test set
y_calorie_pred = best_svm_calorie.predict(X_test[numerical_features])
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction:", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction:\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction:\n", confusion_matrix(y_calorie_test, y_calorie_pred))

# SVM Hyperparameter Tuning for Nutritional Cluster Prediction using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_cluster = SVC(random_state=42)
grid_search_cluster = GridSearchCV(svm_cluster, param_grid, cv=5, scoring='accuracy')
grid_search_cluster.fit(X_train_resampled, y_cluster_train_resampled)

# Get the best estimator
best_svm_cluster = grid_search_cluster.best_estimator_

# Evaluate the model for Nutritional Cluster Prediction using cross-validation
cv_accuracy_cluster = cross_val_score(
    best_svm_cluster, X_train_resampled, y_cluster_train_resampled, cv=5, scoring='accuracy'
)
print("\nCross-Validation Accuracy for Nutritional Cluster Prediction:", cv_accuracy_cluster.mean())

# Evaluate the model on the test set
y_cluster_pred = best_svm_cluster.predict(X_test[numerical_features])
accuracy_cluster = accuracy_score(y_cluster_test, y_cluster_pred)

print("\nAccuracy for Nutritional Cluster Prediction:", accuracy_cluster)
print("\nClassification Report for Nutritional Cluster Prediction:\n", classification_report(y_cluster_test, y_cluster_pred))
print("\nConfusion Matrix for Nutritional Cluster Prediction:\n", confusion_matrix(y_cluster_test, y_cluster_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))
A value 

ValueError: Length mismatch: Expected axis has 210875 elements, new values have 235558 elements

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

# Load the data
data = pd.read_csv('nutritional_clustered_data.csv')

# Extract features (X) and target variables (y)
X = data[['name_tokens', 'ingredient_tokens', 'steps_tokens', 'techniques', 'ingredient_ids']]
y_calorie_level = data['calorie_level']
y_nutritional_cluster = data['nutritional_cluster']

# Tokenization and Preprocessing
X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))

# Feature Engineering
X['name_length'] = X['name_tokens'].apply(len)
X['ingredient_count'] = X['ingredient_tokens'].apply(len)
X['steps_length'] = X['steps_tokens'].apply(len)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['name_length', 'ingredient_count', 'steps_length']
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Split the data into training and testing sets
X_train, X_test, y_calorie_train, y_calorie_test, y_cluster_train, y_cluster_test = train_test_split(
    X, y_calorie_level, y_nutritional_cluster, test_size=0.2, random_state=42
)

# Check for class imbalance and apply oversampling to the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_calorie_train_resampled = oversampler.fit_resample(
    X_train[numerical_features], y_calorie_train
)
_, y_cluster_train_resampled = oversampler.fit_resample(
    X_train[numerical_features], y_cluster_train
)

# SVM Hyperparameter Tuning using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_calorie = SVC(random_state=42)
grid_search_calorie = GridSearchCV(svm_calorie, param_grid, cv=5, scoring='accuracy')
grid_search_calorie.fit(X_train_resampled, y_calorie_train_resampled)

# Get the best estimator
best_svm_calorie = grid_search_calorie.best_estimator_

# Evaluate the model for Calorie Level Prediction using cross-validation
cv_accuracy_calorie = cross_val_score(
    best_svm_calorie, X_train_resampled, y_calorie_train_resampled, cv=5, scoring='accuracy'
)
print("Cross-Validation Accuracy for Calorie Level Prediction:", cv_accuracy_calorie.mean())

# Evaluate the model on the test set
y_calorie_pred = best_svm_calorie.predict(X_test[numerical_features])
accuracy_calorie = accuracy_score(y_calorie_test, y_calorie_pred)

print("Accuracy for Calorie Level Prediction:", accuracy_calorie)
print("\nClassification Report for Calorie Level Prediction:\n", classification_report(y_calorie_test, y_calorie_pred))
print("\nConfusion Matrix for Calorie Level Prediction:\n", confusion_matrix(y_calorie_test, y_calorie_pred))

# SVM Hyperparameter Tuning for Nutritional Cluster Prediction using GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_cluster = SVC(random_state=42)
grid_search_cluster = GridSearchCV(svm_cluster, param_grid, cv=5, scoring='accuracy')
grid_search_cluster.fit(X_train_resampled, y_cluster_train_resampled)

# Get the best estimator
best_svm_cluster = grid_search_cluster.best_estimator_

# Evaluate the model for Nutritional Cluster Prediction using cross-validation
cv_accuracy_cluster = cross_val_score(
    best_svm_cluster, X_train_resampled, y_cluster_train_resampled, cv=5, scoring='accuracy'
)
print("\nCross-Validation Accuracy for Nutritional Cluster Prediction:", cv_accuracy_cluster.mean())

# Evaluate the model on the test set
y_cluster_pred = best_svm_cluster.predict(X_test[numerical_features])
accuracy_cluster = accuracy_score(y_cluster_test, y_cluster_pred)

print("\nAccuracy for Nutritional Cluster Prediction:", accuracy_cluster)
print("\nClassification Report for Nutritional Cluster Prediction:\n", classification_report(y_cluster_test, y_cluster_pred))
print("\nConfusion Matrix for Nutritional Cluster Prediction:\n", confusion_matrix(y_cluster_test, y_cluster_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['name_tokens'] = X['name_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['ingredient_tokens'] = X['ingredient_tokens'].apply(lambda x: eval(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['steps_tokens'] = X['steps_tokens'].apply(lambda x: eval(x))
A value 

KeyboardInterrupt: 