In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [3]:
# Load the dataset
data = pd.read_csv(r"C:\Users\mdhoz\Downloads\Allergen_Status_of_Food_Products.csv")


In [4]:
# Ensure the dataset has the required columns
print("Columns in dataset:", data.columns)

Columns in dataset: Index(['Food Product', 'Main Ingredient', 'Sweetener', 'Fat/Oil', 'Seasoning',
       'Allergens', 'Price ($)', 'Customer rating (Out of 5)', 'Prediction'],
      dtype='object')


In [5]:
# Check the percentage of missing values for each column
missing_percentage = data.isnull().mean() * 100
print("Percentage of Missing Values:\n", missing_percentage)


Percentage of Missing Values:
 Food Product                   0.000000
Main Ingredient                0.000000
Sweetener                     70.175439
Fat/Oil                       14.536341
Seasoning                      5.012531
Allergens                     37.092732
Price ($)                      0.000000
Customer rating (Out of 5)     0.000000
Prediction                     0.250627
dtype: float64


In [7]:
# Strategy: Drop columns with >50% missing values, fill others
threshold = 50  # Define a threshold for acceptable missing values
columns_to_drop = missing_percentage[missing_percentage > threshold].index
print("\nColumns to drop (more than 50% missing):", columns_to_drop)
# Drop columns with too many missing values
data.drop(columns=columns_to_drop, inplace=True)


Columns to drop (more than 50% missing): Index(['Sweetener'], dtype='object')


In [8]:
# Fill missing values for numerical columns with mean
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

# Fill missing values for categorical columns with mode
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

# Verify no missing values remain
print("\nMissing values after handling:")
print(data.isnull().sum())



Missing values after handling:
Food Product                  0
Main Ingredient               0
Fat/Oil                       0
Seasoning                     0
Allergens                     0
Price ($)                     0
Customer rating (Out of 5)    0
Prediction                    0
dtype: int64


In [13]:

# Split into train and test datasets
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Map the Prediction column to numerical values
if 'Prediction' in train.columns:
    train['Prediction'] = train['Prediction'].map({'Contains': 1, 'Does not contain': 0})
    test['Prediction'] = test['Prediction'].map({'Contains': 1, 'Does not contain': 0})
else:
    print("Error: 'Prediction' column is missing.")
# Identify categorical columns in the training data
categorical_columns_train = train.select_dtypes(include=['object']).columns

In [16]:
pip install category-encoders

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting category-encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
   ---------------------------------------- 0.0/82.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/82.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/82.0 kB ? eta -:--:--
   -------------- ------------------------- 30.7/82.0 kB 217.9 kB/s eta 0:00:01
   -------------- ------------------------- 30.7/82.0 kB 217.9 kB/s eta 0:00:01
   ---------------------------------------  81.9/82.0 kB 327.3 kB/s eta 0:00:01
   ---------------------------------------- 82.0/82.0 kB 255.4 kB/s eta 0:00:00
Installing collected packages: category-encoders
Successfully installed category-encoders-2.6.4


In [17]:
from category_encoders import LeaveOneOutEncoder
# Initialize the Leave-One-Out Encoder
encoder = LeaveOneOutEncoder(cols=categorical_columns_train)

# Fit the encoder on the training data and transform it
train_encoded = encoder.fit_transform(train[categorical_columns_train], train["Prediction"])

# Transform the test data using the fitted encoder
test_encoded = encoder.transform(test[categorical_columns_train])

# Add the encoded columns back to the original datasets
train = pd.concat([train.drop(categorical_columns_train, axis=1), train_encoded], axis=1)
test = pd.concat([test.drop(categorical_columns_train, axis=1), test_encoded], axis=1)

# Check the resulting datasets
print("Training set shape:", train.shape)
print("Test set shape:", test.shape)

Training set shape: (319, 8)
Test set shape: (80, 8)


In [18]:
test

Unnamed: 0,Price ($),Customer rating (Out of 5),Prediction,Food Product,Main Ingredient,Fat/Oil,Seasoning,Allergens
198,6.26,3.0,0,0.62069,0.533333,0.478992,0.620690,0.345946
349,11.87,1.3,1,1.00000,0.666667,0.927536,1.000000,1.000000
33,8.67,4.9,1,0.62069,1.000000,0.927536,0.620690,0.345946
208,5.65,3.9,0,0.62069,0.000000,0.478992,0.250000,0.345946
93,13.79,2.2,1,0.62069,0.620690,0.478992,0.620690,0.620690
...,...,...,...,...,...,...,...,...
249,7.87,4.3,0,0.62069,0.400000,0.478992,0.620690,0.345946
225,6.28,3.1,0,0.62069,0.533333,0.478992,1.000000,0.345946
368,5.16,1.3,1,1.00000,0.545455,0.478992,1.000000,1.000000
175,10.32,3.8,0,0.62069,0.666667,0.478992,0.366667,0.345946


In [19]:
# Split features and target variable for training and test sets
X_train = train.drop(columns=['Prediction'])  # Features
y_train = train['Prediction']                # Target

X_test = test.drop(columns=['Prediction'])   # Features
y_test = test['Prediction']                  # Target

# Verify the shapes of features and targets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (319, 7)
y_train shape: (319,)
X_test shape: (80, 7)
y_test shape: (80,)


In [20]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [21]:
# Initialize the XGBoost classifier
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Train the model on the training data
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [22]:
# Make predictions on training and test sets
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification reports
print("\nClassification Report - Train:\n", classification_report(y_train, y_train_pred))
print("\nClassification Report - Test:\n", classification_report(y_test, y_test_pred))

# Confusion matrices
print("\nConfusion Matrix - Train:\n", confusion_matrix(y_train, y_train_pred))
print("\nConfusion Matrix - Test:\n", confusion_matrix(y_test, y_test_pred))

Train Accuracy: 1.0000
Test Accuracy: 0.7250

Classification Report - Train:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       121
           1       1.00      1.00      1.00       198

    accuracy                           1.00       319
   macro avg       1.00      1.00      1.00       319
weighted avg       1.00      1.00      1.00       319


Classification Report - Test:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.72      1.00      0.84        58

    accuracy                           0.72        80
   macro avg       0.36      0.50      0.42        80
weighted avg       0.53      0.72      0.61        80


Confusion Matrix - Train:
 [[121   0]
 [  0 198]]

Confusion Matrix - Test:
 [[ 0 22]
 [ 0 58]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation Accuracy: {cv_scores.mean():.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation Accuracy: 1.0000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [25]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
                           param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Best Cross-validation Accuracy: 1.0


In [26]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}

# Perform Grid Search CV
grid_search = GridSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)

grid_search.fit(X_train, y_train_encoded)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Best Cross-validation Accuracy: 1.0


In [27]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the training set
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Calculate accuracy for training and test sets
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification reports for detailed performance metrics
print("\nClassification Report - Train:\n", classification_report(y_train, y_train_pred))
print("\nClassification Report - Test:\n", classification_report(y_test, y_test_pred))

# Confusion matrices for training and test sets
print("\nConfusion Matrix - Train:\n", confusion_matrix(y_train, y_train_pred))
print("\nConfusion Matrix - Test:\n", confusion_matrix(y_test, y_test_pred))

Train Accuracy: 1.0000
Test Accuracy: 0.7250

Classification Report - Train:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       121
           1       1.00      1.00      1.00       198

    accuracy                           1.00       319
   macro avg       1.00      1.00      1.00       319
weighted avg       1.00      1.00      1.00       319


Classification Report - Test:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.72      1.00      0.84        58

    accuracy                           0.72        80
   macro avg       0.36      0.50      0.42        80
weighted avg       0.53      0.72      0.61        80


Confusion Matrix - Train:
 [[121   0]
 [  0 198]]

Confusion Matrix - Test:
 [[ 0 22]
 [ 0 58]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Perform cross-validation
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='accuracy')

# Print the mean and standard deviation of cross-validation scores
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation accuracy: 1.0000 ± 0.0000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [29]:
import joblib

# Save the trained XGBoost model
joblib.dump(xgb_model, 'ngsxgb_model.pkl')

# Save the Leave-One-Out Encoder
joblib.dump(encoder, 'ngsloo_encoder.pkl')
print("model abd encoder saved succesfully")

model abd encoder saved succesfully


In [30]:
# Get feature importance as a dataframe
importance = xgb_model.get_booster().get_score(importance_type='weight')

# Convert to a dataframe for easier viewing
import pandas as pd
importance_df = pd.DataFrame(importance.items(), columns=['Feature', 'Importance'])
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display the top 10 features
print(importance_df.head(10))

                      Feature  Importance
2             Main Ingredient        76.0
5                   Allergens        30.0
1  Customer rating (Out of 5)         4.0
3                     Fat/Oil         3.0
4                   Seasoning         2.0
0                   Price ($)         1.0
