In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("preprocessed_data.csv")

In [3]:
# Split data into features and target variable (if applicable)
X = df.drop(columns=['Is_Allergen'])
y = df['Is_Allergen']

# Shape of features and target variables
print("Shape of features variable:",X.shape)
print("Shape of target variable:",y.shape)
print()

# Split the data into train and test sets 80:20 ratio
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X, y, test_size=0.2, random_state=42)

# Displaying the shapes
print("Shape of variable in 80:20 split:")
print("\tShape of X_train variable:",X_train_80.shape)
print("\tShape of X_test variable:",X_test_80.shape)
print("\tShape of y_train variable:",y_train_80.shape)
print("\tShape of y_test variable:",y_test_80.shape)
print()

# Split the data into train and test sets 70:30 ratio
X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(X, y, test_size=0.3, random_state=42)

# Displaying the shapes
print("Shape of variable in 70:30 split")
print("\tShape of X_train variable:",X_train_70.shape)
print("\tShape of X_test variable:",X_test_70.shape)
print("\tShape of y_train variable:",y_train_70.shape)
print("\tShape of y_test variable:",y_test_70.shape)


Shape of features variable: (398, 8)
Shape of target variable: (398,)

Shape of variable in 80:20 split:
	Shape of X_train variable: (318, 8)
	Shape of X_test variable: (80, 8)
	Shape of y_train variable: (318,)
	Shape of y_test variable: (80,)

Shape of variable in 70:30 split
	Shape of X_train variable: (278, 8)
	Shape of X_test variable: (120, 8)
	Shape of y_train variable: (278,)
	Shape of y_test variable: (120,)


In [4]:
### LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Applying Scaling on the Dataset
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_70)
X_test_scaled = scaler.transform(X_test_70)

# LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train_70)

# Prediction
y_pred = lr.predict(X_test_scaled)
y_train_pred = lr.predict(X_train_scaled)

# Test Accuracy
acc = accuracy_score(y_test_70, y_pred)
print("Logistic Regression model test accuracy (in %):", acc * 100)

# Train Accuracy
acc = accuracy_score(y_train_70, y_train_pred)
print("Logistic Regression model train accuracy (in %):", acc * 100)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test_70, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_70, y_pred))

Logistic Regression model test accuracy (in %): 61.66666666666667
Logistic Regression model train accuracy (in %): 58.63309352517986

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.84      0.76        87
           1       0.07      0.03      0.04        33

    accuracy                           0.62       120
   macro avg       0.38      0.43      0.40       120
weighted avg       0.52      0.62      0.56       120


Confusion Matrix:
[[73 14]
 [32  1]]


In [5]:
### DECISION TREE

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train_70, y_train_70)

# Prediction
y_pred = dt.predict(X_test_70)
y_train_pred = dt.predict(X_train_70)

# Test Accuracy
acc = accuracy_score(y_test_70, y_pred)
print("Decision Tree model test accuracy (in %):", acc * 100)

# Train Accuracy
acc = accuracy_score(y_train_70, y_train_pred)
print("Decision Tree model train accuracy (in %):", acc * 100)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test_70, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_70, y_pred))

Decision Tree model test accuracy (in %): 95.0
Decision Tree model train accuracy (in %): 100.0

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        87
           1       0.94      0.88      0.91        33

    accuracy                           0.95       120
   macro avg       0.95      0.93      0.94       120
weighted avg       0.95      0.95      0.95       120


Confusion Matrix:
[[85  2]
 [ 4 29]]


In [6]:
### RANDOM FOREST


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_70, y_train_70)

# Prediction
y_pred = rf.predict(X_test_70)
y_train_pred = rf.predict(X_train_70)

# Accuracy
acc = accuracy_score(y_test_70, y_pred)
print("Random Forest model test accuracy (in %):", acc * 100)

# Train Accuracy
acc = accuracy_score(y_train_70, y_train_pred)
print("Random Forest model train accuracy (in %):", acc * 100)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test_70, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_70, y_pred))


Random Forest model test accuracy (in %): 98.33333333333333
Random Forest model train accuracy (in %): 100.0

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        87
           1       0.97      0.97      0.97        33

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120


Confusion Matrix:
[[86  1]
 [ 1 32]]


In [7]:
### XGBOOST

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train_70, y_train_70)

# Prediction
y_pred = xgb.predict(X_test_70)
y_train_pred = xgb.predict(X_train_70)

# Test Accuracy
acc = accuracy_score(y_test_70, y_pred)
print("XGBoost model test accuracy (in %):", acc * 100)

# Train Accuracy
acc = accuracy_score(y_train_70, y_train_pred)
print("XGBoost model train accuracy (in %):", acc * 100)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test_70, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_70, y_pred))


XGBoost model test accuracy (in %): 98.33333333333333
XGBoost model train accuracy (in %): 100.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        87
           1       0.94      1.00      0.97        33

    accuracy                           0.98       120
   macro avg       0.97      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120


Confusion Matrix:
[[85  2]
 [ 0 33]]


In [8]:
### KNN



from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)  # Optional: adjust n_neighbors as needed
knn.fit(X_train_70, y_train_70)

# Prediction
y_pred = knn.predict(X_test_70)
y_train_pred = knn.predict(X_train_70)

# Test Accuracy
acc = accuracy_score(y_test_70, y_pred)
print("KNN model Test accuracy (in %):", acc * 100)

# Train Accuracy
acc = accuracy_score(y_train_70, y_train_pred)
print("KNN model Train accuracy (in %):", acc * 100)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test_70, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_70, y_pred))

KNN model Test accuracy (in %): 62.5
KNN model Train accuracy (in %): 72.66187050359713

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.60      0.70        87
           1       0.40      0.70      0.51        33

    accuracy                           0.62       120
   macro avg       0.62      0.65      0.60       120
weighted avg       0.72      0.62      0.65       120


Confusion Matrix:
[[52 35]
 [10 23]]


In [9]:
### NAVIE BAYES


from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Gaussian Naive Bayes
nb = GaussianNB()
nb.fit(X_train_70, y_train_70)

# Prediction
y_pred = nb.predict(X_test_70)
y_train_pred = nb.predict(X_train_70)

# Test Accuracy
acc = accuracy_score(y_test_70, y_pred)
print("Naive Bayes model Test accuracy (in %):", acc * 100)

# Train Accuracy
acc = accuracy_score(y_train_70, y_train_pred)
print("Naive Bayes model Train accuracy (in %):", acc * 100)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test_70, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_70, y_pred))

Naive Bayes model Test accuracy (in %): 88.33333333333333
Naive Bayes model Train accuracy (in %): 89.92805755395683

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.86      0.91        87
           1       0.72      0.94      0.82        33

    accuracy                           0.88       120
   macro avg       0.85      0.90      0.87       120
weighted avg       0.90      0.88      0.89       120


Confusion Matrix:
[[75 12]
 [ 2 31]]


In [10]:
### REFINE TUNING FOR XGBOOST



from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.3, 0.6, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3]
}

# Initialize the model
xgb = XGBClassifier()

# RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid,
                                   n_iter=50, scoring='accuracy', cv=3, verbose=1, n_jobs=-1, random_state=42)

# Fit the model
random_search.fit(X_train_70, y_train_70)

# Best parameters
print("Best Parameters from Randomized Search:", random_search.best_params_)

# Best model
best_xgb = random_search.best_estimator_

# Predictions with the best model
y_pred_best = best_xgb.predict(X_test_70)
y_train_pred_best = best_xgb.predict(X_train_70)

# Test Accuracy with the best model
acc_best = accuracy_score(y_test_70, y_pred_best)
print("Tuned XGBoost model test accuracy (in %):", acc_best * 100)

# Train Accuracy with the best model
acc_best = accuracy_score(y_train_70, y_train_pred_best)
print("Tuned XGBoost model train accuracy (in %):", acc_best * 100)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters from Randomized Search: {'subsample': 1.0, 'n_estimators': 50, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0.3, 'colsample_bytree': 1.0}
Tuned XGBoost model test accuracy (in %): 98.33333333333333
Tuned XGBoost model train accuracy (in %): 99.64028776978418


In [11]:
import joblib
# Save the XGBoost model with joblib
joblib.dump(best_xgb, 'Allergen_detection.pkl')


['Allergen_detection.pkl']