In [1]:
import pandas as pd

In [2]:
# Load the raw allergen dataset into the notebook-wide `data` frame.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run on another machine until it is pointed elsewhere.
dataset_path = r"C:\Users\mdhoz\Downloads\Allergen_Status_of_Food_Products.csv"
data = pd.read_csv(dataset_path)


In [3]:
# Peek at the top rows to sanity-check the load and the column layout.
preview_rows = data.head()
print("Dataset Preview:", preview_rows, sep="\n")

Dataset Preview:
          Food Product Main Ingredient Sweetener Fat/Oil Seasoning  \
0       Almond Cookies         Almonds     Sugar  Butter     Flour   
1       Almond Cookies         Almonds     Sugar  Butter     Flour   
2  Chicken Noodle Soup   Chicken broth       NaN     NaN      Salt   
3  Chicken Noodle Soup   Chicken broth       NaN     NaN      Salt   
4       Cheddar Cheese          Cheese       NaN     NaN      Salt   

                Allergens  Price ($)  Customer rating (Out of 5) Prediction  
0   Almonds, Wheat, Dairy      10.15                         3.1   Contains  
1   Almonds, Wheat, Dairy       6.17                         4.5   Contains  
2  Chicken, Wheat, Celery      19.65                         4.1   Contains  
3  Chicken, Wheat, Celery      17.48                         4.7   Contains  
4                   Dairy      10.83                         3.7   Contains  


In [4]:
# Display dataset info (index range, per-column non-null counts and dtypes).
print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
data.info()


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Food Product                399 non-null    object 
 1   Main Ingredient             399 non-null    object 
 2   Sweetener                   119 non-null    object 
 3   Fat/Oil                     341 non-null    object 
 4   Seasoning                   379 non-null    object 
 5   Allergens                   251 non-null    object 
 6   Price ($)                   399 non-null    float64
 7   Customer rating (Out of 5)  399 non-null    float64
 8   Prediction                  398 non-null    object 
dtypes: float64(2), object(7)
memory usage: 28.2+ KB
None


In [5]:
# Count the missing (NaN) entries in every column.
null_counts = data.isna().sum()
print("\nMissing Values:")
print(null_counts)


Missing Values:
Food Product                    0
Main Ingredient                 0
Sweetener                     280
Fat/Oil                        58
Seasoning                      20
Allergens                     148
Price ($)                       0
Customer rating (Out of 5)      0
Prediction                      1
dtype: int64


In [6]:
# Share of missing entries per column, expressed in percent.
missing_percentage = data.isna().mean().mul(100)
print("Percentage of Missing Values:\n", missing_percentage)

# Policy: columns that are more than `threshold` percent empty are removed
# outright; every remaining gap is imputed below.
threshold = 50
columns_to_drop = missing_percentage.index[missing_percentage > threshold]
print("\nColumns to drop (more than 50% missing):", columns_to_drop)

# Rebind `data` to the frame without the sparse columns.
data = data.drop(columns=columns_to_drop)

# Numeric gaps -> column mean.
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
numeric_means = data[numerical_cols].mean()
data[numerical_cols] = data[numerical_cols].fillna(numeric_means)

# Text (object) gaps -> most frequent value of the column; the first row of
# .mode() holds one mode per column.
# NOTE(review): this also imputes 'Allergens' (about 37% missing) and the
# target 'Prediction' (1 missing label) with their most common value --
# confirm that is intended rather than e.g. dropping the unlabeled row.
categorical_cols = data.select_dtypes(include=['object']).columns
most_frequent = data[categorical_cols].mode().iloc[0]
data[categorical_cols] = data[categorical_cols].fillna(most_frequent)

# Confirm that nothing is left unfilled.
print("\nMissing values after handling:")
print(data.isna().sum())

Percentage of Missing Values:
 Food Product                   0.000000
Main Ingredient                0.000000
Sweetener                     70.175439
Fat/Oil                       14.536341
Seasoning                      5.012531
Allergens                     37.092732
Price ($)                      0.000000
Customer rating (Out of 5)     0.000000
Prediction                     0.250627
dtype: float64

Columns to drop (more than 50% missing): Index(['Sweetener'], dtype='object')

Missing values after handling:
Food Product                  0
Main Ingredient               0
Fat/Oil                       0
Seasoning                     0
Allergens                     0
Price ($)                     0
Customer rating (Out of 5)    0
Prediction                    0
dtype: int64


In [8]:
from sklearn.preprocessing import LabelEncoder

# Collect the names of all columns still stored with the generic object dtype;
# the encoding loop in the next cell iterates over this index.
object_frame = data.select_dtypes(include=['object'])
categorical_cols = object_frame.columns
print("Categorical Columns:", categorical_cols)


Categorical Columns: Index([], dtype='object')


In [9]:
# Replace every categorical column with integer codes, fitting one
# LabelEncoder per column. The fitted encoders are kept, keyed by column
# name, so the codes can be mapped back to the original strings later.
label_encoders = {}
for column_name in categorical_cols:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(data[column_name])
    data[column_name] = encoded_values
    label_encoders[column_name] = encoder

In [10]:
# Show the first 100 rows to eyeball the result of the encoding step.
encoded_preview = data.head(100)
print("\nEncoded Dataset Preview:", encoded_preview, sep="\n")


Encoded Dataset Preview:
    Food Product  Main Ingredient  Fat/Oil  Seasoning  Allergens  Price ($)  \
0              0                0        3         57          0      10.15   
1              0                0        3         57          0       6.17   
2             71               23       22        138          1      19.65   
3             71               23       22        138          1      17.48   
4             57               20       22        138          2      10.83   
..           ...              ...      ...        ...        ...        ...   
95           107               47       22        174          9       6.64   
96           101               63        3          9          9       6.14   
97           161               34        3        142         10      16.82   
98            96               34       22         22         10       8.21   
99            96               34       22         22         10       8.85   

    Customer rating (Out 

In [11]:
# List each column's dtype so any column that is still non-numeric stands out.
column_dtypes = data.dtypes
print("\nData Types After Encoding:", column_dtypes, sep="\n")


Data Types After Encoding:
Food Product                    int32
Main Ingredient                 int32
Fat/Oil                         int32
Seasoning                       int32
Allergens                       int32
Price ($)                     float64
Customer rating (Out of 5)    float64
Prediction                      int32
dtype: object


In [12]:
# Count how many rows fall into each class of the target column.
target_column = 'Prediction'  # Replace with the actual target column name
class_counts = data[target_column].value_counts()
print("Class Distribution in Target Column:")
print(class_counts)

Class Distribution in Target Column:
Prediction
0    256
1    143
Name: count, dtype: int64


In [13]:
# Class-imbalance weight: number of rows labelled 0 divided by the number of
# rows labelled 1 in the encoded 'Prediction' column.
prediction_labels = data['Prediction']
class0_rows = int((prediction_labels == 0).sum())
class1_rows = int((prediction_labels == 1).sum())
scale_pos_weight = class0_rows / class1_rows
print(f"Scale Pos Weight: {scale_pos_weight}")

# Hand the ratio to the XGBoost classifier at construction time.
import xgboost as xgb

xgb_model = xgb.XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
)

Scale Pos Weight: 1.7902097902097902


In [14]:
print("Data Types:", data.dtypes, sep="\n")

# Guard: stop here if any object-dtype (string) column survived the encoding.
leftover_object_columns = data.select_dtypes(include=['object']).columns
assert len(leftover_object_columns) == 0, "Categorical columns are still present!"


Data Types:
Food Product                    int32
Main Ingredient                 int32
Fat/Oil                         int32
Seasoning                       int32
Allergens                       int32
Price ($)                     float64
Customer rating (Out of 5)    float64
Prediction                      int32
dtype: object


In [15]:
from sklearn.preprocessing import StandardScaler

# Identify the continuous (float) columns to standardise.
# Only float64 is selected on purpose: the label-encoded columns -- including
# the target 'Prediction' -- hold integer category codes whose width depends
# on the platform (int32 in this run, int64 elsewhere). Selecting 'int64' as
# well would standardise those codes and the target on some machines but not
# on others, silently changing the processed dataset.
numerical_cols = data.select_dtypes(include=['float64']).columns

# Standardise each selected column to zero mean and unit variance; the fitted
# scaler is kept so the same transform can be re-applied or inverted later.
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

In [16]:
# Flag outliers per numeric column: a value counts as an outlier when it lies
# more than 1.5 interquartile ranges below the first quartile or above the
# third quartile.
for col in numerical_cols:
    column_values = data[col]
    q_low = column_values.quantile(0.25)
    q_high = column_values.quantile(0.75)
    spread = q_high - q_low
    lower_fence = q_low - 1.5 * spread
    upper_fence = q_high + 1.5 * spread
    is_outlier = (column_values < lower_fence) | (column_values > upper_fence)
    outliers = data[is_outlier]
    print(f"Outliers detected in {col}: {len(outliers)}")

Outliers detected in Price ($): 0
Outliers detected in Customer rating (Out of 5): 0


In [17]:
# Pairwise correlation between all columns (features and target).
correlation_matrix = data.corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Drop features with correlation > 0.9
threshold = 0.9

# Only feature-vs-feature correlations are considered. The target column
# 'Prediction' is excluded so that a feature that tracks the target closely
# can never cause the target itself to be dropped (the following cell needs
# that column to build X and y).
feature_names = [name for name in correlation_matrix.columns if name != 'Prediction']
abs_feature_corr = correlation_matrix.loc[feature_names, feature_names].abs()

# For every highly correlated pair keep the feature that appears first and
# mark only the later one for removal; flagging both members of the pair
# would discard the shared information entirely.
correlated_features = set()
for position, kept_name in enumerate(feature_names):
    if kept_name in correlated_features:
        continue  # already scheduled for removal; do not use it as a reference
    for candidate_name in feature_names[position + 1:]:
        if abs_feature_corr.loc[kept_name, candidate_name] > threshold:
            correlated_features.add(candidate_name)

data = data.drop(columns=list(correlated_features))
print(f"Features dropped due to high correlation: {correlated_features}")

Correlation Matrix:
                            Food Product  Main Ingredient   Fat/Oil  \
Food Product                    1.000000         0.523192  0.175821   
Main Ingredient                 0.523192         1.000000  0.116703   
Fat/Oil                         0.175821         0.116703  1.000000   
Seasoning                       0.086062        -0.002207  0.241896   
Allergens                      -0.007044         0.077920 -0.411744   
Price ($)                      -0.010911        -0.074400  0.005767   
Customer rating (Out of 5)     -0.085068         0.004258 -0.072814   
Prediction                      0.048272        -0.094763  0.334646   

                            Seasoning  Allergens  Price ($)  \
Food Product                 0.086062  -0.007044  -0.010911   
Main Ingredient             -0.002207   0.077920  -0.074400   
Fat/Oil                      0.241896  -0.411744   0.005767   
Seasoning                    1.000000  -0.154692   0.015911   
Allergens                

In [18]:
# Separate the target column from the predictors.
# 'Prediction' is the target here; replace it with your actual target column.
y = data['Prediction']
X = data.drop('Prediction', axis=1)

print("Features Shape:", X.shape)
print("Target Shape:", y.shape)

Features Shape: (399, 7)
Target Shape: (399,)


In [20]:
# Write the processed frame to a CSV in the current working directory,
# leaving out the row index.
processed_file_path = 'processed_dataset.csv'
data.to_csv(path_or_buf=processed_file_path, index=False)
print(f"Processed dataset saved to {processed_file_path}")


Processed dataset saved to processed_dataset.csv
