In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/mock_milk_quality.csv")

# Preview the data
df.head()

# Check dataset structure
df.info()

# Check for missing values
print("\nMissing values:\n", df.isnull().sum())

# Encode 'Quality' into numeric labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Quality_Label'] = le.fit_transform(df['Quality'])  # 'High' = 0 or 1 depending on mapping

# Preview updated data
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pH            10 non-null     float64
 1   Temperature   10 non-null     float64
 2   Fat_Content   10 non-null     float64
 3   Conductivity  10 non-null     float64
 4   Density       10 non-null     float64
 5   Quality       10 non-null     object 
dtypes: float64(5), object(1)
memory usage: 612.0+ bytes

Missing values:
 pH              0
Temperature     0
Fat_Content     0
Conductivity    0
Density         0
Quality         0
dtype: int64


Unnamed: 0,pH,Temperature,Fat_Content,Conductivity,Density,Quality,Quality_Label
0,6.5,34.5,3.5,4.5,1.03,High,0
1,6.7,35.2,3.8,4.6,1.032,High,0
2,6.3,36.0,3.2,4.4,1.028,Low,1
3,6.8,34.8,4.0,4.7,1.033,High,0
4,7.0,35.5,3.9,4.8,1.035,High,0


In [3]:
# Prepare features (X) and labels (y)
X = df.drop(['Quality', 'Quality_Label'], axis=1)
y = df['Quality_Label']

# Split into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check dimensions
X_train.shape, X_test.shape


((8, 5), (2, 5))

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Confusion Matrix:
 [[2]]




In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Confusion Matrix:
 [[1 0]
 [0 1]]


In [7]:
import joblib

# Save the trained model
joblib.dump(model, "../ml_model/dairy_model.pkl")
print("Model saved to ml_model/dairy_model.pkl")

Model saved to ml_model/dairy_model.pkl
