In [1]:
import pandas as pd
import numpy as np
import seaborn as snsa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load the milk-quality dataset from a CSV in the working directory.
df = pd.read_csv('milknew.csv')
# Leave the frame as the cell's last expression so Jupyter renders it with its
# rich (truncated head/tail) table display instead of plain printed text.
df

       pH  Temprature  Taste  Odor  Fat   Turbidity  Colour   Grade
0     6.6          35      1     0     1          0     254    high
1     6.6          36      0     1     0          1     253    high
2     8.5          70      1     1     1          1     246     low
3     9.5          34      1     1     0          1     255     low
4     6.6          37      0     0     0          0     255  medium
...   ...         ...    ...   ...   ...        ...     ...     ...
1054  6.7          45      1     1     0          0     247  medium
1055  6.7          38      1     0     1          0     255    high
1056  3.0          40      1     1     1          1     255     low
1057  6.8          43      1     0     1          0     250    high
1058  8.6          55      0     1     1          1     255     low

[1059 rows x 8 columns]


In [3]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pH          1059 non-null   float64
 1   Temprature  1059 non-null   int64  
 2   Taste       1059 non-null   int64  
 3   Odor        1059 non-null   int64  
 4   Fat         1059 non-null   int64  
 5   Turbidity   1059 non-null   int64  
 6   Colour      1059 non-null   int64  
 7   Grade       1059 non-null   object 
dtypes: float64(1), int64(6), object(1)
memory usage: 66.3+ KB


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour
count,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0,1059.0
mean,6.630123,44.226629,0.546742,0.432483,0.671388,0.491029,251.840415
std,1.399679,10.098364,0.498046,0.495655,0.46993,0.500156,4.307424
min,3.0,34.0,0.0,0.0,0.0,0.0,240.0
25%,6.5,38.0,0.0,0.0,0.0,0.0,250.0
50%,6.7,41.0,1.0,0.0,1.0,0.0,255.0
75%,6.8,45.0,1.0,1.0,1.0,1.0,255.0
max,9.5,90.0,1.0,1.0,1.0,1.0,255.0


In [5]:
# List the distinct labels present in the Grade column before encoding.
df['Grade'].unique()

array(['high', 'low', 'medium'], dtype=object)

In [6]:
# Encode the text grade labels as integer class ids for the classifier.
# NOTE(review): the ids are arbitrary class codes, not a quality ranking
# (low=2 sits between high=1 and medium=3) -- treat them as nominal.
mapping = {"high":1,'low':2,'medium':3}

# Series.map() turns every value that is not a key of `mapping` into NaN, so
# re-running this cell on the already-encoded integer column would wipe it out.
# Only encode while the column still holds the original (non-numeric) labels,
# which makes the cell safe to execute more than once.
if not pd.api.types.is_numeric_dtype(df["Grade"]):
    df["Grade"] = df["Grade"].map(mapping)

In [7]:
# Check which values the Grade column holds after the encoding step.
df['Grade'].unique()

array([1, 2, 3], dtype=int64)

In [8]:
# Re-inspect dtypes and non-null counts after the Grade column was encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pH          1059 non-null   float64
 1   Temprature  1059 non-null   int64  
 2   Taste       1059 non-null   int64  
 3   Odor        1059 non-null   int64  
 4   Fat         1059 non-null   int64  
 5   Turbidity   1059 non-null   int64  
 6   Colour      1059 non-null   int64  
 7   Grade       1059 non-null   int64  
dtypes: float64(1), int64(7)
memory usage: 66.3 KB


In [9]:
# Pairwise correlation matrix across all columns (DataFrame.corr defaults to Pearson).
# NOTE(review): Grade is encoded as high=1, low=2, medium=3, which is not a
# quality ordering, so the Grade row/column should be interpreted with care.
df.corr()

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour,Grade
pH,1.0,0.244684,-0.064053,-0.081331,-0.093429,0.048384,-0.164565,-0.012518
Temprature,0.244684,1.0,-0.109792,-0.04887,0.024073,0.185106,-0.008511,-0.094232
Taste,-0.064053,-0.109792,1.0,0.017582,0.324149,0.055755,-0.082654,-0.197629
Odor,-0.081331,-0.04887,0.017582,1.0,0.314505,0.457935,-0.039361,-0.454609
Fat,-0.093429,0.024073,0.324149,0.314505,1.0,0.329264,0.114151,-0.50107
Turbidity,0.048384,0.185106,0.055755,0.457935,0.329264,1.0,0.136436,-0.428017
Colour,-0.164565,-0.008511,-0.082654,-0.039361,0.114151,0.136436,1.0,-0.212994
Grade,-0.012518,-0.094232,-0.197629,-0.454609,-0.50107,-0.428017,-0.212994,1.0


In [10]:
# Show the exact column labels. Note that the output lists 'Fat ' with a
# trailing space, so that column must be referenced with the space included.
df.columns

Index(['pH', 'Temprature', 'Taste', 'Odor', 'Fat ', 'Turbidity', 'Colour',
       'Grade'],
      dtype='object')

In [11]:
# Target vector: the encoded Grade column.
y = df['Grade']
# Feature matrix: every remaining column of the frame.
X = df.drop(columns='Grade')

In [12]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour
0,6.6,35,1,0,1,0,254
1,6.6,36,0,1,0,1,253
2,8.5,70,1,1,1,1,246
3,9.5,34,1,1,0,1,255
4,6.6,37,0,0,0,0,255


In [13]:
# Preview the first rows of the target vector.
y.head()

0    1
1    1
2    2
3    2
4    3
Name: Grade, dtype: int64

In [14]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [24]:
# Small random forest (5 trees) with a fixed seed so the fit is repeatable.
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=5,
)
# Train on the training split; fit() stays the cell's final expression.
rf_classifier.fit(X=X_train, y=y_train)

In [25]:
# Predict grade codes for the held-out test features.
y_pred = rf_classifier.predict(X_test)

In [26]:
# Fraction of test rows whose predicted grade matches the true grade.
# NOTE(review): the recorded output is a perfect score -- worth confirming
# that no duplicate rows are shared between the train and test splits.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00
