In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

data = pd.read_csv('C:/User/kgonzales21/Downloads/IE things/Datasets/Milk Condition Dataset.csv')
data.head()

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour,Grade
0,6.6,35,1,0,1,0,254,high
1,6.6,36,0,1,0,1,253,high
2,8.5,70,1,1,1,1,246,low
3,9.5,34,1,1,0,1,255,low
4,6.6,37,0,0,0,0,255,medium


In [2]:
data_mod = data.rename(columns = {'Temprature':'Temperature'})
data_mod.columns

Index(['pH', 'Temperature', 'Taste', 'Odor', 'Fat ', 'Turbidity', 'Colour',
       'Grade'],
      dtype='object')

In [3]:
print("")
print("---Info---")
print("")
print(data_mod.info())
print("")
print("---Data Shape---")
print("")
print(data_mod.shape)


---Info---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pH           1059 non-null   float64
 1   Temperature  1059 non-null   int64  
 2   Taste        1059 non-null   int64  
 3   Odor         1059 non-null   int64  
 4   Fat          1059 non-null   int64  
 5   Turbidity    1059 non-null   int64  
 6   Colour       1059 non-null   int64  
 7   Grade        1059 non-null   object 
dtypes: float64(1), int64(6), object(1)
memory usage: 66.3+ KB
None

---Data Shape---

(1059, 8)


In [4]:
data_mod['Grade'].value_counts()

low       429
medium    374
high      256
Name: Grade, dtype: int64

In [5]:
high_grade = data_mod[data_mod['Grade'] == 'high']['Grade']
print(f"High grade: {len(high_grade)}")
medium_grade = data_mod[data_mod['Grade'] == 'medium']['Grade']
print(f"Medium Grade: {len(medium_grade)}")
low_grade = data_mod[data_mod['Grade'] == 'low']['Grade']
print(f"Low Grade: {len(low_grade)}")

High grade: 256
Medium Grade: 374
Low Grade: 429


In [6]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data_mod, test_size=0.20, random_state=45)

In [7]:
high = train[train['Grade'] == 'high']
medium = train[train['Grade'] == 'medium']
low = train[train['Grade'] == 'low']
medium_shrunk = medium[:len(high)]
low_shrunk = low[:len(high)]
Final_train = pd.concat([high, medium_shrunk, low_shrunk], axis=0)
print(f"-- High Grade: {len(high)}", f"-- Medium grade: {len(medium_shrunk)}", f"-- Low Grade: {len(low_shrunk)} --")
print(f"Shape: {Final_train.shape}")
Final_train.head()
Final_shuffled = Final_train.sample(frac=1)
Final_shuffled.head()

-- High Grade: 206 -- Medium grade: 206 -- Low Grade: 206 --
Shape: (618, 8)


Unnamed: 0,pH,Temperature,Taste,Odor,Fat,Turbidity,Colour,Grade
906,9.0,43,1,0,1,1,250,low
239,6.8,41,0,0,1,0,255,medium
985,6.5,45,1,0,0,0,246,medium
750,6.8,40,1,1,1,1,255,high
166,6.7,50,1,1,1,0,245,low


In [8]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
Final_train_features = std_scaler.fit_transform(Final_shuffled.select_dtypes(include=['int64', 'float64']))
Final_train_features

array([[ 1.87778211, -0.06228104,  0.88955289, ...,  0.61600673,
         1.0329035 , -0.45832321],
       [ 0.11719142, -0.27493123, -1.12416025, ...,  0.61600673,
        -0.96814466,  0.72087638],
       [-0.12288913,  0.15036914,  0.88955289, ..., -1.6233589 ,
        -0.96814466, -1.40168288],
       ...,
       [-0.12288913, -0.80655669, -1.12416025, ...,  0.61600673,
        -0.96814466,  0.72087638],
       [ 0.03716457, -0.5939065 ,  0.88955289, ...,  0.61600673,
        -0.96814466,  0.72087638],
       [ 0.59735252,  2.27687099, -1.12416025, ..., -1.6233589 ,
        -0.96814466,  0.72087638]])

In [9]:
Final_train_label = Final_shuffled.select_dtypes(include=['O'])
Final_train_label.head()

Unnamed: 0,Grade
906,low
239,medium
985,medium
750,high
166,low


In [10]:
from sklearn.tree import DecisionTreeClassifier
clf_dec_tree = DecisionTreeClassifier() 
clf_dec_tree.fit(Final_train_features, Final_train_label)

DecisionTreeClassifier()

In [11]:
sample = Final_train_features[0]
sample

array([ 1.87778211, -0.06228104,  0.88955289, -0.9221168 ,  0.61600673,
        1.0329035 , -0.45832321])

In [12]:
sample_scores = clf_dec_tree.predict([sample])
sample_scores

array(['low'], dtype=object)

In [13]:
clf_dec_tree.predict_proba([sample])

array([[0., 1., 0.]])

In [14]:
from sklearn.metrics import f1_score

print(f"F1 Score: {f1_score(Final_train_label, clf_dec_tree.predict(Final_train_features), average=None, labels=['high', 'medium', 'low'])}")
print(f"Mean Score: {clf_dec_tree.score(Final_train_features, Final_train_label)}")

F1 Score: [1. 1. 1.]
Mean Score: 1.0


In [15]:
from sklearn.model_selection import cross_val_predict

y_train_predictions = cross_val_predict(clf_dec_tree, Final_train_features, Final_train_label, cv=3)
y_train_predictions[:5]

array(['low', 'medium', 'medium', 'high', 'low'], dtype=object)

In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Final_train_label, y_train_predictions)

array([[203,   0,   3],
       [  1, 205,   0],
       [  0,   0, 206]], dtype=int64)

In [17]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf_dec_tree, Final_train_features, Final_train_label, cv=3, scoring="accuracy")

array([0.99514563, 0.99029126, 0.99514563])

In [24]:
print(f"F1 Score: {f1_score(Final_train_label, y_train_predictions, average=None, labels=['high', 'medium', 'low'])}")

F1 Score: [0.9902439  0.99277108 0.99756691]
