In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
Macros = pd.read_csv('macros_dataset.csv')
Macros.head()

Unnamed: 0,food_name,proteins_100g,carbohydrates_100g,fat_100g,energy_100g,category_name,origin,diet_type
0,Alaskan salmon,25.4,0.0,6.7,170.0,Fish,Animal,Omnivorous
1,Almonds,21.3,19.7,50.6,577.0,Nuts,Non-animal,Vegan
2,Amaranth,14.5,66.2,6.5,374.0,Grains,Non-animal,Vegan
3,Anchovies,29.0,0.0,9.7,210.0,Fish,Animal,Omnivorous
4,Asparagus,2.4,4.1,0.2,22.0,Vegetables,Non-animal,Vegan


In [3]:
Macros.shape

(60, 8)

In [4]:
len(Macros)

60

In [5]:
Macros.describe()

Unnamed: 0,proteins_100g,carbohydrates_100g,fat_100g,energy_100g
count,60.0,60.0,60.0,60.0
mean,19.263333,10.483333,13.256667,231.566667
std,10.176093,15.565348,16.035473,163.310935
min,2.0,0.0,0.2,22.0
25%,12.475,0.375,1.975,114.0
50%,21.0,4.05,6.45,164.0
75%,25.175,15.825,20.925,314.25
max,57.5,67.4,65.2,653.0


In [7]:
Macros.isna().sum()

food_name             0
proteins_100g         0
carbohydrates_100g    0
fat_100g              0
energy_100g           0
category_name         0
origin                0
diet_type             0
dtype: int64

In [9]:
Macros['origin'].unique()

array(['Animal', 'Non-animal'], dtype=object)

In [10]:
# Create X (all the feature columns)
x = Macros.drop('origin', axis=1)
# Create y (the target column)
y = Macros['origin']

In [11]:
x.shape

(60, 7)

In [12]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [15]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)

In [16]:
x.columns

Index(['food_name', 'proteins_100g', 'carbohydrates_100g', 'fat_100g',
       'energy_100g', 'category_name', 'diet_type'],
      dtype='object')

In [17]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['food_name', 'proteins_100g', 'carbohydrates_100g', 'fat_100g',
       'energy_100g', 'category_name', 'diet_type']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features)],
                                 remainder="passthrough")
transformed_X = transformer.fit_transform(x)
transformed_X

<60x280 sparse matrix of type '<class 'numpy.float64'>'
	with 420 stored elements in Compressed Sparse Row format>

In [18]:
transformed_X[0]

<1x280 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [19]:
pd.DataFrame(transformed_X)

Unnamed: 0,0
0,"(0, 0)\t1.0\n (0, 98)\t1.0\n (0, 110)\t1.0..."
1,"(0, 1)\t1.0\n (0, 88)\t1.0\n (0, 140)\t1.0..."
2,"(0, 2)\t1.0\n (0, 76)\t1.0\n (0, 151)\t1.0..."
3,"(0, 3)\t1.0\n (0, 104)\t1.0\n (0, 110)\t1...."
4,"(0, 4)\t1.0\n (0, 62)\t1.0\n (0, 125)\t1.0..."
5,"(0, 5)\t1.0\n (0, 60)\t1.0\n (0, 133)\t1.0..."
6,"(0, 6)\t1.0\n (0, 61)\t1.0\n (0, 147)\t1.0..."
7,"(0, 7)\t1.0\n (0, 69)\t1.0\n (0, 143)\t1.0..."
8,"(0, 8)\t1.0\n (0, 62)\t1.0\n (0, 131)\t1.0..."
9,"(0, 9)\t1.0\n (0, 82)\t1.0\n (0, 111)\t1.0..."


In [20]:
# Let's refit the model
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(transformed_X,
                                                    y,
                                                    test_size=0.2)
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
model.score(X_test, y_test)

1.0

In [21]:
# Make predictions
y_preds = model.predict(X_test)

In [22]:
# Make predictions with probabilities
y_probs = model.predict_proba(X_test)

y_probs[:10], len(y_probs)

(array([[1.  , 0.  ],
        [0.17, 0.83],
        [0.97, 0.03],
        [0.06, 0.94],
        [0.99, 0.01],
        [0.16, 0.84],
        [0.09, 0.91],
        [0.18, 0.82],
        [0.13, 0.87],
        [0.98, 0.02]]),
 12)

In [23]:
y_probs_positive = y_probs[:10]
y_probs_positive[:10]

array([[1.  , 0.  ],
       [0.17, 0.83],
       [0.97, 0.03],
       [0.06, 0.94],
       [0.99, 0.01],
       [0.16, 0.84],
       [0.09, 0.91],
       [0.18, 0.82],
       [0.13, 0.87],
       [0.98, 0.02]])

In [24]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

      Animal       1.00      1.00      1.00         5
  Non-animal       1.00      1.00      1.00         7

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12



In [25]:
# Evaluating the model
print(f'training accuaracy: {model.score(X_train,y_train)}')

training accuaracy: 1.0


In [26]:
# Evaluating the model
print(f'testing accuaracy: {model.score(X_train,y_train)}')

testing accuaracy: 1.0


In [27]:
# Improve a model
# Try different amount of n_estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {model.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set: 100.00%

Trying model with 20 estimators...
Model accuracy on test set: 100.00%

Trying model with 30 estimators...
Model accuracy on test set: 100.00%

Trying model with 40 estimators...
Model accuracy on test set: 100.00%

Trying model with 50 estimators...
Model accuracy on test set: 100.00%

Trying model with 60 estimators...
Model accuracy on test set: 100.00%

Trying model with 70 estimators...
Model accuracy on test set: 100.00%

Trying model with 80 estimators...
Model accuracy on test set: 100.00%

Trying model with 90 estimators...
Model accuracy on test set: 100.00%

