## Import Library

In [None]:
import pandas as pd

## Gathering Data
The dataset was collected using a web scraper from the Glycemic Index Guide [website](https://glycemic-index.net/).

In [None]:
!wget --no-check-certificate \
  https://raw.githubusercontent.com/Glucofy-Team/Glucofy-Machine-Learning/main/data/nutrition%20food%20dataset.csv \
  -O /content/food.csv

--2024-05-24 15:18:26--  https://raw.githubusercontent.com/Glucofy-Team/Glucofy-Machine-Learning/main/data/nutrition%20food%20dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 102371 (100K) [text/plain]
Saving to: ‘/content/food.csv’


2024-05-24 15:18:26 (4.07 MB/s) - ‘/content/food.csv’ saved [102371/102371]



In [None]:
# Read the file saved by the download cell into the working DataFrame.
csv_path = '/content/food.csv'
df = pd.read_csv(csv_path)

## Assessing Data
Check for missing values, duplicate data, and inaccurate values.

In [None]:
# Print each column's name, non-null count, and dtype, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   web-scraper-order      586 non-null    object 
 1   web-scraper-start-url  586 non-null    object 
 2   category-href          586 non-null    object 
 3   category               586 non-null    object 
 4   name                   586 non-null    object 
 5   glycemic_index         586 non-null    int64  
 6   glycemic_load          586 non-null    float64
 7   calories (kcal)        586 non-null    int64  
 8   proteins (g)           586 non-null    float64
 9   carbohydrates (g)      586 non-null    float64
 10  fats (g)               586 non-null    float64
dtypes: float64(4), int64(2), object(5)
memory usage: 50.5+ KB


In [None]:
# Count missing (NaN/None) values per column; all zeros means nothing to impute.
df.isna().sum()

web-scraper-order        0
web-scraper-start-url    0
category-href            0
category                 0
name                     0
glycemic_index           0
glycemic_load            0
calories (kcal)          0
proteins (g)             0
carbohydrates (g)        0
fats (g)                 0
dtype: int64

In [None]:
# Count fully duplicated rows among the columns that are kept after cleaning.
# The three columns below are dropped later in the notebook (their names suggest
# web-scraper bookkeeping); including them in the comparison could hide rows that
# are otherwise identical. errors='ignore' keeps this cell runnable even after
# those columns have already been removed from df.
scraper_cols = ['web-scraper-order', 'web-scraper-start-url', 'category-href']
df.drop(columns=scraper_cols, errors='ignore').duplicated().sum()

0

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, used to eyeball implausible values.
df.describe()

Unnamed: 0,glycemic_index,glycemic_load,calories (kcal),proteins (g),carbohydrates (g),fats (g)
count,586.0,586.0,586.0,586.0,586.0,586.0
mean,41.298635,14.779181,217.59727,6.961792,28.504522,8.719266
std,24.131008,18.316097,180.102454,7.618162,27.074133,16.960519
min,0.0,0.0,2.0,0.0,0.0,0.0
25%,25.0,1.6,56.0,1.125,6.025,0.2
50%,40.0,7.0,187.0,3.65,17.1,1.45
75%,60.0,22.175,339.25,10.075,52.775,8.0
max,115.0,95.0,900.0,46.0,100.0,100.0


## Cleaning Data

- Drop the columns that are not needed for analysis (`web-scraper-order`, `web-scraper-start-url`, `category-href`).

In [None]:
# Columns the notebook treats as unnecessary (names suggest web-scraper bookkeeping).
scraper_cols = ['web-scraper-order', 'web-scraper-start-url', 'category-href']

# Rebind df instead of mutating it in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=scraper_cols, errors='ignore')

In [None]:
# Re-inspect the schema to confirm the dropped columns no longer appear.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category           586 non-null    object 
 1   name               586 non-null    object 
 2   glycemic_index     586 non-null    int64  
 3   glycemic_load      586 non-null    float64
 4   calories (kcal)    586 non-null    int64  
 5   proteins (g)       586 non-null    float64
 6   carbohydrates (g)  586 non-null    float64
 7   fats (g)           586 non-null    float64
dtypes: float64(4), int64(2), object(2)
memory usage: 36.8+ KB


- Convert the integer columns (`glycemic_index`, `calories (kcal)`) to `float64` so that every numeric column has the same data type.

In [None]:
# Cast each integer-typed column to float64, one column at a time, writing the
# converted values back into the same DataFrame.
for int_col in ('glycemic_index', 'calories (kcal)'):
    df[int_col] = df[int_col].astype('float64')

In [None]:
# Re-inspect the schema to confirm the cast: the numeric columns should all
# report float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586 entries, 0 to 585
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category           586 non-null    object 
 1   name               586 non-null    object 
 2   glycemic_index     586 non-null    float64
 3   glycemic_load      586 non-null    float64
 4   calories (kcal)    586 non-null    float64
 5   proteins (g)       586 non-null    float64
 6   carbohydrates (g)  586 non-null    float64
 7   fats (g)           586 non-null    float64
dtypes: float64(6), object(2)
memory usage: 36.8+ KB


## Save Dataset

In [None]:
# Write the cleaned DataFrame to CSV in the current working directory.
# index=False keeps the row index out of the saved file.
output_path = '(modified) nutrition food dataset.csv'
df.to_csv(output_path, index=False)