<a href="https://colab.research.google.com/github/MuhammadHaris78/food-waste-analysis/blob/main/Food_Waste_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Objective:**

The objective of this project is to analyze a food service dataset to gain insights into operational efficiency and food waste management. The dataset consists of variables such as the number of meals served, kitchen staff, environmental conditions (temperature and humidity), and food waste. Our goal is to explore this data, clean it, visualize key patterns, and derive actionable insights to optimize operations.

# **1. Uploading the food data dataframe.**


In [None]:
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the food-service records from the CSV file at the path below, then
# preview the first rows and print the (rows, columns) shape.
csv_path = "/content/drive/MyDrive/Classroom/Food data.csv"
df = pd.read_csv(csv_path)
display(df.head())
print(df.shape)

# **2. Data Cleaning**

In [None]:
# Remove the 'ID' column in place; per the author, the date already serves as
# the unique identifier of each row. Then preview the first five rows.
df.drop('ID', axis=1, inplace=True)
df.head()

In [None]:
# Count the rows that exactly repeat an earlier row (the first occurrence of
# each row is not counted as a duplicate).
df.duplicated(keep='first').sum()

In [None]:
# Removing the duplicate rows.
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result has to be assigned back to `df`; otherwise the duplicate rows
# would still be present in every later cleaning and plotting step.
df = df.drop_duplicates()
df

In [None]:
# Report, for every column, how many entries are missing and what percentage
# of all rows that represents.
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)
print("-------------------")
missing_percentage = missing_values / df.shape[0] * 100
print("Missing Values Percenatge:\n", missing_percentage)

In [None]:
# Inspect every column: print the number of distinct values followed by the
# frequency of each value, to spot inconsistent labels and non-numeric entries.
for name, values in df.items():
    print(f"\nColumn {name}: ")
    print(values.nunique())
    print(values.value_counts())
    print("---------------------------------")

In [None]:
# Normalise inconsistent entries so that each category / number is written
# in a single way, printing the value counts after every fix.
divider = "-------------------------------------------"

# 'kitchen_staff': cast to str and trim whitespace, map the spelled-out
# 'ten' / 'eleven' to digits, then parse as nullable integers. Anything that
# still cannot be parsed (including missing entries, which astype(str) turns
# into text) becomes <NA> because of errors='coerce'.
staff_text = df['kitchen_staff'].astype(str).str.strip()
staff_text = staff_text.replace({'ten': '10', 'eleven': '11'})
df['kitchen_staff'] = pd.to_numeric(staff_text, errors='coerce').astype('Int64')
print(df['kitchen_staff'].value_counts())
print(divider)

# 'special_event': the author notes this column has no missing values, so
# once 'One' is replaced by 1 it is cast straight to int.
event_text = df['special_event'].astype(str).str.strip()
df['special_event'] = event_text.replace({'One': 1}).astype(int)
print(df['special_event'].value_counts())
print(divider)

# 'staff_experience': capitalise each label (first letter upper-case, the
# rest lower-case) so differently-cased spellings collapse into one category.
experience = df['staff_experience']
df['staff_experience'] = experience.str.capitalize()
print(df['staff_experience'].value_counts())
print(divider)

# 'waste_category': lower-case every label for the same reason.
waste_labels = df['waste_category']
df['waste_category'] = waste_labels.str.lower()
print(df['waste_category'].value_counts())
print(divider)

# Confirm the resulting column dtypes.
print(df.dtypes)

In [None]:
# Imputing the missing values in categorical columns with mode.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on df[col] is chained assignment, which emits a
# FutureWarning in pandas 2.x and no longer updates `df` once Copy-on-Write
# is in effect.
miss_val_categ_columns = ["kitchen_staff", "staff_experience", "waste_category"]
for col in miss_val_categ_columns:
    # mode() may return several values on a tie; [0] takes the first of them.
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Draw a histogram with a KDE curve for each numerical column listed below,
# so the shape of each distribution can guide the choice of imputation value.
miss_val_num_columns = ["humidity_percent", "past_waste_kg", "meals_served"]
fig = plt.figure(figsize=(15, 10))
for position, col in enumerate(miss_val_num_columns, start=1):
    # Panels are placed left-to-right in a 3x3 grid.
    ax = fig.add_subplot(3, 3, position)
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f"Disribution of {col}")
fig.tight_layout()
plt.show()

In [None]:
# Per the author's reading of the histograms, 'humidity_percent' and
# 'past_waste_kg' are almost normally distributed while 'meals_served' is
# skewed, so the first two are imputed with the mean and 'meals_served' with
# the median.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on df[col] is chained assignment, which emits a
# FutureWarning in pandas 2.x and no longer updates `df` once Copy-on-Write
# is in effect.
numerical_cols_mean = ['humidity_percent', 'past_waste_kg']
for col in numerical_cols_mean:
    df[col] = df[col].fillna(df[col].mean())

df['meals_served'] = df['meals_served'].fillna(df['meals_served'].median())

In [None]:
# Parse the 'date' column into pandas datetimes; entries that cannot be
# parsed become NaT because of errors='coerce'.
parsed_dates = pd.to_datetime(df['date'], errors='coerce')
df['date'] = parsed_dates

# **3. Summary Statistics of the continuous columns and handling outliers.**

In [None]:
# Continuous numeric columns analysed in this section.
numerical_cont_columns = [
    'meals_served',
    'temperature_C',
    'humidity_percent',
    'past_waste_kg',
]
# Descriptive statistics (count, mean, std, min, quartiles, max) for them.
df.loc[:, numerical_cont_columns].describe()

In [None]:
# Handling outliers, step 1: draw one boxplot per continuous column to see
# whether any values fall outside the whiskers.
fig = plt.figure(figsize=(15, 5))
for position, col in enumerate(numerical_cont_columns, start=1):
    # Panels are placed in a 2x2 grid.
    ax = fig.add_subplot(2, 2, position)
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Confirm the outliers suggested by the boxplots with the IQR rule: a value is
# an outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
for col in numerical_cont_columns:
    column = df[col]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread
    # Missing values compare False on both sides, so they are never flagged.
    is_outlier = column.lt(lower_bound) | column.gt(upper_bound)
    outliers = df[is_outlier]
    print(f'\nThe number of outliers in {col} are {len(outliers)}')
    # Show up to three example outlier values for this column.
    display(outliers[col].head(3))

In [None]:
# Cap the outliers: values beyond the IQR fences are replaced by the fence
# value itself (lower or upper), everything in between is left unchanged.
def _iqr_bounds(values, whisker=1.5):
    """Return the (lower, upper) fences Q1 - whisker*IQR and Q3 + whisker*IQR."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1 - whisker * iqr, q3 + whisker * iqr


for col in numerical_cont_columns:
    low, high = _iqr_bounds(df[col])
    df[col] = df[col].clip(lower=low, upper=high)

# **4. Visualizing Distribution.**

In [None]:
# Visualize each continuous column twice, side by side: a histogram with a
# KDE curve followed by an orange boxplot.
fig = plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cont_columns):
    # Each column takes two consecutive slots of the 4x4 grid.
    hist_ax = fig.add_subplot(4, 4, 2 * i + 1)
    sns.histplot(df[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {col}')

    box_ax = fig.add_subplot(4, 4, 2 * i + 2)
    sns.boxplot(y=df[col], color='orange', ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')
fig.tight_layout()
plt.show()

In [None]:
# Show how often each value occurs in every categorical column, one countplot
# per column in a 3x3 grid.
cat_columns = [
    'kitchen_staff',
    'day_of_week',
    'special_event',
    'staff_experience',
    'waste_category',
]
fig = plt.figure(figsize=(15, 10))
for position, col in enumerate(cat_columns, start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.countplot(x=df[col], ax=ax)
    ax.set_title(f'Countplot of {col}')
    # The panel just added is the current axes, so this rotates its x labels.
    plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Pairwise Pearson correlation between the numeric columns; non-numeric
# columns are left out via numeric_only=True.
df.corr(method='pearson', numeric_only=True)

In [None]:
# Draw the correlation matrix of the numeric columns as a heatmap, with the
# coefficient written inside every cell.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(14, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Compare 'past_waste_kg' across the levels of 'waste_category' and
# 'staff_experience', one barplot per feature, side by side.
cat_features = ['waste_category', 'staff_experience']
fig = plt.figure(figsize=(15, 5))
for position, col in enumerate(cat_features, start=1):
    ax = fig.add_subplot(1, 2, position)
    sns.barplot(x=df[col], y=df['past_waste_kg'], ax=ax)
    ax.set_title(f'Food waste by {col}')
fig.tight_layout()
plt.show()

# **5. Key Insights and Recommendations**

In [None]:
# Staffing optimization: barplot of 'past_waste_kg' for each kitchen staff
# count, to see how waste varies with the number of staff.
plt.figure(figsize=(15, 5))
sns.barplot(data=df, x='kitchen_staff', y='past_waste_kg')
plt.title("Barplot of kitchen_staff vs. past_waste_kg")
plt.show()

In [None]:
# Environmental factors: scatter 'past_waste_kg' against temperature and
# against humidity, side by side.
env_factors = ['temperature_C', 'humidity_percent']
fig = plt.figure(figsize=(15, 5))
for position, col in enumerate(env_factors, start=1):
    ax = fig.add_subplot(1, 2, position)
    sns.scatterplot(data=df, x=col, y='past_waste_kg', ax=ax)
    ax.set_title(f'Food waste vs. {col}')
fig.tight_layout()
plt.show()

In [None]:
# Event management: barplot of 'past_waste_kg' for each value of
# 'special_event', to compare waste on event and non-event days.
plt.figure(figsize=(5, 5))
sns.barplot(data=df, x='special_event', y='past_waste_kg')
plt.title('Food waste vs. special events')
plt.show()