In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw food-waste CSV. The path is absolute, so the file must already
# exist at this location in the runtime.
file_path = '/content/foodWatageData.csv'
data = pd.read_csv(file_path)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data.info()
print(data.head())


In [None]:
def clean_dataset(df):
    """Return a cleaned copy of ``df``; the input frame is never modified.

    Steps, in order:
      1. Strip surrounding whitespace from the column names.
      2. Strip surrounding whitespace from the values of object columns.
      3. Drop exact duplicate rows, then rows containing any missing value.
      4. Cast the remaining object columns to ``str``.
      5. Keep only rows whose int64/float64 columns are all >= 0.
      6. Renumber the index from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with a fresh 0..n-1 index.
    """
    # Work on a copy: assigning to .columns / df[col] on the argument would
    # silently rewrite the caller's raw frame.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()

    for col in cleaned.select_dtypes(include=['object']).columns:
        cleaned[col] = cleaned[col].str.strip()

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a derived view (avoids SettingWithCopyWarning).
    cleaned = cleaned.drop_duplicates().dropna().copy()

    for col in cleaned.select_dtypes(include=['object']).columns:
        cleaned[col] = cleaned[col].astype(str)

    # A single row mask replaces the per-column filtering loop; a row is kept
    # only if every int64/float64 value in it is non-negative.
    numeric_cols = cleaned.select_dtypes(include=['float64', 'int64']).columns
    if len(numeric_cols) > 0:
        non_negative = (cleaned[numeric_cols] >= 0).all(axis=1)
        cleaned = cleaned[non_negative]

    return cleaned.reset_index(drop=True)
# Build the cleaned frame from the raw data loaded earlier.
cleaned_data = clean_dataset(data)

# info() prints its own report and returns None; calling it directly avoids
# the stray "None" line that print(cleaned_data.info()) would produce.
cleaned_data.info()
print(cleaned_data.head())


In [None]:
# Persist the cleaned frame (without the index column) and report its size.
cleaned_data.to_csv(
    path_or_buf='cleaned_food_waste.csv',
    index=False,
)
print(f"Cleaned dataset shape: {cleaned_data.shape}")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Per-row total waste: the sum of the three sector estimates, each of which is
# a tonnes/year column. Later cells read this 'Food_Waste' column.
cleaned_data['Food_Waste'] = (
    cleaned_data['Household estimate (tonnes/year)']
    + cleaned_data['Retail estimate (tonnes/year)']
    + cleaned_data['Food service estimate (tonnes/year)']
)

sns.histplot(cleaned_data['Food_Waste'], kde=True, bins=30, color='blue')
plt.title('Distribution of Food Waste')
# The plotted value is a sum of tonnes/year columns, so the axis states that
# unit instead of the ambiguous "tons or kilograms".
plt.xlabel('Food Waste (tonnes/year)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of the cleaned frame.
numeric_data = cleaned_data.select_dtypes(include=np.number)
correlation_matrix = numeric_data.corr()

plt.figure(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Ten countries with the largest summed 'Food_Waste'.
top_countries = (
    cleaned_data
    .groupby('Country')['Food_Waste']
    .sum()
    .nlargest(10)
)

plt.figure(figsize=(12, 6))
bars = plt.bar(top_countries.index, top_countries.values, width=0.6, color='darkviolet')

# Label through the current Axes rather than the pyplot helpers.
axes = plt.gca()
axes.set_title('Top 10 Countries by Food Waste')
axes.set_xlabel('Country')
axes.set_ylabel('Total Food Waste')

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Ten rows with the highest combined per-capita figure.
top_countries = (
    cleaned_data[['Country', 'combined figures (kg/capita/year)']]
    .sort_values(by='combined figures (kg/capita/year)', ascending=False)
    .head(10)
)

plt.figure(figsize=(12, 6))
# seaborn >= 0.13 deprecates passing `palette` without `hue`. Mapping hue to the
# x variable keeps one colour per bar; dodge=False keeps the bars full width.
ax = sns.barplot(
    x='Country',
    y='combined figures (kg/capita/year)',
    hue='Country',
    data=top_countries,
    palette='viridis',
    dodge=False,
)
# Some seaborn versions add a legend when hue is set; it only repeats the
# x tick labels, so remove it if present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title('Top 10 Countries by Food Waste (kg/capita/year)')
plt.xlabel('Country')
plt.ylabel('Food Waste (kg/capita/year)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Per-capita figures are rates: adding them up across the rows of a region
# gives a number that grows with the row count and is no longer
# kg/capita/year. Use the (unweighted) mean so the value matches the axis unit.
# NOTE(review): a population-weighted mean would be more accurate, but no
# population column is referenced in this notebook.
region_waste = (
    cleaned_data
    .groupby('Region')['combined figures (kg/capita/year)']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 6))
region_waste.plot(kind='bar', color='orange')
plt.title('Average Food Waste by Region (kg/capita/year)')
plt.xlabel('Region')
plt.ylabel('Average Food Waste (kg/capita/year)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# One-hot encode into a NEW frame. Overwriting `cleaned_data` would remove the
# 'Region' column, so re-running this cell (or the Region plot) would raise
# KeyError.
model_data = pd.get_dummies(cleaned_data, columns=['Region', 'Confidence in estimate'])

# Features: everything except the identifier, the free-text source and the target.
# NOTE(review): X keeps every other column, including the tonnes/year estimates
# and the derived 'Food_Waste' total. If the frame also holds per-capita sector
# estimates that add up to the target, this is target leakage - confirm the
# intended feature set.
X = model_data.drop(columns=['Country', 'combined figures (kg/capita/year)', 'Source'])
y = model_data['combined figures (kg/capita/year)']

# Fixed random_state keeps the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


In [50]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to the test split.
# The results go into new names: `model` and `y_pred` were produced from the
# unscaled X_train / X_test, so overwriting those would leave them inconsistent
# with the fitted model and make this cell non-idempotent.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation; the scorer returns negated MSE values, so the sign
# is flipped when reporting.
scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=5,
)
print(f'Cross-Validation MSE: {-scores.mean()}')

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination for the held-out predictions.
r2 = r2_score(
    y_true=y_test,
    y_pred=y_pred,
)
print(f'R-squared: {r2}')

In [None]:
# Actual vs. predicted target values for the test split, drawn on the current Axes.
axes = plt.gca()
axes.scatter(y_test, y_pred)
axes.set_xlabel('Actual Food Waste per Capita')
axes.set_ylabel('Predicted Food Waste per Capita')
axes.set_title('Actual vs. Predicted Values')
plt.show()

In [None]:
# Share of waste assumed to be avoided in this what-if scenario.
reduction_factor = 0.1

# Total waste per row, taken from the cleaned frame so duplicate and invalid
# rows are not counted. All three inputs are tonnes/year columns.
total_waste = (
    cleaned_data['Household estimate (tonnes/year)']
    + cleaned_data['Retail estimate (tonnes/year)']
    + cleaned_data['Food service estimate (tonnes/year)']
)

# Waste that would remain per row after the reduction.
reduced_total_waste = total_waste * (1 - reduction_factor)
# Amount avoided. The previous code summed the *remaining* waste here and
# reported it as the reduction.
total_reduced_waste = (total_waste * reduction_factor).sum()

print(f"\nIf food waste could be reduced by {reduction_factor:.0%}, the total reduction in waste would be: {total_reduced_waste:.2f} tonnes")

In [None]:
# Closing summary, emitted one line at a time.
conclusion_lines = [
    "\nConclusion:",
    "Based on our analysis, the most impactful source for reducing food waste is identified.",
    "Recommendations include targeted programs, technological solutions, and community awareness campaigns.",
    "Predictive modeling indicates that by implementing strategies to reduce food waste by 10%, significant amounts of waste can be avoided.",
]
for line in conclusion_lines:
    print(line)
