In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Read the raw materials CSV; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="../data/raw/eco_packaging_materials.csv")

# Show the first five rows as the cell's output.
df.head(5)


Unnamed: 0,material_id,material_name,material_type,strength_score,weight_capacity_kg,biodegradability_score,recyclability_percent,co2_emission_kg,cost_per_kg,water_resistance,food_safe,industry_usage,sustainability_rating
0,1,Corrugated Cardboard,Paper-Based,7.5,15.0,0.85,90,0.7,1.2,Medium,True,E-commerce,A
1,2,Molded Pulp,Pulp-Based,6.8,10.0,0.95,95,0.6,1.5,Low,True,Electronics,A
2,3,PLA (Corn Starch),Plant-Based Polymer,8.2,20.0,0.7,0,1.4,2.8,High,True,Food,B
3,4,Glass Jar (Virgin),Glass,9.5,30.0,0.0,100,1.8,3.5,High,True,Beverages,B
4,5,Aluminum Foil,Metal,5.0,2.0,0.0,95,2.2,4.0,High,True,Food,C


In [3]:
# Structural overview of the loaded frame: dimensions first, then the column labels.
frame_shape = df.shape
print("Shape of dataset:", frame_shape)

column_index = df.columns
print("\nColumn names:\n", column_index)

# info() writes its own report (dtypes and non-null counts), so only the heading is printed here.
print("\nDataset Info:")
df.info()



Shape of dataset: (100, 13)

Column names:
 Index(['material_id', 'material_name', 'material_type', 'strength_score',
       'weight_capacity_kg', 'biodegradability_score', 'recyclability_percent',
       'co2_emission_kg', 'cost_per_kg', 'water_resistance', 'food_safe',
       'industry_usage', 'sustainability_rating'],
      dtype='object')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   material_id             100 non-null    int64  
 1   material_name           100 non-null    object 
 2   material_type           100 non-null    object 
 3   strength_score          100 non-null    float64
 4   weight_capacity_kg      100 non-null    float64
 5   biodegradability_score  100 non-null    float64
 6   recyclability_percent   100 non-null    int64  
 7   co2_emission_kg         100 non-null    float64
 8   c

In [4]:
# Count the missing entries in every column; the resulting Series is the cell's displayed value.
print("Missing values per column:")
df.isna().sum()


Missing values per column:


material_id               0
material_name             0
material_type             0
strength_score            0
weight_capacity_kg        0
biodegradability_score    0
recyclability_percent     0
co2_emission_kg           0
cost_per_kg               0
water_resistance          0
food_safe                 0
industry_usage            0
sustainability_rating     0
dtype: int64

In [5]:
# Standardise the headers in two steps: lowercase every label, then turn spaces into underscores.
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels.str.replace(" ", "_")

# Display the resulting labels.
df.columns


Index(['material_id', 'material_name', 'material_type', 'strength_score',
       'weight_capacity_kg', 'biodegradability_score', 'recyclability_percent',
       'co2_emission_kg', 'cost_per_kg', 'water_resistance', 'food_safe',
       'industry_usage', 'sustainability_rating'],
      dtype='object')

In [6]:
# Impute missing values column by column.
#
# Numeric columns: replace NaN with that column's mean. Passing the Series of
# means makes fillna match each mean to its column by label.
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means)

# Text columns: replace NaN with the "Unknown" placeholder.
# The fill is restricted to object-dtype columns on purpose. A blanket
# df.fillna("Unknown") would also write strings into any numeric column that is
# still NaN after the step above (e.g. an all-NaN column, whose mean is NaN),
# silently turning it into a mixed-type object column.
text_columns = df.select_dtypes(include="object").columns
df = df.fillna({column: "Unknown" for column in text_columns})


In [7]:
# Report the number of fully duplicated rows, drop them (keeping the first occurrence), and re-check.
# Rows are compared across all columns, so rows that differ in any column are not counted as duplicates.
duplicates_before = df.duplicated().sum()
print("Duplicates before:", duplicates_before)

df.drop_duplicates(keep="first", inplace=True)

duplicates_after = df.duplicated().sum()
print("Duplicates after:", duplicates_after)


Duplicates before: 0
Duplicates after: 0


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,material_id,strength_score,weight_capacity_kg,biodegradability_score,recyclability_percent,co2_emission_kg,cost_per_kg
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,50.5,7.006,21.602,0.6281,63.7,0.941,3.088
std,29.011492,2.127351,28.670676,0.406278,41.836744,0.654911,3.136532
min,1.0,1.0,0.1,0.0,0.0,0.1,0.5
25%,25.75,5.5,5.0,0.1,15.0,0.5,1.5
50%,50.5,7.45,13.0,0.85,90.0,0.8,2.35
75%,75.25,8.8,25.0,0.96,100.0,1.2,3.5
max,100.0,9.9,200.0,1.0,100.0,3.5,25.0


In [10]:
# Derived feature: ratio of co2_emission_kg to weight_capacity_kg, computed row by row.
# A weight_capacity_kg of 0 would yield inf (or NaN for 0/0) rather than raising.
df["co2_impact_index"] = df["co2_emission_kg"].div(df["weight_capacity_kg"])


In [11]:
# Derived feature: ratio of weight_capacity_kg to cost_per_kg, computed row by row.
# A cost_per_kg of 0 would yield inf (or NaN for 0/0) rather than raising.
df["cost_efficiency_index"] = df["weight_capacity_kg"].div(df["cost_per_kg"])


In [12]:
# Composite score: biodegradability_score + recyclability (percent rescaled to a fraction)
# + cost_efficiency_index, summed without weights.
# NOTE(review): recyclability is divided by 100 but cost_efficiency_index is added as a raw,
# unbounded ratio, so it can outweigh the other two terms - confirm this weighting is intended.
recyclable_fraction = df["recyclability_percent"] / 100
df["material_suitability_score"] = (
    df["biodegradability_score"]
    .add(recyclable_fraction)
    .add(df["cost_efficiency_index"])
)


In [13]:
# Preview the first rows of the three derived columns only.
engineered_columns = [
    "co2_impact_index",
    "cost_efficiency_index",
    "material_suitability_score",
]
df[engineered_columns].head()


Unnamed: 0,co2_impact_index,cost_efficiency_index,material_suitability_score
0,0.046667,12.5,14.25
1,0.06,6.666667,8.566667
2,0.07,7.142857,7.842857
3,0.06,8.571429,9.571429
4,1.1,0.5,1.45


In [14]:
# Write the DataFrame (including the derived index columns) to CSV without the row index.
output_path = Path("../data/processed/cleaned_materials_week2.csv")

# to_csv raises OSError when the target directory is missing (e.g. on a fresh checkout),
# so create it first; exist_ok makes re-running the cell safe.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print("✅ Week 2 cleaned dataset saved")


✅ Week 2 cleaned dataset saved
