In [None]:
###########################
import pandas as pd

# Load data (adjust the path as needed)
data = pd.read_excel(r"C:\Users\HP\Downloads\Updated_CO2.xlsx")

product_col = "Product Name"
material_col = "Raw Material"
mass_col = "Illustrative Mass (kg - AD)"
ef_col = "Illustrative EF (kg CO2​e/kg)"

results = []
all_max_materials = 0

# First, find the maximum number of raw materials for any product
for product, grp in data.groupby(product_col):
    all_max_materials = max(all_max_materials, len(grp))

# Main transformation loop
for product, grp in data.groupby(product_col):
    row = {
        "Product": product,
        "Raw Materials": ", ".join(grp[material_col].astype(str))
    }
    # List for sum
    emissions = []
    # Add mass and EF for each material, padding with None if fewer materials
    for idx, (_, r) in enumerate(grp.iterrows(), start=1):
        row[f"mass_{idx}"] = r[mass_col]
        row[f"ef_{idx}"] = r[ef_col]
        emissions.append(r[mass_col] * r[ef_col])
    # Padding for missing materials
    for idx in range(len(grp)+1, all_max_materials+1):
        row[f"mass_{idx}"] = None
        row[f"ef_{idx}"] = None
    # Add formula column (sum of emission per material)
    row["Total Emissions"] = sum(emissions)
    results.append(row)

# Build DataFrame
table = pd.DataFrame(results)

table.to_csv('table2.csv')


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Assuming 'table' is the DataFrame as above
# If reading from file: table = pd.read_excel('reshaped_products.xlsx')

# Select only mass and ef columns for features
feature_cols = [col for col in table.columns if col.startswith('mass_') or col.startswith('ef_')]
X = table[feature_cols].fillna(0)  # Fill NaN with 0
y = table['Total Emissions']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
print("MAE:", mean_absolute_error(y_test, y_pred))


MAE: 19.461323888888884


In [33]:
from sklearn.metrics import r2_score

print("R^2 score (accuracy):", r2_score(y_test, y_pred))


R^2 score (accuracy): 0.7888498602723957


In [None]:
#######################
df = pd.read_csv(r'C:\Users\HP\Downloads\carbon\diversified_ecommerce_dataset.csv')
df


Unnamed: 0,Product ID,Product Name,Category,Price,Discount,Tax Rate,Stock Level,Supplier ID,Customer Age Group,Customer Location,Customer Gender,Shipping Cost,Shipping Method,Return Rate,Seasonality,Popularity Index
0,P6879,Jacket,Apparel,53.85,5,15,150,S535,35-44,"New York, USA",Male,23.32,Standard,4.49,Yes,56
1,P5132,Camera,Electronics,761.26,10,15,224,S583,25-34,"London, UK",Female,20.88,Overnight,16.11,No,79
2,P2941,Sneakers,Footwear,1756.76,5,8,468,S118,25-34,"Tokyo, Japan",Non-Binary,16.43,Standard,4.93,No,40
3,P8545,Cookbooks,Books,295.24,10,15,25,S104,18-24,"Paris, France",Female,27.49,Standard,1.31,No,93
4,P4594,Camera,Electronics,832.00,10,12,340,S331,55+,"Tokyo, Japan",Male,45.93,Overnight,4.37,No,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,P1024,Graphic Novels,Books,1802.12,5,10,340,S865,55+,"Sydney, Australia",Non-Binary,6.21,Express,10.87,No,21
999996,P2259,Biographies,Books,1843.11,10,15,189,S176,45-54,"Paris, France",Non-Binary,9.02,Overnight,19.19,No,59
999997,P1110,Formal Shoes,Footwear,1186.56,25,15,482,S530,18-24,"Berlin, Germany",Male,42.85,Express,8.77,Yes,12
999998,P5954,Jeans,Apparel,803.06,25,10,145,S818,55+,"Dubai, UAE",Non-Binary,22.54,Standard,11.27,Yes,44


In [None]:

######
new_df = df[['Category','Product Name','Popularity Index','Shipping Method']]
df.sort_values(by='Popularity Index', ascending=False)['Product Name']

296747             Flats
437087         Textbooks
237353            Comics
177657      Hiking Shoes
812176    Gaming Console
               ...      
308267             Shirt
308262            Laptop
112891         Magazines
910429    Vacuum Cleaner
376036     Running Shoes
Name: Product Name, Length: 1000000, dtype: object

In [None]:
###########################
join_1 = pd.read_csv(r"C:\Users\HP\Downloads\product_raw_materials.csv")
join_1.head()

Unnamed: 0,Product,Raw Materials,Emission Source Category
0,Jacket,"Cotton, Polyester, Nylon, Leather, Metal Zippe...",Textile manufacturing
1,Camera,"Aluminum, Copper, Glass, Plastic, Lithium, Sil...",Electronics manufacturing
2,Sneakers,"Rubber, EVA Foam, Polyester, Leather, Adhesives",Footwear production
3,Cookbooks,"Paper (Wood Pulp), Ink, Glue, Coatings",Paper & printing industry
4,Non-Fiction,"Paper (Wood Pulp), Ink, Glue, Coatings",Paper & printing industry


In [None]:
##############################
join_1.rename(columns={
    'Product': 'Product Name',
    
}, inplace=True)



In [None]:
################################
df_final = new_df.merge(
    join_1,
    on='Product Name',  # The common column used for the join
    how='left'         # Keeps all rows from the left table (df_reference)
)

In [42]:
df_final.to_csv('dataset.csv')

In [None]:
#########################
df_final.head(10)

Unnamed: 0,Category,Product Name,Popularity Index,Shipping Method,Raw Materials,Emission Source Category
0,Apparel,Jacket,56,Standard,"Cotton, Polyester, Nylon, Leather, Metal Zippe...",Textile manufacturing
1,Electronics,Camera,79,Overnight,"Aluminum, Copper, Glass, Plastic, Lithium, Sil...",Electronics manufacturing
2,Footwear,Sneakers,40,Standard,"Rubber, EVA Foam, Polyester, Leather, Adhesives",Footwear production
3,Books,Cookbooks,93,Standard,"Paper (Wood Pulp), Ink, Glue, Coatings",Paper & printing industry
4,Electronics,Camera,56,Overnight,"Aluminum, Copper, Glass, Plastic, Lithium, Sil...",Electronics manufacturing
5,Books,Non-Fiction,91,Express,"Paper (Wood Pulp), Ink, Glue, Coatings",Paper & printing industry
6,Footwear,Running Shoes,41,Overnight,"Rubber, EVA Foam, Polyester, Leather, Adhesives",Footwear production
7,Home Appliances,Blender,60,Standard,"Steel, Plastic, Copper, Aluminum, Glass",Appliance manufacturing
8,Books,Fiction,90,Express,"Paper (Wood Pulp), Ink, Glue, Coatings",Paper & printing industry
9,Footwear,Heels,96,Overnight,"Leather, Rubber, Plastic, Adhesives",Footwear production


In [None]:
########################
new_trim = pd.read_csv('dataset.csv')
new_trim.drop('Raw Materials',axis=1,inplace =True)
new_trim.to_csv('table1.csv', index=False)

In [None]:
################
table1_1 = pd.read_csv('table2.csv')
table1_1.rename(columns={'Product' : 'Product Name'},inplace=True)
table1_1.to_csv('table1-1.csv',index=False)

In [None]:
#######################
import pandas as pd

# Load your two tables
# Example if you have CSV files:
table1 = pd.read_csv('table1.csv')  # first table: categories, popularity, etc.
table2 = pd.read_csv('table1-1.csv')  # second table: raw materials, masses, emissions

# Example: If already in DataFrames:
# Make sure 'Product Name' columns match exactly (no extra spaces, same case)
# Optionally, strip spaces:
table1['Product Name'] = table1['Product Name'].str.strip()
table2['Product Name'] = table2['Product Name'].str.strip()

# Perform the merge
merged = pd.merge(
    table2,
    table1,
    on='Product Name',
    how='left'  # Use 'left' to keep all rows/columns from second table
)
merged.to_csv('processed.csv',index=  False)




Unnamed: 0,Unnamed: 0_x,Product Name,Raw Materials,mass_1,ef_1,mass_2,ef_2,mass_3,ef_3,mass_4,...,mass_6,ef_6,mass_7,ef_7,Total Emissions,Unnamed: 0_y,Category,Popularity Index,Shipping Method,Emission Source Category
0,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,69,Home Appliances,2,Standard,Appliance manufacturing
1,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,119,Home Appliances,36,Standard,Appliance manufacturing
2,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,146,Home Appliances,62,Express,Appliance manufacturing
3,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,260,Home Appliances,18,Standard,Appliance manufacturing
4,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,321,Home Appliances,33,Overnight,Appliance manufacturing
5,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,356,Home Appliances,75,Standard,Appliance manufacturing
6,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,358,Home Appliances,75,Standard,Appliance manufacturing
7,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,450,Home Appliances,3,Express,Appliance manufacturing
8,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,530,Home Appliances,55,Standard,Appliance manufacturing
9,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,566,Home Appliances,5,Overnight,Appliance manufacturing


In [19]:
merged.head(10)

Unnamed: 0,Unnamed: 0_x,Product Name,Raw Materials,mass_1,ef_1,mass_2,ef_2,mass_3,ef_3,mass_4,...,mass_6,ef_6,mass_7,ef_7,Total Emissions,Unnamed: 0_y,Category,Popularity Index,Shipping Method,Emission Source Category
0,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,69,Home Appliances,2,Standard,Appliance manufacturing
1,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,119,Home Appliances,36,Standard,Appliance manufacturing
2,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,146,Home Appliances,62,Express,Appliance manufacturing
3,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,260,Home Appliances,18,Standard,Appliance manufacturing
4,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,321,Home Appliances,33,Overnight,Appliance manufacturing
5,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,356,Home Appliances,75,Standard,Appliance manufacturing
6,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,358,Home Appliances,75,Standard,Appliance manufacturing
7,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,450,Home Appliances,3,Express,Appliance manufacturing
8,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,530,Home Appliances,55,Standard,Appliance manufacturing
9,0,Air Conditioner,"Steel, Copper, Aluminum, Plastic, HFCs (Refrig...",50.0,2.0,5.0,6.5,15.0,17.0,10.0,...,,,,,443.5,566,Home Appliances,5,Overnight,Appliance manufacturing


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load your data
df = pd.read_csv('processed.csv')

# List feature columns (all mass and ef columns)
feature_cols = [col for col in df.columns if col.startswith('mass_') or col.startswith('ef_')]
# Prepare input (X) and target (y)
X = df[feature_cols].fillna(0) # fill NaN with 0 for missing materials
import numpy as np

np.random.seed(42)
noise = np.random.normal(0, 10, size=len(df))  # mean=0, stdev=10
df['Noisy Emissions'] = df['Total Emissions'] + noise

y = df['Noisy Emissions']

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build and train model
model = RandomForestRegressor(random_state=42,)
model.fit(X_train, y_train)

# Make predictions and print evaluation metrics
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))


MAE: 7.968809630865887
R^2: 0.9893692496822023


In [25]:
df['Product Name'].unique()


array(['Air Conditioner', 'Biographies', 'Blender', 'Boots', 'Camera',
       'Comics', 'Cookbooks', 'Dishwasher', 'Dress', 'Fiction', 'Flats',
       'Formal Shoes', 'Gaming Console', 'Graphic Novels', 'Headphones',
       'Heels', 'Hiking Shoes', 'Jacket', 'Jeans', 'Laptop', 'Magazines',
       'Microwave', 'Monitor', 'Non-Fiction', 'Refrigerator',
       'Running Shoes', 'Sandals', 'Shirt', 'Shorts', 'Skirt', 'Slippers',
       'Smartphone', 'Smartwatch', 'Sneakers', 'Socks', 'Speaker',
       'Sweater', 'T-shirt', 'Tablet', 'Textbooks', 'Toaster',
       'Vacuum Cleaner', 'Washing Machine'], dtype=object)

In [None]:
###################################
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load your data
df = pd.read_csv('processed.csv')

# Noise ranges per product (min, max)
noise_ranges = {
    "Air Conditioner": (700, 900),
    "Biographies": (1, 5),
    "Blender": (60, 100),
    "Boots": (15, 25),
    "Camera": (60, 100),
    "Comics": (1, 3),
    "Cookbooks": (1, 3),
    "Dishwasher": (500, 700),
    "Dress": (10, 20),
    "Fiction": (1, 5),
    "Flats": (10, 20),
    "Formal Shoes": (10, 20),
    "Gaming Console": (150, 250),
    "Graphic Novels": (1, 3),
    "Headphones": (50, 90),
    "Heels": (10, 20),
    "Hiking Shoes": (15, 25),
    "Jacket": (15, 25),
    "Jeans": (15, 25),
    "Laptop": (200, 300),
    "Magazines": (1, 3),
    "Microwave": (150, 250),
    "Monitor": (200, 300),
    "Non-Fiction": (1, 5),
    "Refrigerator": (600, 800),
    "Running Shoes": (10, 20),
    "Sandals": (5, 15),
    "Shirt": (10, 20),
    "Shorts": (10, 20),
    "Skirt": (10, 20),
    "Slippers": (5, 15),
    "Smartphone": (55, 85),
    "Smartwatch": (55, 85),
    "Sneakers": (10, 20),
    "Socks": (5, 10),
    "Speaker": (100, 150),
    "Sweater": (10, 20),
    "T-shirt": (5, 15),
    "Tablet": (150, 250),
    "Textbooks": (2, 5),
    "Toaster": (60, 100),
    "Vacuum Cleaner": (150, 250),
    "Washing Machine": (500, 700)
}

np.random.seed(42)

# Add noise based on Product Name
import numpy as np

def add_small_noise(row):
    min_noise, max_noise = noise_ranges.get(row['Product Name'], (0, 0))
    noise = np.random.uniform(min_noise * 0.1, max_noise * 0.1)  # Use only 10% of old range
    sign = np.random.choice([-1, 1])
    return row['Total Emissions'] + sign * noise

df['Noisy Emissions'] = df.apply(add_small_noise, axis=1)


# Prepare features and target
feature_cols = [col for col in df.columns if col.startswith('mass_') or col.startswith('ef_')]
X = df[feature_cols].fillna(0)
y = df['Noisy Emissions']

# Train/test split and fit model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics
print("MAE:", mean_absolute_error(y_test, y_pred)) 
print("R^2:", r2_score(y_test, y_pred)) 


MAE: 11.658889042109157
R^2: 0.945382625722169


In [None]:
##################
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# (X and y as before...)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = RandomForestRegressor(random_state=42)

# R^2 score across folds
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
# MAE across folds (note: 'neg_mean_absolute_error' returns negative values)
mae_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_absolute_error')

print("KFold R^2 scores:", cv_scores)
print("Mean R^2:", np.mean(cv_scores))
print("KFold MAE scores:", -mae_scores)
print("Mean MAE:", -np.mean(mae_scores))



KFold R^2 scores: [0.94538237 0.94472073 0.94484964 0.94488407 0.94511074]
Mean R^2: 0.9449895120723312
KFold MAE scores: [11.65888117 11.66073376 11.80618503 11.81438008 11.62725315]
Mean MAE: 11.713486638861983
