In [2]:
import pandas as pd

In [5]:
# Load the raw product-emissions table; every later cell works on `df`.
raw_csv_path = 'product_emissions.csv'
df = pd.read_csv(raw_csv_path)

In [59]:
# The three "<stage>_percent_total_pcf" columns get identical treatment:
# strip '%', parse as numbers, then fill the gaps with the column mean.
pcf_share_columns = [
    'upstream_percent_total_pcf',
    'operations_percent_total_pcf',
    'downstream_percent_total_pcf',
]

# Step 1: drop every '%' character and parse what is left as a number.
# errors='coerce' turns anything that still cannot be parsed into NaN.
for share_col in pcf_share_columns:
    without_percent_sign = df[share_col].replace('%', '', regex=True)
    df[share_col] = pd.to_numeric(without_percent_sign, errors='coerce')

# Step 2: per-column mean over the non-NaN values, rounded to 2 decimals.
# NOTE: despite its name, `mean_value_total` is the mean of the *downstream*
# column (third entry of `pcf_share_columns`).
mean_value_upstream, mean_value_operations, mean_value_total = (
    round(df[share_col].mean(skipna=True), 2) for share_col in pcf_share_columns
)

# Step 3: replace the remaining NaN entries with that column's rounded mean.
column_fill_values = zip(
    pcf_share_columns,
    (mean_value_upstream, mean_value_operations, mean_value_total),
)
for share_col, fill_value in column_fill_values:
    df[share_col] = df[share_col].fillna(fill_value)

In [60]:
# Standardizing Data Formats
# Cells whose whole value equals a key are rewritten to the mapped spelling;
# every other country value is left untouched.
country_spellings = {"USA": "United States"}
df["country"] = df["country"].replace(country_spellings)

In [61]:
# Removing Duplicates
# When several rows share an "id", only the first occurrence is kept.
df = df.drop_duplicates(subset=["id"], keep="first")

In [62]:
# Round weight_kg to 2 decimal places
rounded_weight_kg = df["weight_kg"].round(decimals=2)
df["weight_kg"] = rounded_weight_kg

In [63]:
# Trim double-quote characters from both ends of each value in the
# industry_group, product_name and company columns. Quotes that sit inside
# a value are not affected by str.strip.
for quoted_col in ("industry_group", "product_name", "company"):
    df[quoted_col] = df[quoted_col].str.strip('"')

In [64]:
import re

def clean_text(text):
    """Remove every character that is not an ASCII letter, digit, or whitespace.

    Parameters
    ----------
    text : str or any
        Value to clean. Anything that is not a ``str`` (for example ``None``
        or the float NaN pandas stores for a missing cell) is returned
        unchanged rather than being passed to ``re.sub``, which would raise
        ``TypeError``.

    Returns
    -------
    str or any
        For a ``str`` input, the same text with all characters outside
        ``a-z``, ``A-Z``, ``0-9`` and whitespace deleted. Interior and
        surrounding whitespace is kept as-is. For any other input, the
        input object itself.
    """
    if not isinstance(text, str):
        # Missing values must survive an element-wise .apply() untouched.
        return text
    # Note: the character class is ASCII-only, so accented letters are removed too.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Run clean_text over every product name, then trim whitespace from both
# ends of the result before writing it back to the column.
cleaned_product_names = df['product_name'].apply(clean_text)
df['product_name'] = cleaned_product_names.str.strip()

In [65]:
# Exporting to CSV
# index=False keeps the DataFrame's row index out of the written file.
clean_csv_path = 'product_emissions_clean.csv'
df.to_csv(clean_csv_path, index=False)