In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Read the supply-chain table from disk
df = pd.read_csv("green_supply_chain.csv")

# 1. Report how many values are missing in each column
missing_per_column = df.isna().sum()
print("Missing values per column:\n", missing_per_column)

# 2. Keep only columns whose non-null count reaches len(df) - threshold,
#    i.e. discard columns in which more than 30% of the rows are missing
threshold = 0.3 * len(df)
min_filled_cells = len(df) - threshold
df = df.dropna(axis=1, thresh=min_filled_cells)

# 3. Keep only rows holding at least int(0.5 * n_columns) non-null values
min_filled_columns = int(df.shape[1] * 0.5)
df = df.dropna(thresh=min_filled_columns)

# 4. Fill whatever gaps remain in the numeric columns using the 'mean' strategy
num_cols = df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df[num_cols])
df[num_cols] = imputed_values

# 5. Outlier detection and removal using IQR method
def remove_outliers_iqr(dataframe, columns):
    """Return the rows of ``dataframe`` that are not IQR outliers in any of ``columns``.

    For each column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``
    (both inclusive). Every fence is computed from the unfiltered input, so the
    result does not depend on the order of ``columns``. Filtering column by
    column would shift the quartiles of later columns after each removal and
    could discard rows that are not outliers in the original data.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input table; it is not modified.
    columns : iterable of column labels
        Columns to screen. Each must support ``Series.quantile``.

    Returns
    -------
    pandas.DataFrame
        Rows lying within the fences of every listed column. A row with NaN in
        a listed column is dropped, because NaN fails both comparisons. When
        ``columns`` is empty, ``dataframe`` itself is returned.
    """
    keep = None
    for col in columns:
        values = dataframe[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        inside = (values >= lower_bound) & (values <= upper_bound)
        # Accumulate one mask over the original rows instead of shrinking the frame.
        keep = inside if keep is None else keep & inside
    if keep is None:
        return dataframe
    return dataframe[keep]

# Apply the IQR filter to every numeric column selected earlier
df = remove_outliers_iqr(dataframe=df, columns=num_cols)

# Report the resulting shape and any nulls that remain
final_shape = df.shape
remaining_missing = df.isna().sum()
print("Shape after preprocessing:", final_shape)
print("Remaining missing values:\n", remaining_missing)


Missing values per column:
 ID                          0
Product_Type                0
Raw_Material_Usage_kg       0
Energy_Consumption_kWh      0
Waste_Generated_kg          0
Transport_Distance_km       0
CO2_Emissions_kg            0
Manufacturing_Energy_kWh    0
Renewable_Energy            0
Cost                        0
Delivery_Time_days          0
Sustainability_Score        0
dtype: int64
Shape after preprocessing: (1000, 12)
Remaining missing values:
 ID                          0
Product_Type                0
Raw_Material_Usage_kg       0
Energy_Consumption_kWh      0
Waste_Generated_kg          0
Transport_Distance_km       0
CO2_Emissions_kg            0
Manufacturing_Energy_kWh    0
Renewable_Energy            0
Cost                        0
Delivery_Time_days          0
Sustainability_Score        0
dtype: int64


In [None]:
import pandas as pd

# Load your dataset
# NOTE: this rebinds `df` to the contents of the CSV file as read from disk.
df = pd.read_csv('green_supply_chain.csv')

# Select every numeric column. 'number' covers all integer and float widths,
# not only float64/int64, so no numeric column is skipped by the outlier scan.
numeric_cols = df.select_dtypes(include='number').columns

# Function to detect outliers using IQR
def detect_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 0.25 and 0.75
    quantiles of the column. All columns of the flagged rows are returned.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return df[too_low | too_high]

# Scan each numeric column and print either its flagged rows or an all-clear line
for col in numeric_cols:
    outliers = detect_outliers_iqr(df, col)
    if outliers.empty:
        print(f"\nNo outliers detected in '{col}'.")
        continue
    print(f"\nOutliers detected in '{col}':")
    print(outliers[[col]])