In [117]:
# Import libraries
import pandas as pd

In [118]:
# Read the three raw CSV files into their own DataFrames
carbon, greenhouse, factors = map(
    pd.read_csv,
    (
        'data/carbon_footprint_by_product.csv',
        'data/greenhouse_gas_emissions.csv',
        'data/normalizing_factors.csv',
    ),
)

In [119]:
# Summarize column dtypes and non-null counts of the emissions table
greenhouse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Fiscal Year  136 non-null    int64  
 1   Category     136 non-null    object 
 2   Type         136 non-null    object 
 3   Scope        120 non-null    object 
 4   Description  136 non-null    object 
 5   Emissions    109 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 6.5+ KB


In [120]:
# Summarize column dtypes and non-null counts of the product footprint table
carbon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      10 non-null     int64 
 1   Product           10 non-null     object
 2   Baseline Storage  10 non-null     int64 
 3   Carbon Footprint  10 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 448.0+ bytes


In [121]:
# Summarize column dtypes and non-null counts of the normalizing-factors table
factors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Fiscal Year            8 non-null      int64
 1   Revenue                8 non-null      int64
 2   Market Capitalization  8 non-null      int64
 3   Employees              8 non-null      int64
dtypes: int64(4)
memory usage: 384.0 bytes


In [122]:
# Display the full product footprint table (only 10 rows)
carbon

Unnamed: 0,Release Year,Product,Baseline Storage,Carbon Footprint
0,2023,iPhone 15,128,56
1,2022,iPhone 14,128,61
2,2021,iPhone 13,128,64
3,2020,iPhone 12,64,70
4,2019,iPhone 11,64,72
5,2018,iPhone Xs,64,70
6,2017,iPhone X,64,79
7,2017,iPhone 8,64,57
8,2016,iPhone 7,32,56
9,2015,iPhone 6s,32,54


In [123]:
# Display the emissions table; pandas elides the middle rows of this 136-row frame
greenhouse

Unnamed: 0,Fiscal Year,Category,Type,Scope,Description,Emissions
0,2022,Corporate emissions,Gross emissions,Scope 1,"Natural gas, diesel, propane",39700.0
1,2022,Corporate emissions,Gross emissions,Scope 1,Fleet vehicles,12600.0
2,2022,Corporate emissions,Gross emissions,Scope 1,Other (R&D processes & refrigerant leaks),2900.0
3,2022,Corporate emissions,Gross emissions,Scope 2 (market-based),Electricity,0.0
4,2022,Corporate emissions,Gross emissions,Scope 2 (market-based),"Steam, heating, and cooling",3000.0
...,...,...,...,...,...,...
131,2015,Product life cycle emissions,Gross emissions,Scope 3,Manufacturing (purchased goods and services),29600000.0
132,2015,Product life cycle emissions,Gross emissions,Scope 3,Product transportation (upstream and downstream),1300000.0
133,2015,Product life cycle emissions,Gross emissions,Scope 3,Product use (use of sold products),6600000.0
134,2015,Product life cycle emissions,Gross emissions,Scope 3,End-of-life processing,500000.0


In [124]:
# Display the full normalizing-factors table (only 8 rows)
factors

Unnamed: 0,Fiscal Year,Revenue,Market Capitalization,Employees
0,2022,394328,2490,164000
1,2021,365817,2450,154000
2,2020,274515,1720,147000
3,2019,260174,1090,137000
4,2018,265595,830,132000
5,2017,229234,740,123000
6,2016,215639,600,116000
7,2015,233715,580,110000


In [125]:
# Pull out the rows whose 'Scope' or 'Emissions' value is NaN for a closer look
missing_scope = greenhouse.loc[greenhouse['Scope'].isna()]
missing_emissions = greenhouse.loc[greenhouse['Emissions'].isna()]

missing_scope, missing_emissions

(     Fiscal Year                      Category             Type Scope  \
 11          2022           Corporate emissions  Carbon removals   NaN   
 16          2022  Product life cycle emissions  Carbon removals   NaN   
 28          2021           Corporate emissions  Carbon removals   NaN   
 33          2021  Product life cycle emissions  Carbon removals   NaN   
 45          2020           Corporate emissions  Carbon removals   NaN   
 50          2020  Product life cycle emissions  Carbon removals   NaN   
 62          2019           Corporate emissions  Carbon removals   NaN   
 67          2019  Product life cycle emissions  Carbon removals   NaN   
 79          2018           Corporate emissions  Carbon removals   NaN   
 84          2018  Product life cycle emissions  Carbon removals   NaN   
 96          2017           Corporate emissions  Carbon removals   NaN   
 101         2017  Product life cycle emissions  Carbon removals   NaN   
 113         2016           Corporate 

In [126]:
# Label "Carbon removals" rows that have no Scope as "Unknown"
greenhouse.loc[
    greenhouse['Scope'].isnull() & (greenhouse['Type'] == 'Carbon removals'),
    'Scope',
] = 'Unknown'

# Fill missing Emissions values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on greenhouse['Emissions']: that form is chained
# assignment, which raises a FutureWarning in recent pandas and no longer
# updates the DataFrame once Copy-on-Write is enabled.
# NOTE(review): one global median is used for every row regardless of
# Category/Scope — confirm that this imputation is intended.
greenhouse['Emissions'] = greenhouse['Emissions'].fillna(greenhouse['Emissions'].median())

In [127]:
# Count the NaNs remaining in each column after cleaning (all should be 0)
missing_values_after_cleaning = greenhouse.isna().sum()
print(missing_values_after_cleaning)

Fiscal Year    0
Category       0
Type           0
Scope          0
Description    0
Emissions      0
dtype: int64


In [128]:
# Re-check the product footprint table's dtypes and non-null counts before export
carbon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      10 non-null     int64 
 1   Product           10 non-null     object
 2   Baseline Storage  10 non-null     int64 
 3   Carbon Footprint  10 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 448.0+ bytes


In [129]:
# Write each DataFrame to data/ as CSV, leaving out the index column
greenhouse.to_csv(path_or_buf='data/greenhouse_clean.csv', index=False)
carbon.to_csv(path_or_buf='data/carbon_clean.csv', index=False)
factors.to_csv(path_or_buf='data/factors_clean.csv', index=False)