In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd

carbon_emission_data = pd.read_csv('Carbon Emission.csv')

# Define the features and target
X = carbon_emission_data.drop(columns=['CarbonEmission'])
y = carbon_emission_data['CarbonEmission']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64']).columns

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create and evaluate the pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')






Mean Absolute Error: 216.698465
R-squared: 0.921531617056217


In [7]:
# Get the feature names
# Feature importance
feature_importances = clf.named_steps['model'].feature_importances_

print('Feature importances:')
for i, col in enumerate(numerical_cols):
    print(f'{col}: {feature_importances[i]}')

# # lets predict a human based on the following data
#Body Type	Sex	Diet	How Often Shower	Heating Energy Source	Transport	Vehicle Type	Social Activity	Monthly Grocery Bill	Frequency of Traveling by Air	Vehicle Monthly Distance Km	Waste Bag Size	Waste Bag Weekly Count	How Long TV PC Daily Hour	How Many New Clothes Monthly	How Long Internet Daily Hour	Energy efficiency	Recycling	Cooking_With	CarbonEmission

# lets pull a random row from the data
random_row = X.sample()
print(random_row)

# lets predict the carbon emission of the random row
random_row_pred = clf.predict(random_row)
print(f'Predicted Carbon Emission: {random_row_pred[0]}')

# what is the actual carbon emission of the random row
random_row_actual = carbon_emission_data.loc[random_row.index[0], 'CarbonEmission']
print(f'Actual Carbon Emission: {random_row_actual}')



Feature importances:
Monthly Grocery Bill: 0.0103205929518623
Vehicle Monthly Distance Km: 0.37583942884705873
Waste Bag Weekly Count: 0.02480307616219856
How Long TV PC Daily Hour: 0.0057788283664655755
How Many New Clothes Monthly: 0.046105548752750115
How Long Internet Daily Hour: 0.007858178393189861
       Body Type     Sex         Diet How Often Shower Heating Energy Source  \
5314  overweight  female  pescatarian  less frequently                  coal   

     Transport Vehicle Type Social Activity  Monthly Grocery Bill  \
5314   private     electric           never                   254   

     Frequency of Traveling by Air  Vehicle Monthly Distance Km  \
5314                    frequently                         8180   

     Waste Bag Size  Waste Bag Weekly Count  How Long TV PC Daily Hour  \
5314          large                       3                          8   

      How Many New Clothes Monthly  How Long Internet Daily Hour  \
5314                            17        