In [4]:
!pip install tensorflow_decision_forests



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow_decision_forests as tfdf
from sklearn.metrics import mean_absolute_error, r2_score

In [6]:
# Load the dataset
file_path = 'Carbon Emission.csv'
df = pd.read_csv(file_path)

# Check loading worked
print(df.head())
print(df.info())

# Check for missing values
print(df.isnull().sum())

    Body Type     Sex         Diet How Often Shower Heating Energy Source  \
0  overweight  female  pescatarian            daily                  coal   
1       obese  female   vegetarian  less frequently           natural gas   
2  overweight    male     omnivore  more frequently                  wood   
3  overweight    male     omnivore      twice a day                  wood   
4       obese  female   vegetarian            daily                  coal   

      Transport Vehicle Type Social Activity  Monthly Grocery Bill  \
0        public          NaN           often                   230   
1  walk/bicycle          NaN           often                   114   
2       private       petrol           never                   138   
3  walk/bicycle          NaN       sometimes                   157   
4       private       diesel           often                   266   

  Frequency of Traveling by Air  Vehicle Monthly Distance Km Waste Bag Size  \
0                    frequently      

In [7]:
# Based on previous work we know Vehicle has null values
nullVehicleType = df.loc[df['Vehicle Type'].isnull()]
print(nullVehicleType['Transport'].unique())

# Updating Vehicle
dfNonNull = df.fillna('None')
print(dfNonNull.isnull().sum())

# Display the first few rows after handling missing values
print(dfNonNull.head())

['public' 'walk/bicycle']
Body Type                        0
Sex                              0
Diet                             0
How Often Shower                 0
Heating Energy Source            0
Transport                        0
Vehicle Type                     0
Social Activity                  0
Monthly Grocery Bill             0
Frequency of Traveling by Air    0
Vehicle Monthly Distance Km      0
Waste Bag Size                   0
Waste Bag Weekly Count           0
How Long TV PC Daily Hour        0
How Many New Clothes Monthly     0
How Long Internet Daily Hour     0
Energy efficiency                0
Recycling                        0
Cooking_With                     0
CarbonEmission                   0
dtype: int64
    Body Type     Sex         Diet How Often Shower Heating Energy Source  \
0  overweight  female  pescatarian            daily                  coal   
1       obese  female   vegetarian  less frequently           natural gas   
2  overweight    male     omni

In [8]:
# I also could not get pipeline to play nice with random forests. Re-used this from KNN
# Map each categorical column to numeric values
dfNonNull['Body Type'] = dfNonNull['Body Type'].map({'overweight': 0, 'obese': 1, 'underweight': 2, 'normal': 3})
dfNonNull['Sex'] = dfNonNull['Sex'].map({'female': 0, 'male': 1})
dfNonNull['Diet'] = dfNonNull['Diet'].map({'pescatarian': 0, 'vegetarian': 1, 'omnivore': 2, 'vegan': 3})
dfNonNull['How Often Shower'] = dfNonNull['How Often Shower'].map({'daily': 0, 'less frequently': 1, 'more frequently': 2, 'twice a day': 3})
dfNonNull['Heating Energy Source'] = dfNonNull['Heating Energy Source'].map({'coal': 0, 'natural gas': 1, 'wood': 2, 'electricity': 3})
dfNonNull['Transport'] = dfNonNull['Transport'].map({'public': 0, 'walk/bicycle': 1, 'private': 2})
dfNonNull['Vehicle Type'] = dfNonNull['Vehicle Type'].map({'None': 0, 'petrol': 1, 'diesel': 2, 'hybrid': 3, 'lpg': 4, 'electric': 5})
dfNonNull['Social Activity'] = dfNonNull['Social Activity'].map({'often': 0, 'never': 1, 'sometimes': 2})
dfNonNull['Frequency of Traveling by Air'] = dfNonNull['Frequency of Traveling by Air'].map({'frequently': 0, 'rarely': 1, 'never': 2, 'very frequently': 3})
dfNonNull['Waste Bag Size'] = dfNonNull['Waste Bag Size'].map({'large': 0, 'extra large': 1, 'small': 2, 'medium': 3})
dfNonNull['Energy efficiency'] = dfNonNull['Energy efficiency'].map({'No': 0, 'Sometimes': 1, 'Yes': 2})

# Display the dataframe info after encoding
print(dfNonNull.info())
print(dfNonNull.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Body Type                      10000 non-null  int64 
 1   Sex                            10000 non-null  int64 
 2   Diet                           10000 non-null  int64 
 3   How Often Shower               10000 non-null  int64 
 4   Heating Energy Source          10000 non-null  int64 
 5   Transport                      10000 non-null  int64 
 6   Vehicle Type                   10000 non-null  int64 
 7   Social Activity                10000 non-null  int64 
 8   Monthly Grocery Bill           10000 non-null  int64 
 9   Frequency of Traveling by Air  10000 non-null  int64 
 10  Vehicle Monthly Distance Km    10000 non-null  int64 
 11  Waste Bag Size                 10000 non-null  int64 
 12  Waste Bag Weekly Count         10000 non-null  int64 
 13  Ho

In [9]:
# Drop recycling and cooking
dfNonNull = dfNonNull.drop('Recycling', axis=1)
dfNonNull = dfNonNull.drop('Cooking_With', axis=1)
print(dfNonNull.head())

   Body Type  Sex  Diet  How Often Shower  Heating Energy Source  Transport  \
0          0    0     0                 0                      0          0   
1          1    0     1                 1                      1          1   
2          0    1     2                 2                      2          2   
3          0    1     2                 3                      2          1   
4          1    0     1                 0                      0          2   

   Vehicle Type  Social Activity  Monthly Grocery Bill  \
0             0                0                   230   
1             0                0                   114   
2             1                1                   138   
3             0                2                   157   
4             2                0                   266   

   Frequency of Traveling by Air  Vehicle Monthly Distance Km  Waste Bag Size  \
0                              0                          210               0   
1             

In [10]:
# Split into features and trget
X = dfNonNull.drop(columns=['CarbonEmission'])
y = dfNonNull['CarbonEmission']

In [11]:
# Using test train split here but I am thinking there is a better way
# the TFDF model does not play nice with these datasets directly.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# tfdfd does not simply work with the pandas dataframe that train test split retuns
# we are making a copy of the dataframes and adding data frame and adding the target column to them
# This process feels like this could be simplified somehow
train_df = X_train.copy()
train_df['CarbonEmission'] = y_train

# We have to do something similar with the test set
test_df = X_test.copy()
test_df['CarbonEmission'] = y_test


In [12]:
# CReate the Model
model = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION)


Use /tmp/tmpzp7_e6ac as temporary training directory


In [13]:
# Converting to the dataset that tfdf uses
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="CarbonEmission", task=tfdf.keras.Task.REGRESSION)

# Fit the model on the training data
model.fit(train_ds)





Reading training dataset...
Training dataset read in 0:00:11.731364. Found 8000 examples.
Training model...
Model trained in 0:00:24.313127
Compiling model...
Model compiled.


<tf_keras.src.callbacks.History at 0x796c28109ab0>

In [14]:
# For the testing data, we also have to convert to a format that the model can use
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label="CarbonEmission", task=tfdf.keras.Task.REGRESSION)
# Compile the model with Mean Absolute Error (MAE) as the evaluation metric
model.compile(metrics=["mae"])

# Evaluate the model using MAE
evaluation = model.evaluate(test_ds, return_dict=True)
print(evaluation)
print(f"Mean Absolute Error: {evaluation['mae']}")





{'loss': 0.0, 'mae': 216.6640625}
Mean Absolute Error: 216.6640625


In [15]:
# Here we are getting the most important features
most_important_features = model.make_inspector().variable_importances()["NUM_AS_ROOT"]

# Print the ffeatures
print("Most important features:")
for attribute, importance in most_important_features:
    print(f"{attribute}: {importance}")

Most important features:
"Vehicle_Monthly_Distance_Km" (1; #13): 138.0
"Transport" (1; #12): 64.0
"Vehicle_Type" (1; #14): 49.0
"Frequency_of_Traveling_by_Air" (1; #3): 26.0
"Body_Type" (1; #0): 11.0
"How_Many_New_Clothes_Monthly" (1; #7): 9.0
"Sex" (1; #10): 2.0
"Waste_Bag_Weekly_Count" (1; #16): 1.0


In [16]:
# Step 11: Test the model on multiple random rows
num_tests = 10  # Number of random rows to test
for _ in range(num_tests):
    # Select a random row from the dataset
    random_row = X.sample()

    # Convert the random row to a TensorFlow dataset
    random_row_ds = tfdf.keras.pd_dataframe_to_tf_dataset(random_row, task=tfdf.keras.Task.REGRESSION)

    # Predict the carbon emission for the selected row
    random_row_pred = model.predict(random_row_ds)
    print(f'Predicted Carbon Emission: {random_row_pred[0][0]}')

    # Get the actual carbon emission for the selected row
    random_row_actual = dfNonNull.loc[random_row.index[0], 'CarbonEmission']
    print(f'Actual Carbon Emission: {random_row_actual}')
    print()





Predicted Carbon Emission: 2199.768798828125
Actual Carbon Emission: 1765





Predicted Carbon Emission: 1626.1064453125
Actual Carbon Emission: 1733





Predicted Carbon Emission: 2527.25634765625
Actual Carbon Emission: 2401





Predicted Carbon Emission: 2734.9892578125
Actual Carbon Emission: 2837





Predicted Carbon Emission: 1899.3165283203125
Actual Carbon Emission: 1899





Predicted Carbon Emission: 2165.0205078125
Actual Carbon Emission: 2173





Predicted Carbon Emission: 3480.8125
Actual Carbon Emission: 3493





Predicted Carbon Emission: 1636.6202392578125
Actual Carbon Emission: 1678





Predicted Carbon Emission: 2751.9189453125
Actual Carbon Emission: 2940





Predicted Carbon Emission: 987.2037963867188
Actual Carbon Emission: 775



In [17]:
# Lets test the model on the entire test set
y_pred = model.predict(test_ds)
y_pred = np.array(y_pred)
y_pred = y_pred.flatten()

# Calculate the Mean Absolute Error (MAE) and R^2 score
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R^2 Score: {r2}")



# Save the model
model.save("decision_forest_model")

Mean Absolute Error: 216.6640626220703
R^2 Score: 0.9197274006013679
