In [68]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Read the carbon emission dataset from its CSV file (path is relative to this notebook)
csv_path = '../dataset/Carbon Emission.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows
df.head()


Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743


In [69]:

# Count the missing (NaN) entries in every column
df.isna().sum()


Body Type                           0
Sex                                 0
Diet                                0
How Often Shower                    0
Heating Energy Source               0
Transport                           0
Vehicle Type                     6721
Social Activity                     0
Monthly Grocery Bill                0
Frequency of Traveling by Air       0
Vehicle Monthly Distance Km         0
Waste Bag Size                      0
Waste Bag Weekly Count              0
How Long TV PC Daily Hour           0
How Many New Clothes Monthly        0
How Long Internet Daily Hour        0
Energy efficiency                   0
Recycling                           0
Cooking_With                        0
CarbonEmission                      0
dtype: int64

In [70]:
# Drop 'Vehicle Type': 6721 of its values are missing (see the null counts above).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['Vehicle Type'], errors='ignore')

In [71]:
# Inspect column dtypes and non-null counts after removing 'Vehicle Type'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Body Type                      10000 non-null  object
 1   Sex                            10000 non-null  object
 2   Diet                           10000 non-null  object
 3   How Often Shower               10000 non-null  object
 4   Heating Energy Source          10000 non-null  object
 5   Transport                      10000 non-null  object
 6   Social Activity                10000 non-null  object
 7   Monthly Grocery Bill           10000 non-null  int64 
 8   Frequency of Traveling by Air  10000 non-null  object
 9   Vehicle Monthly Distance Km    10000 non-null  int64 
 10  Waste Bag Size                 10000 non-null  object
 11  Waste Bag Weekly Count         10000 non-null  int64 
 12  How Long TV PC Daily Hour      10000 non-null  int64 
 13  Ho

In [72]:
# Cast the weekly waste-bag count to strings so the column is stored as object dtype
bag_count_col = 'Waste Bag Weekly Count'
df[bag_count_col] = df[bag_count_col].astype(str)

In [73]:
# Confirm that 'Waste Bag Weekly Count' is now reported with object dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Body Type                      10000 non-null  object
 1   Sex                            10000 non-null  object
 2   Diet                           10000 non-null  object
 3   How Often Shower               10000 non-null  object
 4   Heating Energy Source          10000 non-null  object
 5   Transport                      10000 non-null  object
 6   Social Activity                10000 non-null  object
 7   Monthly Grocery Bill           10000 non-null  int64 
 8   Frequency of Traveling by Air  10000 non-null  object
 9   Vehicle Monthly Distance Km    10000 non-null  int64 
 10  Waste Bag Size                 10000 non-null  object
 11  Waste Bag Weekly Count         10000 non-null  object
 12  How Long TV PC Daily Hour      10000 non-null  int64 
 13  Ho

In [74]:
# Exclude 'Recycling' and 'Cooking_With' (they hold list-like strings such as
# "['Metal']") from the frame used for modelling.
# Reassigning with errors="ignore" instead of inplace=True makes the cell idempotent:
# re-running it no longer raises a KeyError for columns that are already gone.
df = df.drop(columns=["Recycling", "Cooking_With"], errors="ignore")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Body Type                      10000 non-null  object
 1   Sex                            10000 non-null  object
 2   Diet                           10000 non-null  object
 3   How Often Shower               10000 non-null  object
 4   Heating Energy Source          10000 non-null  object
 5   Transport                      10000 non-null  object
 6   Social Activity                10000 non-null  object
 7   Monthly Grocery Bill           10000 non-null  int64 
 8   Frequency of Traveling by Air  10000 non-null  object
 9   Vehicle Monthly Distance Km    10000 non-null  int64 
 10  Waste Bag Size                 10000 non-null  object
 11  Waste Bag Weekly Count         10000 non-null  object
 12  How Long TV PC Daily Hour      10000 non-null  int64 
 13  Ho

In [75]:
# Collect the names of all object-dtype columns; these are treated as categorical
object_columns = df.select_dtypes(include=['object']).columns
categorical_cols = list(object_columns)
categorical_cols


['Body Type',
 'Sex',
 'Diet',
 'How Often Shower',
 'Heating Energy Source',
 'Transport',
 'Social Activity',
 'Frequency of Traveling by Air',
 'Waste Bag Size',
 'Waste Bag Weekly Count',
 'Energy efficiency']

In [76]:
# Re-display the categorical column list (same value as the previous cell's output)
categorical_cols

['Body Type',
 'Sex',
 'Diet',
 'How Often Shower',
 'Heating Energy Source',
 'Transport',
 'Social Activity',
 'Frequency of Traveling by Air',
 'Waste Bag Size',
 'Waste Bag Weekly Count',
 'Energy efficiency']

In [77]:
# Collect the names of all int64/float64 columns; at this point the list still
# contains the target column 'CarbonEmission'
numeric_dtypes = ['int64', 'float64']
numerical_cols = list(df.select_dtypes(include=numeric_dtypes).columns)
numerical_cols

['Monthly Grocery Bill',
 'Vehicle Monthly Distance Km',
 'How Long TV PC Daily Hour',
 'How Many New Clothes Monthly',
 'How Long Internet Daily Hour',
 'CarbonEmission']

In [38]:
# Exclude the target variable from the numeric feature list.
# Rebuilding the list (rather than calling list.remove) keeps the cell re-runnable:
# list.remove raises ValueError once 'CarbonEmission' is already gone.
numerical_cols = [col for col in numerical_cols if col != 'CarbonEmission']

In [39]:
# Prepare features and target.
# 'Recycling' and 'Cooking_With' are already removed from df by an earlier cell, so
# listing them here raised a KeyError on a clean Restart & Run All. errors='ignore'
# drops them only if they are still present; a missing target column still fails
# loudly on the y assignment below.
X = df.drop(columns=['CarbonEmission', 'Recycling', 'Cooking_With'], errors='ignore')
y = df['CarbonEmission']

In [40]:
# Preview the feature matrix (the target column is no longer part of it)
X.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency
0,overweight,female,pescatarian,daily,coal,public,often,230,frequently,210,large,4,7,26,1,No
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,often,114,rarely,9,extra large,3,9,38,5,No
2,overweight,male,omnivore,more frequently,wood,private,never,138,never,2472,small,1,14,47,6,Sometimes
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes
4,obese,female,vegetarian,daily,coal,private,often,266,very frequently,8457,large,1,3,5,6,Yes


In [41]:
# Display the target series; pandas abbreviates the middle rows of a long Series
y

0       2238
1       1892
2       2595
3       1074
4       4743
        ... 
9995    2408
9996    3084
9997    2377
9998    4574
9999     826
Name: CarbonEmission, Length: 10000, dtype: int64

In [None]:

# Preprocessing: standardise the numeric features and one-hot encode the categorical
# ones. This uses the `numerical_cols` / `categorical_cols` lists built in the cells
# above; the names previously referenced here (`numerical_features`,
# `categorical_features`) are never defined in this notebook and raised a NameError.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        # handle_unknown="ignore": categories not seen during fit do not raise at
        # transform time
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)



In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:

# RBF-kernel support vector regressor with fixed hyperparameters
svr = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)

# Chain preprocessing and the regressor so both are fitted together
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svm', svr),
])

# Fit the whole pipeline on the training split
model.fit(X_train, y_train)


In [None]:

# Generate predictions for the held-out test rows
y_pred = model.predict(X_test)

# Root mean squared error (square root of the MSE) and coefficient of determination
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report both metrics rounded to two decimals
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


In [None]:

import joblib

# Single definition of the output path, so the file that is written and the
# confirmation message cannot drift apart.
MODEL_PATH = "carbon_footprint_svm.pkl"

# Persist the fitted pipeline (preprocessor + SVR) to disk
joblib.dump(model, MODEL_PATH)

print(f"Model saved as {MODEL_PATH}")
