In [326]:
import pandas as pd  # Import pandas library for data manipulation and analysis
import matplotlib.pyplot as plt  # Import matplotlib library for creating static, interactive, and animated visualizations
import seaborn as sns  # Import seaborn library for statistical data visualization
import numpy as np  # Import numpy library for numerical computing and array operations


In [327]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    
 
    # Check if random_state is provided
    if random_state is not None:
        np.random.seed(random_state)
    
    # Shuffle indices
    indices = np.random.permutation(len(X))
    
    # Calculate the number of samples in the test set
    test_samples = int(len(X) * test_size)
    
    # Split indices into train and test sets
    test_indices = indices[:test_samples]
    train_indices = indices[test_samples:]
    
    # Split the data into train and test sets based on indices
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    
    return X_train, X_test, y_train, y_test


In [328]:
def mean_absolute_error(y_true, y_pred):
    mae = np.mean(np.abs(y_true - y_pred))
    return mae

def mean_squared_error(y_true, y_pred):
    mse = np.mean((y_true - y_pred) ** 2)
    return mse

def root_mean_squared_error(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return rmse

def r_squared(y_true, y_pred):
    y_bar = np.mean(y_true)
    ss_total = np.sum((y_true - y_bar) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    r2_squared = 1 - (ss_residual / ss_total)
    return r2_squared


In [329]:
df=pd.read_csv('Carbon Emission.csv')

In [330]:
df.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743


In [331]:
df.corr()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,CarbonEmission
Monthly Grocery Bill,1.0,0.015801,0.002343,-0.010318,0.006746,0.012798,0.081587
Vehicle Monthly Distance Km,0.015801,1.0,-0.00173,-0.003943,0.004934,-0.003497,0.594171
Waste Bag Weekly Count,0.002343,-0.00173,1.0,-0.01164,-0.003254,-0.005335,0.159193
How Long TV PC Daily Hour,-0.010318,-0.003943,-0.01164,1.0,0.009414,0.006804,0.012985
How Many New Clothes Monthly,0.006746,0.004934,-0.003254,0.009414,1.0,0.006426,0.198887
How Long Internet Daily Hour,0.012798,-0.003497,-0.005335,0.006804,0.006426,1.0,0.043878
CarbonEmission,0.081587,0.594171,0.159193,0.012985,0.198887,0.043878,1.0


In [332]:
df.shape

(10000, 20)

In [333]:
df.isna().sum()

Body Type                           0
Sex                                 0
Diet                                0
How Often Shower                    0
Heating Energy Source               0
Transport                           0
Vehicle Type                     6721
Social Activity                     0
Monthly Grocery Bill                0
Frequency of Traveling by Air       0
Vehicle Monthly Distance Km         0
Waste Bag Size                      0
Waste Bag Weekly Count              0
How Long TV PC Daily Hour           0
How Many New Clothes Monthly        0
How Long Internet Daily Hour        0
Energy efficiency                   0
Recycling                           0
Cooking_With                        0
CarbonEmission                      0
dtype: int64

In [334]:
df['Vehicle Type']=df['Vehicle Type'].fillna('no_vehicle')

In [335]:

df['total_hrs_on_gadgets']=df['How Long TV PC Daily Hour']+df['How Long Internet Daily Hour']
df=df.drop(columns=['How Long TV PC Daily Hour','How Long Internet Daily Hour'])

In [336]:
# moving CarbonEmission column at last

# Extract the column
column_to_move = df.pop('CarbonEmission')

# Re-insert the column at the last position
df['CarbonEmission'] = column_to_move

In [337]:
df.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Many New Clothes Monthly,Energy efficiency,Recycling,Cooking_With,total_hrs_on_gadgets,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,210,large,4,26,No,['Metal'],"['Stove', 'Oven']",8,2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,9,extra large,3,38,No,['Metal'],"['Stove', 'Microwave']",14,1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,47,Sometimes,['Metal'],"['Oven', 'Microwave']",20,2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,74,medium,3,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",27,1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,5,Yes,['Paper'],['Oven'],9,4743


In [338]:
df.shape

(10000, 19)

In [339]:
df.corr()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmission
Monthly Grocery Bill,1.0,0.015801,0.002343,0.006746,0.001941,0.081587
Vehicle Monthly Distance Km,0.015801,1.0,-0.00173,0.004934,-0.005239,0.594171
Waste Bag Weekly Count,0.002343,-0.00173,1.0,-0.003254,-0.011909,0.159193
How Many New Clothes Monthly,0.006746,0.004934,-0.003254,1.0,0.011137,0.198887
total_hrs_on_gadgets,0.001941,-0.005239,-0.011909,0.011137,1.0,0.040328
CarbonEmission,0.081587,0.594171,0.159193,0.198887,0.040328,1.0


# Decision Tree

In [340]:
class DecisionTreeRegressor:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        X = (X - X.mean(axis=0)) / X.std(axis=0)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if depth == self.max_depth or len(X) < self.min_samples_split or len(np.unique(y)) == 1:
            return np.mean(y)

        num_features = X.shape[1]
        best_feature, best_threshold = None, None
        best_variance = float('inf')

        for feature_index in range(num_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                left_indices = X[:, feature_index] <= threshold
                right_indices = X[:, feature_index] > threshold

                left_variance = np.var(y[left_indices])
                right_variance = np.var(y[right_indices])

                if np.isnan(left_variance) or np.isnan(right_variance):
                    continue  # Skip this threshold if variance is NaN

                weighted_variance = (len(y[left_indices]) * left_variance + len(y[right_indices]) * right_variance) / len(y)

                if weighted_variance < best_variance:
                    best_feature = feature_index
                    best_threshold = threshold
                    best_variance = weighted_variance

        if best_variance == float('inf') or best_feature is None or best_threshold is None:
            return np.mean(y)

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return {'feature_index': best_feature,
                'threshold': best_threshold,
                'left_subtree': left_subtree,
                'right_subtree': right_subtree}

    def predict(self, X):
        X = (X - X.mean(axis=0)) / X.std(axis=0)
        return np.array([self._predict_single(x) for x in X])

    def _predict_single(self, x, node=None):
        if node is None:
            node = self.tree

        if isinstance(node, float):
            return node  # Leaf node, return the mean value

        feature_index = node['feature_index']
        threshold = node['threshold']

        if x[feature_index] <= threshold:
            return self._predict_single(x, node['left_subtree'])
        else:
            return self._predict_single(x, node['right_subtree'])


# Applying Decision Tree without doing feature engineering


In [341]:
numeric_columns = df.select_dtypes(exclude=['object'])

numeric_columns

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmission
0,230,210,4,26,8,2238
1,114,9,3,38,14,1892
2,138,2472,1,47,20,2595
3,157,74,3,5,27,1074
4,266,8457,1,5,9,4743
...,...,...,...,...,...,...
9995,230,268,5,27,21,2408
9996,234,5316,3,8,38,3084
9997,298,96,5,5,35,2377
9998,179,8688,5,14,24,4574


In [342]:
X=numeric_columns.iloc[:,0:5].values

In [343]:
X

array([[ 230,  210,    4,   26,    8],
       [ 114,    9,    3,   38,   14],
       [ 138, 2472,    1,   47,   20],
       ...,
       [ 298,   96,    5,    5,   35],
       [ 179, 8688,    5,   14,   24],
       [ 115, 9952,    4,    6,   11]], dtype=int64)

In [344]:
Y=numeric_columns.iloc[:,-1].values


In [345]:
Y

array([2238, 1892, 2595, ..., 2377, 4574,  826], dtype=int64)

In [346]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, 0.2,10)

In [348]:


decision_tree = DecisionTreeRegressor( max_depth=10, min_samples_split=15)


# Fit the models
decision_tree.fit(X_train, y_train)

# Make predictions
y_pred = decision_tree.predict(X_test)

In [349]:
# Assuming y_true contains the true target values and y_pred contains the predicted values
# Compute Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Compute Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Compute Root Mean Squared Error (RMSE)
rmse = root_mean_squared_error(y_test, y_pred)

# Compute R-squared (R²)
r2_squared = r_squared(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared (R²):", r2_squared)

Mean Absolute Error: 679.6970541955169
Mean Squared Error: 782539.709019332
Root Mean Squared Error: 884.612745227725
R-squared (R²): 0.23038842270405813


# Linear Regression after feature engineering (converting all categorical column to numerical column)

In [350]:
df2=df

In [351]:
df2.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Many New Clothes Monthly,Energy efficiency,Recycling,Cooking_With,total_hrs_on_gadgets,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,210,large,4,26,No,['Metal'],"['Stove', 'Oven']",8,2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,9,extra large,3,38,No,['Metal'],"['Stove', 'Microwave']",14,1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,47,Sometimes,['Metal'],"['Oven', 'Microwave']",20,2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,74,medium,3,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",27,1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,5,Yes,['Paper'],['Oven'],9,4743


In [352]:
df2.isna().sum()

Body Type                        0
Sex                              0
Diet                             0
How Often Shower                 0
Heating Energy Source            0
Transport                        0
Vehicle Type                     0
Social Activity                  0
Monthly Grocery Bill             0
Frequency of Traveling by Air    0
Vehicle Monthly Distance Km      0
Waste Bag Size                   0
Waste Bag Weekly Count           0
How Many New Clothes Monthly     0
Energy efficiency                0
Recycling                        0
Cooking_With                     0
total_hrs_on_gadgets             0
CarbonEmission                   0
dtype: int64

In [353]:
Recycling_ = set()
for row in df2['Recycling']:
    names = row.strip('[]').split(',')
    Recycling_.update([name.strip() for name in names])

print("Unique names:", Recycling_)

Unique names: {"'Plastic'", '', "'Glass'", "'Metal'", "'Paper'"}


In [354]:
Cooking_With_ = set()
for row in df2['Cooking_With']:
    names = row.strip('[]').split(',')
    Cooking_With_.update([name.strip() for name in names])

print("Unique names:", Cooking_With_)

Unique names: {'', "'Stove'", "'Oven'", "'Microwave'", "'Grill'", "'Airfryer'"}


In [355]:
unique_names = ['Microwave','Airfryer','Oven','Stove','Grill']

for name in unique_names:
    df2[name] = df2['Cooking_With'].str.contains(name).astype(int)

df2.drop('Cooking_With', axis=1, inplace=True)

In [356]:
df2.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,...,How Many New Clothes Monthly,Energy efficiency,Recycling,total_hrs_on_gadgets,CarbonEmission,Microwave,Airfryer,Oven,Stove,Grill
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,...,26,No,['Metal'],8,2238,0,0,1,1,0
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,...,38,No,['Metal'],14,1892,1,0,0,1,0
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,...,47,Sometimes,['Metal'],20,2595,1,0,1,0,0
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,...,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']",27,1074,1,1,0,0,1
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,...,5,Yes,['Paper'],9,4743,0,0,1,0,0


In [357]:
unique_names = ['Plastic','Paper','Metal','Glass']

for name in unique_names:
    df2[name] = df2['Recycling'].str.contains(name).astype(int)

df2.drop('Recycling', axis=1, inplace=True)

In [358]:
df.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,...,CarbonEmission,Microwave,Airfryer,Oven,Stove,Grill,Plastic,Paper,Metal,Glass
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,...,2238,0,0,1,1,0,0,0,1,0
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,...,1892,1,0,0,1,0,0,0,1,0
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,...,2595,1,0,1,0,0,0,0,1,0
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,...,1074,1,1,0,0,1,1,1,1,1
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,...,4743,0,0,1,0,0,0,1,0,0


In [359]:
columns_to_encode = ['Body Type', 'Sex', 'Diet', 'How Often Shower', 'Heating Energy Source',
       'Transport', 'Vehicle Type', 'Social Activity',
       'Frequency of Traveling by Air',
       'Waste Bag Size',
       'Energy efficiency']  # Add your column names here
mapping_dict = {}

# Manually encode categorical columns
for col in columns_to_encode:
    # Create a dictionary to map categories to numerical labels
    label_map = {val: idx for idx, val in enumerate(df2[col].unique())}
    
    # Update the mapping dictionary
    mapping_dict[col] = label_map
    
    # Apply the mapping to the column
    df2[col + '_encoded'] = df2[col].map(label_map)

    # Drop the original column
    df2.drop(columns=[col], inplace=True)


# Display the mapping dictionary
print("\nMapping Dictionary:")
for col, mapping in mapping_dict.items():
    print(f"{col}: {mapping}")



Mapping Dictionary:
Body Type: {'overweight': 0, 'obese': 1, 'underweight': 2, 'normal': 3}
Sex: {'female': 0, 'male': 1}
Diet: {'pescatarian': 0, 'vegetarian': 1, 'omnivore': 2, 'vegan': 3}
How Often Shower: {'daily': 0, 'less frequently': 1, 'more frequently': 2, 'twice a day': 3}
Heating Energy Source: {'coal': 0, 'natural gas': 1, 'wood': 2, 'electricity': 3}
Transport: {'public': 0, 'walk/bicycle': 1, 'private': 2}
Vehicle Type: {'no_vehicle': 0, 'petrol': 1, 'diesel': 2, 'hybrid': 3, 'lpg': 4, 'electric': 5}
Social Activity: {'often': 0, 'never': 1, 'sometimes': 2}
Frequency of Traveling by Air: {'frequently': 0, 'rarely': 1, 'never': 2, 'very frequently': 3}
Waste Bag Size: {'large': 0, 'extra large': 1, 'small': 2, 'medium': 3}
Energy efficiency: {'No': 0, 'Sometimes': 1, 'Yes': 2}


In [360]:
df2.head()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmission,Microwave,Airfryer,Oven,Stove,...,Sex_encoded,Diet_encoded,How Often Shower_encoded,Heating Energy Source_encoded,Transport_encoded,Vehicle Type_encoded,Social Activity_encoded,Frequency of Traveling by Air_encoded,Waste Bag Size_encoded,Energy efficiency_encoded
0,230,210,4,26,8,2238,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,114,9,3,38,14,1892,1,0,0,1,...,0,1,1,1,1,0,0,1,1,0
2,138,2472,1,47,20,2595,1,0,1,0,...,1,2,2,2,2,1,1,2,2,1
3,157,74,3,5,27,1074,1,1,0,0,...,1,2,3,2,1,0,2,1,3,1
4,266,8457,1,5,9,4743,0,0,1,0,...,0,1,0,0,2,2,0,3,0,2


In [361]:
# moving CarbonEmission column at last

# Extract the column
column_to_move = df2.pop('CarbonEmission')

# Re-insert the column at the last position
df2['CarbonEmission'] = column_to_move

In [362]:
df2.head()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,Microwave,Airfryer,Oven,Stove,Grill,...,Diet_encoded,How Often Shower_encoded,Heating Energy Source_encoded,Transport_encoded,Vehicle Type_encoded,Social Activity_encoded,Frequency of Traveling by Air_encoded,Waste Bag Size_encoded,Energy efficiency_encoded,CarbonEmission
0,230,210,4,26,8,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,2238
1,114,9,3,38,14,1,0,0,1,0,...,1,1,1,1,0,0,1,1,0,1892
2,138,2472,1,47,20,1,0,1,0,0,...,2,2,2,2,1,1,2,2,1,2595
3,157,74,3,5,27,1,1,0,0,1,...,2,3,2,1,0,2,1,3,1,1074
4,266,8457,1,5,9,0,0,1,0,0,...,1,0,0,2,2,0,3,0,2,4743


In [363]:
from sklearn.linear_model import LinearRegression




In [364]:
df2.shape

(10000, 26)

In [365]:
X = df2.iloc[:, 0:24].values

In [366]:
X

array([[ 230,  210,    4, ...,    0,    0,    0],
       [ 114,    9,    3, ...,    0,    1,    1],
       [ 138, 2472,    1, ...,    1,    2,    2],
       ...,
       [ 298,   96,    5, ...,    2,    3,    1],
       [ 179, 8688,    5, ...,    0,    1,    3],
       [ 115, 9952,    4, ...,    2,    2,    2]], dtype=int64)

In [367]:
Y=df2.iloc[:,-1].values

In [368]:
Y

array([2238, 1892, 2595, ..., 2377, 4574,  826], dtype=int64)

In [369]:

X_train, X_test, y_train, y_test = train_test_split(X,Y,0.2,10)

In [372]:

decision_tree = DecisionTreeRegressor( max_depth=10, min_samples_split=15)


# Fit the models
decision_tree.fit(X_train, y_train)

# Make predictions
y_pred = decision_tree.predict(X_test)

In [373]:
# Assuming y_true contains the true target values and y_pred contains the predicted values
# Compute Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Compute Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Compute Root Mean Squared Error (RMSE)
rmse = root_mean_squared_error(y_test, y_pred)

# Compute R-squared (R²)
r2_squared = r_squared(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared (R²):", r2_squared)

Mean Absolute Error: 697.7153691438059
Mean Squared Error: 840922.4595437503
Root Mean Squared Error: 917.0182438445541
R-squared (R²): 0.1729701981716809
