In [67]:
import pandas as pd  # Import pandas library for data manipulation and analysis
import matplotlib.pyplot as plt  # Import matplotlib library for creating static, interactive, and animated visualizations
import seaborn as sns  # Import seaborn library for statistical data visualization
import numpy as np  # Import numpy library for numerical computing and array operations


In [68]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    
 
    # Check if random_state is provided
    if random_state is not None:
        np.random.seed(random_state)
    
    # Shuffle indices
    indices = np.random.permutation(len(X))
    
    # Calculate the number of samples in the test set
    test_samples = int(len(X) * test_size)
    
    # Split indices into train and test sets
    test_indices = indices[:test_samples]
    train_indices = indices[test_samples:]
    
    # Split the data into train and test sets based on indices
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    
    return X_train, X_test, y_train, y_test


In [69]:
def mean_absolute_error(y_true, y_pred):
    mae = np.mean(np.abs(y_true - y_pred))
    return mae

def mean_squared_error(y_true, y_pred):
    mse = np.mean((y_true - y_pred) ** 2)
    return mse

def root_mean_squared_error(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return rmse

def r_squared(y_true, y_pred):
    y_bar = np.mean(y_true)
    ss_total = np.sum((y_true - y_bar) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    r_squared = 1 - (ss_residual / ss_total)
    return r_squared


In [70]:
df=pd.read_csv('Carbon_Emission.csv')

In [71]:
df.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,Energy efficiency,Recycling,Cooking_With,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,,often,230,frequently,210,large,4,7,26,1,No,['Metal'],"['Stove', 'Oven']",2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,,often,114,rarely,9,extra large,3,9,38,5,No,['Metal'],"['Stove', 'Microwave']",1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,14,47,6,Sometimes,['Metal'],"['Oven', 'Microwave']",2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,,sometimes,157,rarely,74,medium,3,20,5,7,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,3,5,6,Yes,['Paper'],['Oven'],4743


In [72]:
df.corr()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Long TV PC Daily Hour,How Many New Clothes Monthly,How Long Internet Daily Hour,CarbonEmission
Monthly Grocery Bill,1.0,0.015801,0.002343,-0.010318,0.006746,0.012798,0.081587
Vehicle Monthly Distance Km,0.015801,1.0,-0.00173,-0.003943,0.004934,-0.003497,0.594171
Waste Bag Weekly Count,0.002343,-0.00173,1.0,-0.01164,-0.003254,-0.005335,0.159193
How Long TV PC Daily Hour,-0.010318,-0.003943,-0.01164,1.0,0.009414,0.006804,0.012985
How Many New Clothes Monthly,0.006746,0.004934,-0.003254,0.009414,1.0,0.006426,0.198887
How Long Internet Daily Hour,0.012798,-0.003497,-0.005335,0.006804,0.006426,1.0,0.043878
CarbonEmission,0.081587,0.594171,0.159193,0.012985,0.198887,0.043878,1.0


In [73]:
df.shape

(10000, 20)

In [74]:
df.isna().sum()

Body Type                           0
Sex                                 0
Diet                                0
How Often Shower                    0
Heating Energy Source               0
Transport                           0
Vehicle Type                     6721
Social Activity                     0
Monthly Grocery Bill                0
Frequency of Traveling by Air       0
Vehicle Monthly Distance Km         0
Waste Bag Size                      0
Waste Bag Weekly Count              0
How Long TV PC Daily Hour           0
How Many New Clothes Monthly        0
How Long Internet Daily Hour        0
Energy efficiency                   0
Recycling                           0
Cooking_With                        0
CarbonEmission                      0
dtype: int64

In [75]:
df['Vehicle Type']=df['Vehicle Type'].fillna('no_vehicle')

In [76]:

df['total_hrs_on_gadgets']=df['How Long TV PC Daily Hour']+df['How Long Internet Daily Hour']
df=df.drop(columns=['How Long TV PC Daily Hour','How Long Internet Daily Hour'])

In [77]:
# moving CarbonEmission column at last

# Extract the column
column_to_move = df.pop('CarbonEmission')

# Re-insert the column at the last position
df['CarbonEmission'] = column_to_move

In [78]:
df.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Many New Clothes Monthly,Energy efficiency,Recycling,Cooking_With,total_hrs_on_gadgets,CarbonEmission
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,210,large,4,26,No,['Metal'],"['Stove', 'Oven']",8,2238
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,9,extra large,3,38,No,['Metal'],"['Stove', 'Microwave']",14,1892
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,47,Sometimes,['Metal'],"['Oven', 'Microwave']",20,2595
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,74,medium,3,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",27,1074
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,5,Yes,['Paper'],['Oven'],9,4743


In [79]:
df.shape

(10000, 19)

In [80]:
df.describe()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmission
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,173.8752,2031.4859,4.0246,25.109,24.0283,2269.1473
std,72.234018,2769.715597,1.990375,14.698725,10.205984,1017.675247
min,50.0,0.0,1.0,0.0,0.0,306.0
25%,111.0,69.0,2.0,13.0,17.0,1538.0
50%,173.0,823.0,4.0,25.0,24.0,2080.0
75%,237.0,2516.75,6.0,38.0,31.0,2768.0
max,299.0,9999.0,7.0,50.0,48.0,8377.0


In [81]:
conditions = [
    (df['CarbonEmission'] < 1500),
    (df['CarbonEmission'] >= 1500) & (df['CarbonEmission'] < 2500),  # Add a comma here
    (df['CarbonEmission'] >= 2500) & (df['CarbonEmission'] < 3500)
]
values = ['low', 'average', 'high']

# Apply the conditions and assign values to the new column
df['CarbonEmissionCategory'] = np.select(conditions, values, default='very_high')


In [82]:
df.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Many New Clothes Monthly,Energy efficiency,Recycling,Cooking_With,total_hrs_on_gadgets,CarbonEmission,CarbonEmissionCategory
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,210,large,4,26,No,['Metal'],"['Stove', 'Oven']",8,2238,average
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,9,extra large,3,38,No,['Metal'],"['Stove', 'Microwave']",14,1892,average
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,47,Sometimes,['Metal'],"['Oven', 'Microwave']",20,2595,high
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,74,medium,3,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",27,1074,low
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,5,Yes,['Paper'],['Oven'],9,4743,very_high


In [84]:
df

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Many New Clothes Monthly,Energy efficiency,Recycling,Cooking_With,total_hrs_on_gadgets,CarbonEmissionCategory
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,210,large,4,26,No,['Metal'],"['Stove', 'Oven']",8,average
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,9,extra large,3,38,No,['Metal'],"['Stove', 'Microwave']",14,average
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,47,Sometimes,['Metal'],"['Oven', 'Microwave']",20,high
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,74,medium,3,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",27,low
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,5,Yes,['Paper'],['Oven'],9,very_high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,obese,male,omnivore,twice a day,coal,private,hybrid,sometimes,230,never,268,medium,5,27,Yes,[],['Microwave'],21,average
9996,normal,female,vegan,twice a day,coal,private,lpg,never,234,frequently,5316,extra large,3,8,Sometimes,"['Paper', 'Plastic']","['Stove', 'Microwave']",38,high
9997,overweight,female,vegetarian,daily,electricity,walk/bicycle,no_vehicle,sometimes,298,very frequently,96,extra large,5,5,Yes,"['Paper', 'Plastic', 'Metal']","['Microwave', 'Grill', 'Airfryer']",35,average
9998,underweight,male,vegan,more frequently,coal,private,petrol,often,179,rarely,8688,medium,5,14,Sometimes,"['Paper', 'Metal']","['Stove', 'Microwave', 'Grill', 'Airfryer']",24,very_high


In [85]:
columns_to_encode = ['CarbonEmissionCategory']  # Add your column names here
mapping_dict = {}

# Manually encode categorical columns
for col in columns_to_encode:
    # Create a dictionary to map categories to numerical labels
    label_map = {val: idx for idx, val in enumerate(df[col].unique())}
    
    # Update the mapping dictionary
    mapping_dict[col] = label_map
    
    # Apply the mapping to the column
    df[col+'_'] = df[col].map(label_map)

    # Drop the original column
    df.drop(columns=[col], inplace=True)


# Display the mapping dictionary
print("\nMapping Dictionary:")
for col, mapping in mapping_dict.items():
    print(f"{col}: {mapping}")



Mapping Dictionary:
CarbonEmissionCategory: {'average': 0, 'high': 1, 'low': 2, 'very_high': 3}


# Logistic Regression

In [86]:
df


Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Many New Clothes Monthly,Energy efficiency,Recycling,Cooking_With,total_hrs_on_gadgets,CarbonEmissionCategory_
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,210,large,4,26,No,['Metal'],"['Stove', 'Oven']",8,0
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,9,extra large,3,38,No,['Metal'],"['Stove', 'Microwave']",14,0
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,47,Sometimes,['Metal'],"['Oven', 'Microwave']",20,1
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,74,medium,3,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",27,2
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,5,Yes,['Paper'],['Oven'],9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,obese,male,omnivore,twice a day,coal,private,hybrid,sometimes,230,never,268,medium,5,27,Yes,[],['Microwave'],21,0
9996,normal,female,vegan,twice a day,coal,private,lpg,never,234,frequently,5316,extra large,3,8,Sometimes,"['Paper', 'Plastic']","['Stove', 'Microwave']",38,1
9997,overweight,female,vegetarian,daily,electricity,walk/bicycle,no_vehicle,sometimes,298,very frequently,96,extra large,5,5,Yes,"['Paper', 'Plastic', 'Metal']","['Microwave', 'Grill', 'Airfryer']",35,0
9998,underweight,male,vegan,more frequently,coal,private,petrol,often,179,rarely,8688,medium,5,14,Sometimes,"['Paper', 'Metal']","['Stove', 'Microwave', 'Grill', 'Airfryer']",24,3


# Applying Linear Regression without doing feature engineering


In [87]:
df

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Many New Clothes Monthly,Energy efficiency,Recycling,Cooking_With,total_hrs_on_gadgets,CarbonEmissionCategory_
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,210,large,4,26,No,['Metal'],"['Stove', 'Oven']",8,0
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,9,extra large,3,38,No,['Metal'],"['Stove', 'Microwave']",14,0
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,47,Sometimes,['Metal'],"['Oven', 'Microwave']",20,1
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,74,medium,3,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",27,2
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,5,Yes,['Paper'],['Oven'],9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,obese,male,omnivore,twice a day,coal,private,hybrid,sometimes,230,never,268,medium,5,27,Yes,[],['Microwave'],21,0
9996,normal,female,vegan,twice a day,coal,private,lpg,never,234,frequently,5316,extra large,3,8,Sometimes,"['Paper', 'Plastic']","['Stove', 'Microwave']",38,1
9997,overweight,female,vegetarian,daily,electricity,walk/bicycle,no_vehicle,sometimes,298,very frequently,96,extra large,5,5,Yes,"['Paper', 'Plastic', 'Metal']","['Microwave', 'Grill', 'Airfryer']",35,0
9998,underweight,male,vegan,more frequently,coal,private,petrol,often,179,rarely,8688,medium,5,14,Sometimes,"['Paper', 'Metal']","['Stove', 'Microwave', 'Grill', 'Airfryer']",24,3


In [88]:
numeric_columns = df.select_dtypes(exclude=['object'])

numeric_columns

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmissionCategory_
0,230,210,4,26,8,0
1,114,9,3,38,14,0
2,138,2472,1,47,20,1
3,157,74,3,5,27,2
4,266,8457,1,5,9,3
...,...,...,...,...,...,...
9995,230,268,5,27,21,0
9996,234,5316,3,8,38,1
9997,298,96,5,5,35,0
9998,179,8688,5,14,24,3


In [89]:
X=numeric_columns.iloc[:,0:5].values

In [90]:
X = (X - X.mean(axis=0)) / X.std(axis=0)

In [91]:
Y=numeric_columns.iloc[:,-1].values


In [92]:
Y

array([0, 0, 1, ..., 0, 3, 2])

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, 0.2,10)

In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Initialize the logistic regression model
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# Fit the model to the training data
log_reg.fit(X_train, y_train)

# Predict on the testing data
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

Accuracy: 0.503
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.83      0.61       830
           1       0.27      0.03      0.05       464
           2       0.63      0.35      0.45       499
           3       0.53      0.63      0.57       207

    accuracy                           0.50      2000
   macro avg       0.48      0.46      0.42      2000
weighted avg       0.47      0.50      0.44      2000



# Linear Regression after feature engineering (converting all categorical column to numerical column)

In [95]:
df2=df

In [96]:
df2.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,Vehicle Monthly Distance Km,Waste Bag Size,Waste Bag Weekly Count,How Many New Clothes Monthly,Energy efficiency,Recycling,Cooking_With,total_hrs_on_gadgets,CarbonEmissionCategory_
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,210,large,4,26,No,['Metal'],"['Stove', 'Oven']",8,0
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,9,extra large,3,38,No,['Metal'],"['Stove', 'Microwave']",14,0
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,2472,small,1,47,Sometimes,['Metal'],"['Oven', 'Microwave']",20,1
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,74,medium,3,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']","['Microwave', 'Grill', 'Airfryer']",27,2
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,8457,large,1,5,Yes,['Paper'],['Oven'],9,3


In [97]:
df2.isna().sum()

Body Type                        0
Sex                              0
Diet                             0
How Often Shower                 0
Heating Energy Source            0
Transport                        0
Vehicle Type                     0
Social Activity                  0
Monthly Grocery Bill             0
Frequency of Traveling by Air    0
Vehicle Monthly Distance Km      0
Waste Bag Size                   0
Waste Bag Weekly Count           0
How Many New Clothes Monthly     0
Energy efficiency                0
Recycling                        0
Cooking_With                     0
total_hrs_on_gadgets             0
CarbonEmissionCategory_          0
dtype: int64

In [98]:
Recycling_ = set()
for row in df2['Recycling']:
    names = row.strip('[]').split(',')
    Recycling_.update([name.strip() for name in names])

print("Unique names:", Recycling_)

Unique names: {'', "'Plastic'", "'Paper'", "'Metal'", "'Glass'"}


In [99]:
Cooking_With_ = set()
for row in df2['Cooking_With']:
    names = row.strip('[]').split(',')
    Cooking_With_.update([name.strip() for name in names])

print("Unique names:", Cooking_With_)

Unique names: {'', "'Oven'", "'Microwave'", "'Airfryer'", "'Grill'", "'Stove'"}


In [100]:
unique_names = ['Microwave','Airfryer','Oven','Stove','Grill']

for name in unique_names:
    df2[name] = df2['Cooking_With'].str.contains(name).astype(int)

df2.drop('Cooking_With', axis=1, inplace=True)

In [101]:
df2.head()

Unnamed: 0,Body Type,Sex,Diet,How Often Shower,Heating Energy Source,Transport,Vehicle Type,Social Activity,Monthly Grocery Bill,Frequency of Traveling by Air,...,How Many New Clothes Monthly,Energy efficiency,Recycling,total_hrs_on_gadgets,CarbonEmissionCategory_,Microwave,Airfryer,Oven,Stove,Grill
0,overweight,female,pescatarian,daily,coal,public,no_vehicle,often,230,frequently,...,26,No,['Metal'],8,0,0,0,1,1,0
1,obese,female,vegetarian,less frequently,natural gas,walk/bicycle,no_vehicle,often,114,rarely,...,38,No,['Metal'],14,0,1,0,0,1,0
2,overweight,male,omnivore,more frequently,wood,private,petrol,never,138,never,...,47,Sometimes,['Metal'],20,1,1,0,1,0,0
3,overweight,male,omnivore,twice a day,wood,walk/bicycle,no_vehicle,sometimes,157,rarely,...,5,Sometimes,"['Paper', 'Plastic', 'Glass', 'Metal']",27,2,1,1,0,0,1
4,obese,female,vegetarian,daily,coal,private,diesel,often,266,very frequently,...,5,Yes,['Paper'],9,3,0,0,1,0,0


In [102]:
unique_names = ['Plastic','Paper','Metal','Glass']

for name in unique_names:
    df2[name] = df2['Recycling'].str.contains(name).astype(int)

df2.drop('Recycling', axis=1, inplace=True)

In [103]:
df2.columns

Index(['Body Type', 'Sex', 'Diet', 'How Often Shower', 'Heating Energy Source',
       'Transport', 'Vehicle Type', 'Social Activity', 'Monthly Grocery Bill',
       'Frequency of Traveling by Air', 'Vehicle Monthly Distance Km',
       'Waste Bag Size', 'Waste Bag Weekly Count',
       'How Many New Clothes Monthly', 'Energy efficiency',
       'total_hrs_on_gadgets', 'CarbonEmissionCategory_', 'Microwave',
       'Airfryer', 'Oven', 'Stove', 'Grill', 'Plastic', 'Paper', 'Metal',
       'Glass'],
      dtype='object')

In [104]:
columns_to_encode = ['Body Type', 'Sex', 'Diet', 'How Often Shower', 'Heating Energy Source',
       'Transport', 'Vehicle Type', 'Social Activity',
       'Frequency of Traveling by Air',
       'Waste Bag Size',
       'Energy efficiency']  # Add your column names here
mapping_dict = {}

# Manually encode categorical columns
for col in columns_to_encode:
    # Create a dictionary to map categories to numerical labels
    label_map = {val: idx for idx, val in enumerate(df2[col].unique())}
    
    # Update the mapping dictionary
    mapping_dict[col] = label_map
    
    # Apply the mapping to the column
    df2[col + '_encoded'] = df2[col].map(label_map)

    # Drop the original column
    df2.drop(columns=[col], inplace=True)


# Display the mapping dictionary
print("\nMapping Dictionary:")
for col, mapping in mapping_dict.items():
    print(f"{col}: {mapping}")



Mapping Dictionary:
Body Type: {'overweight': 0, 'obese': 1, 'underweight': 2, 'normal': 3}
Sex: {'female': 0, 'male': 1}
Diet: {'pescatarian': 0, 'vegetarian': 1, 'omnivore': 2, 'vegan': 3}
How Often Shower: {'daily': 0, 'less frequently': 1, 'more frequently': 2, 'twice a day': 3}
Heating Energy Source: {'coal': 0, 'natural gas': 1, 'wood': 2, 'electricity': 3}
Transport: {'public': 0, 'walk/bicycle': 1, 'private': 2}
Vehicle Type: {'no_vehicle': 0, 'petrol': 1, 'diesel': 2, 'hybrid': 3, 'lpg': 4, 'electric': 5}
Social Activity: {'often': 0, 'never': 1, 'sometimes': 2}
Frequency of Traveling by Air: {'frequently': 0, 'rarely': 1, 'never': 2, 'very frequently': 3}
Waste Bag Size: {'large': 0, 'extra large': 1, 'small': 2, 'medium': 3}
Energy efficiency: {'No': 0, 'Sometimes': 1, 'Yes': 2}


In [112]:
df2.head()

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmissionCategory_,Microwave,Airfryer,Oven,Stove,...,Sex_encoded,Diet_encoded,How Often Shower_encoded,Heating Energy Source_encoded,Transport_encoded,Vehicle Type_encoded,Social Activity_encoded,Frequency of Traveling by Air_encoded,Waste Bag Size_encoded,Energy efficiency_encoded
0,230,210,4,26,8,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,114,9,3,38,14,0,1,0,0,1,...,0,1,1,1,1,0,0,1,1,0
2,138,2472,1,47,20,1,1,0,1,0,...,1,2,2,2,2,1,1,2,2,1
3,157,74,3,5,27,2,1,1,0,0,...,1,2,3,2,1,0,2,1,3,1
4,266,8457,1,5,9,3,0,0,1,0,...,0,1,0,0,2,2,0,3,0,2


In [113]:
# moving CarbonEmission column at last

# Extract the column
column_to_move = df2.pop('CarbonEmissionCategory_')

# Re-insert the column at the last position
df2['CarbonEmissionCategory_'] = column_to_move

In [114]:
X = df2.iloc[:, 0:25].values
X

array([[ 230,  210,    4, ...,    0,    0,    0],
       [ 114,    9,    3, ...,    1,    1,    0],
       [ 138, 2472,    1, ...,    2,    2,    1],
       ...,
       [ 298,   96,    5, ...,    3,    1,    2],
       [ 179, 8688,    5, ...,    1,    3,    1],
       [ 115, 9952,    4, ...,    2,    2,    1]])

In [115]:
X = (X - X.mean(axis=0)) / X.std(axis=0)

In [116]:
Y=df2.iloc[:,-1].values

In [117]:
Y

array([0, 0, 1, ..., 0, 3, 2])

In [118]:

X_train, X_test, y_train, y_test = train_test_split(X,Y,0.2,10)

In [119]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Initialize the logistic regression model
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# Fit the model to the training data
log_reg.fit(X_train, y_train)

# Predict on the testing data
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

Accuracy: 0.591
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.77      0.65       830
           1       0.56      0.42      0.48       464
           2       0.67      0.42      0.52       499
           3       0.71      0.67      0.69       207

    accuracy                           0.59      2000
   macro avg       0.63      0.57      0.58      2000
weighted avg       0.60      0.59      0.58      2000



In [61]:
df2

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmissionCategory_,Microwave,Airfryer,Oven,Stove,...,Sex_encoded,Diet_encoded,How Often Shower_encoded,Heating Energy Source_encoded,Transport_encoded,Vehicle Type_encoded,Social Activity_encoded,Frequency of Traveling by Air_encoded,Waste Bag Size_encoded,Energy efficiency_encoded
0,230,210,4,26,8,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,114,9,3,38,14,0,1,0,0,1,...,0,1,1,1,1,0,0,1,1,0
2,138,2472,1,47,20,1,1,0,1,0,...,1,2,2,2,2,1,1,2,2,1
3,157,74,3,5,27,2,1,1,0,0,...,1,2,3,2,1,0,2,1,3,1
4,266,8457,1,5,9,3,0,0,1,0,...,0,1,0,0,2,2,0,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,230,268,5,27,21,0,1,0,0,0,...,1,2,3,0,2,3,2,2,3,2
9996,234,5316,3,8,38,1,1,0,0,1,...,0,3,3,0,2,4,1,0,1,1
9997,298,96,5,5,35,0,1,1,0,0,...,0,1,0,3,1,0,2,3,1,2
9998,179,8688,5,14,24,3,1,1,0,1,...,1,3,2,0,2,1,0,1,3,1


In [63]:
df2.shape

(10000, 26)

In [64]:
df.shape

(10000, 26)

In [65]:
df

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmissionCategory_,Microwave,Airfryer,Oven,Stove,...,Sex_encoded,Diet_encoded,How Often Shower_encoded,Heating Energy Source_encoded,Transport_encoded,Vehicle Type_encoded,Social Activity_encoded,Frequency of Traveling by Air_encoded,Waste Bag Size_encoded,Energy efficiency_encoded
0,230,210,4,26,8,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,114,9,3,38,14,0,1,0,0,1,...,0,1,1,1,1,0,0,1,1,0
2,138,2472,1,47,20,1,1,0,1,0,...,1,2,2,2,2,1,1,2,2,1
3,157,74,3,5,27,2,1,1,0,0,...,1,2,3,2,1,0,2,1,3,1
4,266,8457,1,5,9,3,0,0,1,0,...,0,1,0,0,2,2,0,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,230,268,5,27,21,0,1,0,0,0,...,1,2,3,0,2,3,2,2,3,2
9996,234,5316,3,8,38,1,1,0,0,1,...,0,3,3,0,2,4,1,0,1,1
9997,298,96,5,5,35,0,1,1,0,0,...,0,1,0,3,1,0,2,3,1,2
9998,179,8688,5,14,24,3,1,1,0,1,...,1,3,2,0,2,1,0,1,3,1


In [66]:
df2

Unnamed: 0,Monthly Grocery Bill,Vehicle Monthly Distance Km,Waste Bag Weekly Count,How Many New Clothes Monthly,total_hrs_on_gadgets,CarbonEmissionCategory_,Microwave,Airfryer,Oven,Stove,...,Sex_encoded,Diet_encoded,How Often Shower_encoded,Heating Energy Source_encoded,Transport_encoded,Vehicle Type_encoded,Social Activity_encoded,Frequency of Traveling by Air_encoded,Waste Bag Size_encoded,Energy efficiency_encoded
0,230,210,4,26,8,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,114,9,3,38,14,0,1,0,0,1,...,0,1,1,1,1,0,0,1,1,0
2,138,2472,1,47,20,1,1,0,1,0,...,1,2,2,2,2,1,1,2,2,1
3,157,74,3,5,27,2,1,1,0,0,...,1,2,3,2,1,0,2,1,3,1
4,266,8457,1,5,9,3,0,0,1,0,...,0,1,0,0,2,2,0,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,230,268,5,27,21,0,1,0,0,0,...,1,2,3,0,2,3,2,2,3,2
9996,234,5316,3,8,38,1,1,0,0,1,...,0,3,3,0,2,4,1,0,1,1
9997,298,96,5,5,35,0,1,1,0,0,...,0,1,0,3,1,0,2,3,1,2
9998,179,8688,5,14,24,3,1,1,0,1,...,1,3,2,0,2,1,0,1,3,1
