#Imports of data and important libraries

In [1]:
import pandas as pd
import numpy as np
import math
import networkx as nx

In [2]:
data = pd.read_csv("DATA pswd.csv")

In [3]:
data

Unnamed: 0,Date,Start,End,Travel time,Type
0,1/2/2019 5:54,Americium,Actinium,291,Car
1,1/8/2019 14:40,Americium,Actinium,245,Car
2,2/1/2019 11:01,Americium,Actinium,320,Car
3,2/1/2019 15:59,Americium,Actinium,571,Car
4,2/8/2019 11:38,Americium,Actinium,272,Car
...,...,...,...,...,...
504578,31/08/2023 14:20,Yttrium,Zirconium,285,Pedestrian
504579,31/08/2023 14:32,Yttrium,Zirconium,259,Pedestrian
504580,31/10/2022 11:29,Yttrium,Zirconium,303,Pedestrian
504581,31/12/2021 7:18,Yttrium,Zirconium,264,Pedestrian


#Data manipulatiopn

In [4]:
# Define bus stops
bus_stops = [
    "Beryllium", "Neon", "Sulfur", "Titanium", "Nickel", "Selenium",
    "Zirconium", "Palladium", "Tellurium", "Cerium", "Gadolinium",
    "Ytterbium", "Osmium", "Lead", "Radium", "Plutonium"
]


# Function to determine pollution and if on a bus
def calculate_pollution(row):
    pollution = 0
    on_bus = 0
    if row['Type'] == 'Car':
        pollution = (row['Travel time'] / 100) * 5
    elif row['Type'] == 'Pedestrian':
        if row['Start'] in bus_stops and row['End'] in bus_stops and row['Travel time']==240:
            on_bus = 1
            pollution = (row['Travel time'] / 100) * 1
    return pd.Series([pollution, on_bus])

# Apply the function to the dataset
data[['Pollution', 'OnBus']] = data.apply(calculate_pollution, axis=1)

In [5]:
#Howe many values we have for buses
data['OnBus'].value_counts()

OnBus
0.0    504533
1.0        50
Name: count, dtype: int64

In [6]:
#How many unique values we have for pollution (checking for integrity)
data['Pollution'].nunique()

726

In [7]:
# Convert 'Date' to datetime and extract features so that we can use them in our model's training
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y %H:%M')
data['Hour'] = data['Date'].dt.hour
data['Minute'] = data['Date'].dt.minute
data['DayOfWeek'] = data['Date'].dt.dayofweek

In [8]:
data

Unnamed: 0,Date,Start,End,Travel time,Type,Pollution,OnBus,Hour,Minute,DayOfWeek
0,2019-02-01 05:54:00,Americium,Actinium,291,Car,14.55,0.0,5,54,4
1,2019-08-01 14:40:00,Americium,Actinium,245,Car,12.25,0.0,14,40,3
2,2019-01-02 11:01:00,Americium,Actinium,320,Car,16.00,0.0,11,1,2
3,2019-01-02 15:59:00,Americium,Actinium,571,Car,28.55,0.0,15,59,2
4,2019-08-02 11:38:00,Americium,Actinium,272,Car,13.60,0.0,11,38,4
...,...,...,...,...,...,...,...,...,...,...
504578,2023-08-31 14:20:00,Yttrium,Zirconium,285,Pedestrian,0.00,0.0,14,20,3
504579,2023-08-31 14:32:00,Yttrium,Zirconium,259,Pedestrian,0.00,0.0,14,32,3
504580,2022-10-31 11:29:00,Yttrium,Zirconium,303,Pedestrian,0.00,0.0,11,29,0
504581,2021-12-31 07:18:00,Yttrium,Zirconium,264,Pedestrian,0.00,0.0,7,18,4


#Training

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode categorical data
label_encoder = LabelEncoder()
data['Start_encoded'] = label_encoder.fit_transform(data['Start'])
data['End_encoded'] = label_encoder.fit_transform(data['End'])
data['Type_encoded'] = label_encoder.fit_transform(data['Type'])

# Select relevant columns
features = data[['Start_encoded', 'End_encoded', 'Type_encoded','Hour','Minute','DayOfWeek','OnBus']]
target = data['Pollution']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [10]:
data

Unnamed: 0,Date,Start,End,Travel time,Type,Pollution,OnBus,Hour,Minute,DayOfWeek,Start_encoded,End_encoded,Type_encoded
0,2019-02-01 05:54:00,Americium,Actinium,291,Car,14.55,0.0,5,54,4,2,0,0
1,2019-08-01 14:40:00,Americium,Actinium,245,Car,12.25,0.0,14,40,3,2,0,0
2,2019-01-02 11:01:00,Americium,Actinium,320,Car,16.00,0.0,11,1,2,2,0,0
3,2019-01-02 15:59:00,Americium,Actinium,571,Car,28.55,0.0,15,59,2,2,0,0
4,2019-08-02 11:38:00,Americium,Actinium,272,Car,13.60,0.0,11,38,4,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
504578,2023-08-31 14:20:00,Yttrium,Zirconium,285,Pedestrian,0.00,0.0,14,20,3,93,95,1
504579,2023-08-31 14:32:00,Yttrium,Zirconium,259,Pedestrian,0.00,0.0,14,32,3,93,95,1
504580,2022-10-31 11:29:00,Yttrium,Zirconium,303,Pedestrian,0.00,0.0,11,29,0,93,95,1
504581,2021-12-31 07:18:00,Yttrium,Zirconium,264,Pedestrian,0.00,0.0,7,18,4,93,95,1


#The model

In [11]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import math

# Initialize the model
model = xgb.XGBRegressor(n_estimators=1000, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the training set
y_train_pred = model.predict(X_train)

# Predict on the test set
y_test_pred = model.predict(X_test)

# Evaluate the model on the training set
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = math.sqrt(mse_train)
print(f"Train Mean Squared Error: {mse_train}")
print(f"Train Root Mean Squared Error: {rmse_train}")

# Evaluate the model on the test set
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = math.sqrt(mse_test)
print(f"Test Mean Squared Error: {mse_test}")
print(f"Test Root Mean Squared Error: {rmse_test}")

Train Mean Squared Error: 0.736827077373534
Train Root Mean Squared Error: 0.8583863217535179
Test Mean Squared Error: 0.9327001581711287
Test Root Mean Squared Error: 0.9657640282031262


In [12]:
G = nx.DiGraph()
for index, row in data.iterrows():
    G.add_edge(row['Start_encoded'], row['End_encoded'], weight=row['Pollution'], type=row['Type_encoded'])

# Function to predict pollution using encoded features directly
def predict_pollution(row):
    pollution = model.predict([row])[0]
    return max(pollution, 0)  # Ensure no negative predictions

# Find the shortest path based on predicted pollution using encoded features
def find_optimal_path(start_encoded, end_encoded):
    shortest_path = nx.shortest_path(G, source=start_encoded, target=end_encoded, weight='weight')
    total_pollution = 0
    path_details = []
    for i in range(len(shortest_path) - 1):
        s = shortest_path[i]
        e = shortest_path[i + 1]
        edge_data = G.get_edge_data(s, e)
        travel_type_encoded = edge_data['type']
        # Create the feature vector for prediction
        row = [s, e, travel_type_encoded, data['Hour'].mean(), data['Minute'].mean(), data['DayOfWeek'].mean(), data['OnBus'].mean()]
        pollution = predict_pollution(row)
        total_pollution += pollution
        path_details.append((s, e, travel_type_encoded, pollution))
    return path_details, total_pollution


In [None]:
#Beryllium, Titanium

import tkinter as tk
from tkinter import messagebox

def on_go_button_clicked():
    source = source_entry.get()
    destination = destination_entry.get()

    
    if not source or not destination:
        messagebox.showerror("Input Error", "Please enter both source and destination")
        return
    
    start_point_encoded = data[data['Start'] == source]['Start_encoded'].values[0]
    end_point_encoded = data[data['End'] == destination]['End_encoded'].values[0]
    print(f"Source encoded: {start_point_encoded}")
    print(f"Destination encoded: {end_point_encoded}")
    print(f"In this code we check the greenest path\n")
    optimal_path, total_pollution = find_optimal_path(start_point_encoded, end_point_encoded)
    print("Optimal Path (encoded):", optimal_path)
    print("Total Pollution:", total_pollution)

    # If you want to decode the optimal path to original values
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([data['Start'], data['End']]))
    decoded_path = [(label_encoder.inverse_transform([s])[0], label_encoder.inverse_transform([e])[0], t, tt) for s, e, t, tt in optimal_path]
    print("Optimal Path (decoded):", decoded_path)
    # Save inputs to variables
    result_text = f"Source: {source}\nDestination: {destination}\nIn this code we check the greenest path\nTherfore, the minimal pollution is: {total_pollution}\nThe path is: {decoded_path}"
    result_label.config(text=result_text)

# Create the main window
root = tk.Tk()
root.title("Greenest")

# Create and place the source label and entry
source_label = tk.Label(root, text="Source:")
source_label.grid(row=0, column=0, padx=10, pady=10)

source_entry = tk.Entry(root)
source_entry.grid(row=0, column=1, padx=10, pady=10)

# Create and place the destination label and entry
destination_label = tk.Label(root, text="Destination:")
destination_label.grid(row=1, column=0, padx=10, pady=10)

destination_entry = tk.Entry(root)
destination_entry.grid(row=1, column=1, padx=10, pady=10)

# Create and place the Go button
go_button = tk.Button(root, text="Go", command=on_go_button_clicked)
go_button.grid(row=3, columnspan=2, pady=10)

# Create and place the result label
result_label = tk.Label(root, text="", justify=tk.LEFT)
result_label.grid(row=4, columnspan=2, padx=10, pady=10)

# Run the main event loop
root.mainloop()

Source encoded: 8
Destination encoded: 87
In this code we check the greenest path

Optimal Path (encoded): [(8, 49, 1, 0.012420746), (49, 78, 1, 0.05009999), (78, 87, 1, 0.0004362382)]
Total Pollution: 0.06295697501627728
Optimal Path (decoded): [('Beryllium', 'Neon', 1, 0.012420746), ('Neon', 'Sulfur', 1, 0.05009999), ('Sulfur', 'Titanium', 1, 0.0004362382)]
