In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from datetime import datetime

## Q7

### loading the data and suit it to meet the requirements in the question

In [2]:
df = pd.read_csv('recorded times.csv')
test_df = pd.read_csv('Test.csv')

In [3]:
# Group by 'Start', convert 'End' to set to remove duplicates, then convert to list
# Creating a map object where key is a name of a location and value is a list of locations accessible from that key directly
walk_trips = df.drop(df[df.Type == 'Car'].index ,axis=0).groupby('Start')['End'].apply(lambda x: list(set(x))).to_dict()
car_trips = df.drop(df[df.Type == 'Pedestrian'].index ,axis=0).groupby('Start')['End'].apply(lambda x: list(set(x))).to_dict()

#### function that takes a date string and converts it to unix time integer

In [4]:
def convert_to_unix(date_str, mode):
    str_date = re.split(r'[/ :]', date_str)
    int_date = [np.int64(numeric_string) for numeric_string in str_date]
    
    # The order is: month, day, year, hour, minutes in test
    # Order in train is day, month, year, hour, minutes
    if mode == 'train':
        day, month, year, hour, minutes = int_date
    elif mode == 'test':
        month, day, year, hour, minutes = int_date
    dt = datetime(year, month, day, hour, minutes)
    
    unix_time = np.int64(time.mktime(dt.timetuple()))
    return unix_time

#### feature engineering, making sure everything is numerical

In [5]:
# turn date collumn to unix time (seconds)
df['Date'] = df['Date'].apply(lambda x: convert_to_unix(x, 'train'))
test_df['Date'] = test_df['Date'].apply(lambda x: convert_to_unix(x, 'test'))

# create a set (no duplicates) of trips and types (pedestrian/car)
unique_Type = set(df['Type'])
unique_locations = set(df['Start']).union(df['End'])

# Create a mapping from locations/type names to numbers
Type_to_numbers_map = {Type: j for j, Type in enumerate(unique_Type)}
locations_to_number_map = {location: i for i, location in enumerate(unique_locations)}

# Replace city/type names with numbers in the DataFrame
df['Start'] = df['Start'].map(locations_to_number_map)
df['End'] = df['End'].map(locations_to_number_map)
df['Type'] = df['Type'].map(Type_to_numbers_map)

t = df['Travel time'].to_numpy()
t = t.reshape(len(t), 1)
X = df.drop('Travel time', axis=1)

# get only pedestrians
t_pedest = t[np.where(df['Type'] == 1)]
X_pedest = X.drop(X[X.Type == 0].index, axis=0).drop('Type', axis=1).to_numpy()

# get only cars
t_cars = t[np.where(df['Type'] == 0)]
X_cars = X.drop(X[X.Type == 1].index, axis=0).drop('Type', axis=1).to_numpy()

# car - 0 , pedestrian - 1

### create helper functions to train the regressor

#### create X matrix according the polynomial degree

In [6]:
def create_X_matrix(X, d):
    X_matrix = np.ones((X.shape[0], 1))
    for degree in range(1, d + 1):
      X_matrix = np.hstack((X_matrix, X**degree))
    return X_matrix

#### train the regressor and return the coefficients

In [7]:
def calc_w(X_matrix, t):
    return np.linalg.inv(X_matrix.T@X_matrix)@X_matrix.T@t

#### approximate and return t

In [8]:
def approx_t(X_matrix, w): 
    t_approx = np.zeros((len(X_matrix), 1))
    for sample in range(len(X_matrix)):
        t_approx[sample] += w.T@X_matrix[sample]
    return t_approx

In [9]:
def approx_t_single(X_matrix, w):
    t_approx = w.T@X_matrix
    return t_approx[0]

#### calculate and return MSE for a set of samples

In [10]:
def calc_MSE(t_real, t_approx):
    return (np.sum((t_real - t_approx)**2))/len(t_real)

### train the regressor for pedestrians

In [17]:
X_mat = create_X_matrix(X_pedest, 2)
w = calc_w(X_mat, t_pedest)

### find and print best route for the test data for pedestrians

#### using DFS search to find all viable routes and saving the best one and time it took to reach the destination

In [18]:
def find_best_route(graph, start_loc, end_loc, start_date, path=[], current_t=0):
    global min_t
    global best_path
    path = path + [start_loc]
    
    if start_loc == end_loc:
        min_t = current_t
        best_path = path
        return
    
    if start_loc not in graph:
        min_t = np.inf
        best_path = ["Invalid starting location"]
        return
    
    for node in graph[start_loc]:
        if node not in path:
            start_map = locations_to_number_map[start_loc]
            end_map = locations_to_number_map[node]
            t_approximated = approx_t_single(create_X_matrix(np.array([[start_date, start_map, end_map]]), 2)[0], w)
            next_t = current_t + t_approximated
            if next_t >= min_t:
                continue
            new_routes = find_best_route(graph, node, end_loc, start_date+t_approximated, path, next_t)
    return

#### loop through the test data and print the results for pedestrians

In [19]:
print("Fastest route and approximate time for pedestrians for test data\n")
for i in range(len(test_df)):
    min_t = np.inf
    best_path = []
    find_best_route(walk_trips, test_df['Origin'][i], test_df['Destination'][i], test_df['Date'][i])
    print(f"Origin: {test_df['Origin'][i]}, Destination: {test_df['Destination'][i]}")
    print(f"The best route is: {best_path}")
    print(f"Approximated travel time for route: {min_t} seconds\n")

Fastest route and approximate time for pedestrians for test data

Origin: Lanthanum, Destination: Barium
The best route is: ['Lanthanum', 'Barium']
Approximated travel time for route: 407.56931984133644 seconds

Origin: Lutetium, Destination: Niobium
The best route is: ['Lutetium', 'Terbium', 'Praseodymium', 'Iodine', 'Silver', 'Niobium']
Approximated travel time for route: 2078.0818833627777 seconds

Origin: Cadmium, Destination: Thallium
The best route is: ['Cadmium', 'Silver', 'Iodine', 'Praseodymium', 'Terbium', 'Gadolinium', 'Ytterbium', 'Osmium', 'Lead', 'Thallium']
Approximated travel time for route: 3788.793293915408 seconds



## Q8

### retrain the regressor for cars

In [20]:
X_mat = create_X_matrix(X_cars, 2)
w = calc_w(X_mat, t_cars)

### print results for people traveling by car

In [21]:
print("Fastest route and approximate time for cars for test data\n")
for i in range(len(test_df)):
    min_t = np.inf
    best_path = []
    find_best_route(car_trips, test_df['Origin'][i], test_df['Destination'][i], test_df['Date'][i])
    print(f"Origin: {test_df['Origin'][i]}, Destination: {test_df['Destination'][i]}")
    print(f"The best route is: {best_path}")
    print(f"Approximated travel time for route: {min_t} seconds\n")

Fastest route and approximate time for cars for test data

Origin: Lanthanum, Destination: Barium
The best route is: ['Lanthanum', 'Barium']
Approximated travel time for route: 196.90567699567802 seconds

Origin: Lutetium, Destination: Niobium
The best route is: ['Lutetium', 'Terbium', 'Praseodymium', 'Iodine', 'Silver', 'Niobium']
Approximated travel time for route: 1013.1580943819591 seconds

Origin: Cadmium, Destination: Thallium
The best route is: ['Cadmium', 'Silver', 'Iodine', 'Praseodymium', 'Terbium', 'Lutetium', 'Ytterbium', 'Osmium', 'Lead', 'Thallium']
Approximated travel time for route: 1853.5007190812335 seconds

