In [3]:
import pandas as pd
import numpy as np

In [31]:
df = pd.read_csv('dataset-3.csv')

In [32]:
df.head()

Unnamed: 0,id_start,id_end,distance
0,1001400,1001402,9.7
1,1001402,1001404,20.2
2,1001404,1001406,16.0
3,1001406,1001408,21.7
4,1001408,1001410,11.1


In [40]:
df.shape

(44, 3)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id_start  44 non-null     int64  
 1   id_end    44 non-null     int64  
 2   distance  44 non-null     float64
dtypes: float64(1), int64(2)
memory usage: 1.2 KB


In [44]:
def calculate_distance_matrix(file_path):
    df = pd.read_csv(file_path)
    columns = sorted(set(df['id_start'].unique()) | set(df['id_end'].unique()))
    distance_matrix = pd.DataFrame(index=columns, columns=columns)
    distance_matrix = distance_matrix.fillna(0)
    for index, row in df.iterrows():
        start_id, end_id, distance = row['id_start'], row['id_end'], row['distance']
        distance_matrix.at[start_id, end_id] += distance
        distance_matrix.at[end_id, start_id] += distance
    for column in columns:
        distance_matrix.at[column, column] = 0
    return distance_matrix
file_path = r'C:\Users\win10\submissions\dataset-3.csv'
resulting_matrix = calculate_distance_matrix(file_path)
print(resulting_matrix)

         1001400  1001402  1001404  1001406  1001408  1001410  1001412  \
1001400      0.0      9.7      0.0      0.0      0.0      0.0      0.0   
1001402      9.7      0.0     20.2      0.0      0.0      0.0      0.0   
1001404      0.0     20.2      0.0     16.0      0.0      0.0      0.0   
1001406      0.0      0.0     16.0      0.0     21.7      0.0      0.0   
1001408      0.0      0.0      0.0     21.7      0.0     11.1      0.0   
1001410      0.0      0.0      0.0      0.0     11.1      0.0     15.6   
1001412      0.0      0.0      0.0      0.0      0.0     15.6      0.0   
1001414      0.0      0.0      0.0      0.0      0.0      0.0     18.2   
1001416      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1001418      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1001420      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1001422      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1001424      0.0      0.0      0.0    

In [48]:
def unroll_distance_matrix(distance_matrix):
    # Create an empty DataFrame for unrolled distances
    unrolled_df = pd.DataFrame(columns=['id_start', 'id_end', 'distance'])
    for i in range(len(distance_matrix.index)):
        for j in range(i + 1, len(distance_matrix.columns)):
            id_start = distance_matrix.index[i]
            id_end = distance_matrix.columns[j]
            distance = distance_matrix.at[id_start, id_end]
            unrolled_df = pd.concat([unrolled_df, pd.DataFrame([[id_start, id_end, distance]],
                                                                columns=['id_start', 'id_end', 'distance'])],ignore_index=True)
    return unrolled_df
file_path = r'C:\Users\win10\submissions\dataset-3.csv'
distance_matrix = calculate_distance_matrix(file_path)
unrolled_distances = unroll_distance_matrix(distance_matrix)
print(unrolled_distances)

    id_start   id_end  distance
0    1001400  1001402       9.7
1    1001400  1001404       0.0
2    1001400  1001406       0.0
3    1001400  1001408       0.0
4    1001400  1001410       0.0
..       ...      ...       ...
898  1001488  1004355       0.0
899  1001488  1004356       4.0
900  1004354  1004355       2.0
901  1004354  1004356       2.0
902  1004355  1004356       0.0

[903 rows x 3 columns]


In [52]:
def find_ids_within_ten_percentage_threshold(df, reference_value):
    reference_df = df[df['id_start'] == reference_value]
    if reference_df.empty:
        print(f"No data found for reference value {reference_value}.")
        return []
    average_distance = reference_df['distance'].mean()
    print(f"Reference Value: {reference_value}")
    print(f"Average Distance: {average_distance}")
    lower_bound = average_distance * 0.9
    upper_bound = average_distance * 1.1
    print(f"Lower Bound: {lower_bound}")
    print(f"Upper Bound: {upper_bound}")
    filtered_df = df[(df['distance'] >= lower_bound) & (df['distance'] <= upper_bound)]
    result_ids = sorted(filtered_df['id_start'].unique())
    return result_ids
file_path = 'path/to/dataset-2.csv' 
unrolled_distances = unroll_distance_matrix(distance_matrix)
reference_value = 1001400  
result_ids = find_ids_within_ten_percentage_threshold(unrolled_distances, reference_value)
print(result_ids)

Reference Value: 1001400
Average Distance: 0.23095238095238094
Lower Bound: 0.20785714285714285
Upper Bound: 0.2540476190476191
[]


In [53]:
def calculate_toll_rate(df):
    rate_coefficients = {
        'moto': 0.8,
        'car': 1.2,
        'rv': 1.5,
        'bus': 2.2,
        'truck': 3.6
    }
    for vehicle_type, rate in rate_coefficients.items():
        df[vehicle_type] = df['distance'] * rate
    return df
file_path = 'path/to/dataset-2.csv'  # Use the path to the file generated in Question 2
unrolled_distances = unroll_distance_matrix(distance_matrix)
toll_rates_df = calculate_toll_rate(unrolled_distances)
print(toll_rates_df)

    id_start   id_end  distance  moto    car     rv    bus  truck
0    1001400  1001402       9.7  7.76  11.64  14.55  21.34  34.92
1    1001400  1001404       0.0  0.00   0.00   0.00   0.00   0.00
2    1001400  1001406       0.0  0.00   0.00   0.00   0.00   0.00
3    1001400  1001408       0.0  0.00   0.00   0.00   0.00   0.00
4    1001400  1001410       0.0  0.00   0.00   0.00   0.00   0.00
..       ...      ...       ...   ...    ...    ...    ...    ...
898  1001488  1004355       0.0  0.00   0.00   0.00   0.00   0.00
899  1001488  1004356       4.0  3.20   4.80   6.00   8.80  14.40
900  1004354  1004355       2.0  1.60   2.40   3.00   4.40   7.20
901  1004354  1004356       2.0  1.60   2.40   3.00   4.40   7.20
902  1004355  1004356       0.0  0.00   0.00   0.00   0.00   0.00

[903 rows x 8 columns]


In [62]:
from datetime import time
def calculate_time_based_toll_rates(df):
    time_ranges = [
        (time(0, 0, 0), time(10, 0, 0), 0.8),
        (time(10, 0, 0), time(18, 0, 0), 1.2),
        (time(18, 0, 0), time(23, 59, 59), 0.8)
    ]
    result_df = pd.DataFrame()

    for start_time, end_time, discount_factor in time_ranges:
        df['discount_factor'] = discount_factor
        df['start_day'] = 'Monday'  
        df['start_time'] = start_time
        df['end_day'] = 'Sunday'  
        df['end_time'] = end_time
        result_df = pd.concat([result_df, df], ignore_index=True)
    result_df.loc[result_df['start_day'].isin(['Saturday', 'Sunday']), 'discount_factor'] = 0.7
    return result_df[['id_start', 'id_end', 'distance', 'start_day', 'start_time', 'end_day', 'end_time', 'discount_factor']]
file_path = r'C:\Users\win10\submissions\dataset-3.csv'
time_based_toll_rates_df = calculate_time_based_toll_rates(pd.read_csv(file_path))
print(time_based_toll_rates_df)

     id_start   id_end  distance start_day start_time end_day  end_time  \
0     1001400  1001402       9.7    Monday   00:00:00  Sunday  10:00:00   
1     1001402  1001404      20.2    Monday   00:00:00  Sunday  10:00:00   
2     1001404  1001406      16.0    Monday   00:00:00  Sunday  10:00:00   
3     1001406  1001408      21.7    Monday   00:00:00  Sunday  10:00:00   
4     1001408  1001410      11.1    Monday   00:00:00  Sunday  10:00:00   
..        ...      ...       ...       ...        ...     ...       ...   
127   1001462  1001464      26.7    Monday   18:00:00  Sunday  23:59:59   
128   1001464  1001466       8.5    Monday   18:00:00  Sunday  23:59:59   
129   1001466  1001468      10.7    Monday   18:00:00  Sunday  23:59:59   
130   1001468  1001470      10.6    Monday   18:00:00  Sunday  23:59:59   
131   1001470  1001472      16.0    Monday   18:00:00  Sunday  23:59:59   

     discount_factor  
0                0.8  
1                0.8  
2                0.8  
3      