In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv(r"C:\tt\dataset-3.csv")

In [3]:
df

Unnamed: 0,id_start,id_end,distance
0,1001400,1001402,9.7
1,1001402,1001404,20.2
2,1001404,1001406,16.0
3,1001406,1001408,21.7
4,1001408,1001410,11.1
5,1001410,1001412,15.6
6,1001412,1001414,18.2
7,1001414,1001416,13.2
8,1001416,1001418,13.6
9,1001418,1001420,12.9


# Question 1: Distance Matrix Calculation

In [4]:
import pandas as pd
import networkx as nx

def calculate_distance_matrix(df):
    # Create a graph using networkx
    G = nx.Graph()

    for _, row in df.iterrows():
        edge_attrs = {'distance': row['distance']}
        G.add_edge(row['id_start'], row['id_end'], **edge_attrs)

    distance_matrix = nx.floyd_warshall_numpy(G, weight='distance')

    distance_matrix = (distance_matrix + distance_matrix.T) / 2

    for i in range(len(distance_matrix)):
        distance_matrix[i, i] = 0

    result_df = pd.DataFrame(distance_matrix, index=G.nodes, columns=G.nodes)

    return result_df

In [5]:
calculate_distance_matrix(df)

Unnamed: 0,1001400.0,1001402.0,1001404.0,1001406.0,1001408.0,1001410.0,1001412.0,1001414.0,1001416.0,1001418.0,...,1001456.0,1001458.0,1001460.0,1001461.0,1001462.0,1001464.0,1001466.0,1001468.0,1001470.0,1001472.0
1001400.0,0.0,9.7,29.9,45.9,67.6,78.7,94.3,112.5,125.7,139.3,...,339.9,348.8,353.9,366.7,371.8,398.5,407.0,417.7,428.3,444.3
1001402.0,9.7,0.0,20.2,36.2,57.9,69.0,84.6,102.8,116.0,129.6,...,330.2,339.1,344.2,357.0,362.1,388.8,397.3,408.0,418.6,434.6
1001404.0,29.9,20.2,0.0,16.0,37.7,48.8,64.4,82.6,95.8,109.4,...,310.0,318.9,324.0,336.8,341.9,368.6,377.1,387.8,398.4,414.4
1001406.0,45.9,36.2,16.0,0.0,21.7,32.8,48.4,66.6,79.8,93.4,...,294.0,302.9,308.0,320.8,325.9,352.6,361.1,371.8,382.4,398.4
1001408.0,67.6,57.9,37.7,21.7,0.0,11.1,26.7,44.9,58.1,71.7,...,272.3,281.2,286.3,299.1,304.2,330.9,339.4,350.1,360.7,376.7
1001410.0,78.7,69.0,48.8,32.8,11.1,0.0,15.6,33.8,47.0,60.6,...,261.2,270.1,275.2,288.0,293.1,319.8,328.3,339.0,349.6,365.6
1001412.0,94.3,84.6,64.4,48.4,26.7,15.6,0.0,18.2,31.4,45.0,...,245.6,254.5,259.6,272.4,277.5,304.2,312.7,323.4,334.0,350.0
1001414.0,112.5,102.8,82.6,66.6,44.9,33.8,18.2,0.0,13.2,26.8,...,227.4,236.3,241.4,254.2,259.3,286.0,294.5,305.2,315.8,331.8
1001416.0,125.7,116.0,95.8,79.8,58.1,47.0,31.4,13.2,0.0,13.6,...,214.2,223.1,228.2,241.0,246.1,272.8,281.3,292.0,302.6,318.6
1001418.0,139.3,129.6,109.4,93.4,71.7,60.6,45.0,26.8,13.6,0.0,...,200.6,209.5,214.6,227.4,232.5,259.2,267.7,278.4,289.0,305.0


# Question 2: Unroll Distance Matrix

In [6]:
import pandas as pd

def unroll_distance_matrix(distance_matrix_df):
    unrolled_df = distance_matrix_df.stack().reset_index()

    unrolled_df.columns = ['id_start', 'id_end', 'distance']

    unrolled_df = unrolled_df[unrolled_df['id_start'] != unrolled_df['id_end']]

    unrolled_df.reset_index(drop=True, inplace=True)

    return unrolled_df

In [7]:
distance_matrix_df = calculate_distance_matrix(df)

In [8]:
unrolled_result = unroll_distance_matrix(distance_matrix_df)

In [9]:
unrolled_result

Unnamed: 0,id_start,id_end,distance
0,1001400.0,1001402.0,9.7
1,1001400.0,1001404.0,29.9
2,1001400.0,1001406.0,45.9
3,1001400.0,1001408.0,67.6
4,1001400.0,1001410.0,78.7
...,...,...,...
1801,1001472.0,1001462.0,72.5
1802,1001472.0,1001464.0,45.8
1803,1001472.0,1001466.0,37.3
1804,1001472.0,1001468.0,26.6


# Question 3: Finding IDs within Percentage Threshold

In [10]:
import pandas as pd

def find_ids_within_ten_percentage_threshold(df, reference_value):
    # Filter the DataFrame for the given reference value
    reference_df = df[df['id_start'] == reference_value]

    # Calculate the average distance for the reference value
    avg_distance = reference_df['distance'].mean()

    # Calculate the lower and upper bounds of the threshold
    lower_bound = avg_distance * 0.9
    upper_bound = avg_distance * 1.1

    # Filter values within the 10% threshold
    within_threshold = df[(df['id_start'] != reference_value) & (df['distance'] >= lower_bound) & (df['distance'] <= upper_bound)]

    # Get the unique values from the 'id_start' column and sort them
    result_list = sorted(within_threshold['id_start'].unique())

    return result_list


In [11]:
distance_matrix_df

Unnamed: 0,1001400.0,1001402.0,1001404.0,1001406.0,1001408.0,1001410.0,1001412.0,1001414.0,1001416.0,1001418.0,...,1001456.0,1001458.0,1001460.0,1001461.0,1001462.0,1001464.0,1001466.0,1001468.0,1001470.0,1001472.0
1001400.0,0.0,9.7,29.9,45.9,67.6,78.7,94.3,112.5,125.7,139.3,...,339.9,348.8,353.9,366.7,371.8,398.5,407.0,417.7,428.3,444.3
1001402.0,9.7,0.0,20.2,36.2,57.9,69.0,84.6,102.8,116.0,129.6,...,330.2,339.1,344.2,357.0,362.1,388.8,397.3,408.0,418.6,434.6
1001404.0,29.9,20.2,0.0,16.0,37.7,48.8,64.4,82.6,95.8,109.4,...,310.0,318.9,324.0,336.8,341.9,368.6,377.1,387.8,398.4,414.4
1001406.0,45.9,36.2,16.0,0.0,21.7,32.8,48.4,66.6,79.8,93.4,...,294.0,302.9,308.0,320.8,325.9,352.6,361.1,371.8,382.4,398.4
1001408.0,67.6,57.9,37.7,21.7,0.0,11.1,26.7,44.9,58.1,71.7,...,272.3,281.2,286.3,299.1,304.2,330.9,339.4,350.1,360.7,376.7
1001410.0,78.7,69.0,48.8,32.8,11.1,0.0,15.6,33.8,47.0,60.6,...,261.2,270.1,275.2,288.0,293.1,319.8,328.3,339.0,349.6,365.6
1001412.0,94.3,84.6,64.4,48.4,26.7,15.6,0.0,18.2,31.4,45.0,...,245.6,254.5,259.6,272.4,277.5,304.2,312.7,323.4,334.0,350.0
1001414.0,112.5,102.8,82.6,66.6,44.9,33.8,18.2,0.0,13.2,26.8,...,227.4,236.3,241.4,254.2,259.3,286.0,294.5,305.2,315.8,331.8
1001416.0,125.7,116.0,95.8,79.8,58.1,47.0,31.4,13.2,0.0,13.6,...,214.2,223.1,228.2,241.0,246.1,272.8,281.3,292.0,302.6,318.6
1001418.0,139.3,129.6,109.4,93.4,71.7,60.6,45.0,26.8,13.6,0.0,...,200.6,209.5,214.6,227.4,232.5,259.2,267.7,278.4,289.0,305.0


In [12]:
reference_value = 12

In [13]:
result = find_ids_within_ten_percentage_threshold(unrolled_result, reference_value)