This notebook creates a csv file in the necessary format.

In [1]:
import pandas as pd
from scipy.spatial import distance

# load raw datafiles into DataFrames
roads = pd.read_csv("../data/raw/_roads3.csv")
bridges = pd.read_excel('../data/raw/BMMS_overview.xlsx', engine='openpyxl')

In [2]:
#--- Identifying side roads of N1 and N2

# Only keep N1 and N2
df = roads.loc[(roads["road"] == "N1") | (roads["road"] == "N2")]

# List of all roads
Roads = roads['road'].unique().tolist()

# List of all crossroads and side roads (in ugly format)
names = df['name'].tolist()

# Find roads that appear in the crossroads / sideroads 
side_roads = [road for road in Roads if any(road in name for name in names)]
print("there are", len(Roads), " roads, and ", len(side_roads), "side roads to N1 and N2")

there are 847  roads, and  103 side roads to N1 and N2


In [3]:
#--- Filtering for side roads over 25km long

# Take all side roads
sideroads_df = roads[roads['road'].isin(side_roads)]

# Filter for those over 25km long (aka LRPE had chainage >25)
sideroads_ends = sideroads_df.loc[(sideroads_df["lrp"] == "LRPE") & (sideroads_df["chainage"] > 25)]
sideroads_tokeep = sideroads_ends['road'].tolist()

# Apply filter
sideroads_df = sideroads_df.loc[(sideroads_df['road'].isin(sideroads_tokeep))]

In [4]:
#--- Adding a length to each road segment

# Create the length column
sideroads_df["length"] = sideroads_df["chainage"]*1000 
sideroads_df = sideroads_df.reset_index(drop=True)

# Change the chainage to a length
for i in range((len(sideroads_df)-1),0,-1):
    if sideroads_df["road"][i] == sideroads_df["road"][i-1]:
        sideroads_df.loc[i,"length"] = sideroads_df.loc[i,"length"]-sideroads_df.loc[i-1,"length"]

In [5]:
#--- Moving on to bridges!

# Filter for relevant columns and relevant roads
bridges_relevant = bridges[["road", "LRPName", "condition", "length", "chainage", "lat", "lon", 'name', 'km','constructionYear']]
bridges_relevant = bridges_relevant.loc[bridges['road'].isin(side_roads)]
bridges_relevant = bridges_relevant.reset_index(drop = True)

bridgestemp = bridges_relevant # useful in a second

# Only keep right side of each bridge
for i in range(1,len(bridges_relevant)):
    if len(str(bridges_relevant["name"][i])) > 4:
        if bridges_relevant["name"][i][-2:] == 'L)' or bridges_relevant["name"][i][-4:] == 'eft)' or bridges_relevant["name"][i][-3:] == 'L )' or bridges_relevant["name"][i][-4:] == 'EFT)':
            bridgestemp = bridgestemp.drop(i,axis = 0)

# Delete depulicates by removing older information
#  assumption: no 2 lrps in the same road have the exact same km
bridges_relevant = bridgestemp \
    .sort_values(by=['road','km','constructionYear'], ascending=False) \
    .drop_duplicates(subset=['road', 'km'], keep='first')


In [6]:
#--- Bringing relevant roads and bridges together 

# Prepare merge
bridges_relevant = bridges_relevant.rename(columns={"LRPName": "lrp"})

# Merge
merged = pd.merge(sideroads_df, bridges_relevant, how="outer", on=["road", "lrp"])
merged = merged.reset_index(drop = True)

# Add model_type
merged["model_type"] = merged["lrp"].apply(lambda x: "sourcesink" if x == "LRPS" else ("sourcesink" if x == "LRPE" else "link"))
merged.loc[merged["condition"].notnull(), "model_type"] = "bridge"

# Fill in missing data
merged["chainage_x"] = merged["chainage_x"].fillna(value=merged["chainage_y"])
merged["lat_x"] = merged["lat_x"].fillna(value=merged["lat_y"])
merged["lon_x"] = merged["lon_x"].fillna(value=merged["lon_y"])
merged["name_y"] = merged["name_y"].fillna(value=merged["name_x"])
merged["length_x"] = merged["length_x"].fillna(value=merged["length_y"])

# Keep and rename useful columns only
merged = merged.sort_values(by=['road','chainage_x'],ascending = True)
col_tokeep = ["road", "model_type", "lrp", "name_y", "lat_x", "lon_x", "length_x", "condition","type"]
merged = merged[col_tokeep]
merged = merged.rename(columns={"name_y": "name","lat_x" : "lat", "lon_x" : "lon", "length_x" : "length"})
merged = merged.reset_index(drop = True)

# Add ids
merged["id"] =  range(1000000, len(merged) +1000000 )

In [7]:
#--- Defining the intersections

# subset the dataframe to only the rows that indicate an intersection
cross_sideroads = merged.loc[(merged['type'].str.contains("CrossRoad"))| (merged['type'].str.contains("SideRoad"))]
cross_sideroads = cross_sideroads.reset_index()
cross_sideroads

roads_done = []
# iterate over all road rows that are crossroads or sideroads
for i,crossrow in cross_sideroads.iterrows():
    # check if any of those roads indicate in their name that they intersect with a road that crosses the N1 and/or N2
    for j in sideroads_tokeep:
        if j in str(crossrow["name"]):
            # if you are not trying one intersection that you already did the other way around
            if i not in roads_done:
                found = False
                # iterate over all points in that intersecting road to see if any of those indicate to be an intersection with the road at hand
                # since a road can cross another road on multiple occasions, we only look as far as 1 lat and 1 lon
                for k,roadrow in merged[merged['road'] == j].iterrows():
                    if crossrow["name"] in roadrow["name"] and (roadrow["lat"] - crossrow["lat"] < 1) and (roadrow["lon"] - crossrow["lon"] < 1) and crossrow["road"] != roadrow["road"]:
                        # if this finds the intersection point, make its id, lat and lon the same and make the model_type "intersection"
                        merged.iloc[k,9] = cross_sideroads.iloc[i,10]
                        merged.iloc[k,4] = cross_sideroads.iloc[i,5]
                        merged.iloc[k,5] = cross_sideroads.iloc[i,6]
                        merged.iloc[k,1] = "intersection"
                        # save the fact that you have handled this intersection
                        found = True
                        roads_done.append(i)
                # if you didnt find the intersecting road through the name, assign the closest road point as the intersection
                # assumption: projection is so small that it will not distort the distance to much so we can use euclidean distance to determine the closest point
                if found == False:
                    closestrow = merged.iloc[[0]]
                    closestindex = 0
                    closestdistance = 1000
                    p1 = (crossrow["lat"],crossrow["lon"])
                    # calculate for each road point the distance to the intersection, save it if it is closer than what you found before
                    for k,roadrow in merged[merged['road'] == j].iterrows():
                        p2 = (roadrow["lat"],roadrow["lon"])
                        if distance.euclidean(p1, p2) < closestdistance:
                            closestrow = roadrow
                            closestindex = k
                            closestdistance = distance.euclidean(p1, p2)
                    # of the closest point, make the id, lat and lon the same and make the model_type "intersection"
                    merged.iloc[closestindex,9] = cross_sideroads.iloc[i,10]
                    merged.iloc[closestindex,4] = cross_sideroads.iloc[i,5]
                    merged.iloc[closestindex,5] = cross_sideroads.iloc[i,6]
                    merged.iloc[closestindex,1] = "intersection"
                    # save the fact that you have handled this intersection
                    roads_done.append(i)         

In [8]:
# show it worked
merged.iloc[690:700,:]

Unnamed: 0,road,model_type,lrp,name,lat,lon,length,condition,type,id
690,N1,link,LRP259b,Road to Kaliganj (Z1070),22.309694,91.91575,424.0,,CrossRoad,1000690
691,N1,link,LRP260,Box culvert,22.311417,91.917638,273.0,,KmPost,1000691
692,N1,bridge,LRP260a,PALYA BUS STATION,22.312472,91.918833,168.0,A,Culvert,1000692
693,N1,link,LRP260b,Box culvert,22.315639,91.922499,521.0,,Culvert,1000693
694,N1,intersection,LRP260c,Road to Kalurghat (N107),22.374638,91.906278,121.0,,"SideRoad,Left",1006716
695,N1,bridge,LRP252c,UTTOR SONAPUR,22.315747,91.924352,3.0,A,,1000695
696,N1,link,LRP261,Km post missing,22.315556,91.925388,190.0,,KmPost,1000696
697,N1,bridge,LRP261a,SRIMI BRIDGE,22.313778,91.932222,750.0,C,Culvert,1000697
698,N1,link,LRP262,Km post missing,22.31375,91.934527,250.0,,KmPost,1000698
699,N1,link,LRP263,Km post missing,22.313639,91.943889,1000.0,,KmPost,1000699


In [9]:
#--- Adapting to the newest csv guidelines

# Adapting names
#move bridge names to a new column 
merged["bridge_name"] = merged["name"].loc[merged['model_type'] == "bridge"]
#delete names for everything and replace that of SourceSinks according to convention
i = 1 # useful in a second
for index, row in merged.iterrows():
    if not row['model_type'] == "sourcesink":
        merged["name"][index] = ""
        
    elif row['model_type'] == "sourcesink":
        merged["name"][index] = "SoSi" + str(i)
        merged["condition"][index] = ""
        i += 1

# Put columns in right order
merged = merged[["road", "id", "model_type", "condition", "name", "lat", "lon", "length", "bridge_name"]]
merged.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["name"][index] = "SoSi" + str(i)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["condition"][index] = ""
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["name"][index] = ""


Unnamed: 0,road,id,model_type,condition,name,lat,lon,length,bridge_name
0,N1,1000000,sourcesink,,SoSi1,23.706028,90.443333,0.0,
1,N1,1000001,link,,,23.702917,90.450417,814.0,
2,N1,1000002,link,,,23.702778,90.450472,8.0,
3,N1,1000003,link,,,23.702139,90.451972,178.0,
4,N1,1000004,bridge,A,,23.698739,90.458861,11.3,.


In [10]:
# Save to csv 
merged.to_csv("processed/allroads.csv", index=None)