In [3]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import re
import os

In [5]:
# Load one table from every '*.traffic.htm' page in the data folder.
# Each table is stored under the part of its file name before the first dot.
folder_path = '../data/Traffic_data'
dataframes = {}

for entry in os.listdir(folder_path):
    if not entry.endswith('.traffic.htm'):
        continue  # ignore anything that is not a traffic page
    tables = pd.read_html(os.path.join(folder_path, entry))
    # Take the table at index 4 of the page, drop its first row and renumber from 0.
    dataframes[entry.split('.')[0]] = tables[4].iloc[1:].reset_index(drop=True)
# Drop the two leading rows of every loaded table, then stack what is left
# into one frame with a fresh 0..n-1 index.
modified_dfs = [frame.iloc[2:] for frame in dataframes.values()]
big_df = pd.concat(modified_dfs, ignore_index=True)

# The column labels are read from the second row of the last table in `dataframes`.
# NOTE(review): this presumes every table shares that header layout - confirm.
header_source = list(dataframes.values())[-1]
columns = header_source.iloc[1].tolist()

# Relabel selected positions: 5-7 get a "_2" suffixed name, 0 and 1 become Road / Name.
relabelled_positions = {
    5: "LRP_2",
    6: "Offset_2",
    7: "Chainage_2",
    0: 'Road',
    1: 'Name',
}
for position, label in relabelled_positions.items():
    columns[position] = label

big_df.columns = columns
# Mark every row of this frame as a road record.
big_df['type'] = 'road'


# Road identifiers: one of N, Z or R (either case) followed by digits, as a whole word.
# Compiled once here so the pattern is not rebuilt for every row.
_ROAD_PATTERN = re.compile(r'\b[nNzZrR]\d+\b')


def find_roads(name):
    """Return every road identifier mentioned in ``name``.

    Parameters
    ----------
    name : str or any
        Free text to scan. Non-string values (for example ``None`` or the
        float NaN that pandas uses for empty cells) are treated as text
        containing no identifiers.

    Returns
    -------
    list of str
        The matches in order of appearance, with their original casing;
        an empty list when nothing matches or ``name`` is not a string.
    """
    # Missing cells arrive as NaN/None; the regex engine would raise TypeError on them.
    if not isinstance(name, str):
        return []
    return _ROAD_PATTERN.findall(name)


# Tag every traffic row with the road identifiers found in its 'Name' text.
big_df['identified_roads'] = big_df['Name'].apply(find_roads)

bmms = pd.read_excel("../data/BMMS_overview.xlsx")

# 'base_road' is the part of the 'Road' value before the first '-'.
big_df['base_road'] = big_df['Road'].apply(lambda road_id: road_id.split('-')[0])

# Force both chainage columns to numbers; unparseable values become NaN.
big_df['Chainage'] = pd.to_numeric(big_df['Chainage'], errors='coerce')
bmms['chainage'] = pd.to_numeric(bmms['chainage'], errors='coerce')

# Keep only the bridge columns needed here, renamed to line up with big_df.
bmms_subset = bmms[['road', 'chainage', 'name', 'condition', 'lat', 'lon']].rename(
    columns={'road': 'base_road', 'chainage': 'Chainage', 'name': 'Name'}
)

# Give the bridge frame every column big_df has, filled with NA where it had none.
absent_columns = [col for col in big_df.columns if col not in bmms_subset.columns]
for col in absent_columns:
    bmms_subset[col] = pd.NA
# Mark every row of this frame as a bridge record.
bmms_subset['type'] = 'bridge'

# Stack road and bridge rows, order them by road and chainage, and renumber from 0.
combined_df = (
    pd.concat([big_df, bmms_subset], ignore_index=True)
    .sort_values(by=['base_road', 'Chainage'])
    .reset_index(drop=True)
)
roads = pd.read_csv('../../EPA133a-G2-A3/data/_roads3.csv')


def _nearest_road_coordinates(road_points, chainage):
    """Return ``(lat, lon)`` of the row in ``road_points`` closest to ``chainage``.

    Parameters
    ----------
    road_points : pandas.DataFrame or None
        Rows of ``roads`` for a single road, with 'chainage', 'lat' and 'lon'
        columns; ``None`` when the road has no reference points.
    chainage : float
        Position to look up along the road.

    Returns
    -------
    tuple or None
        ``(lat, lon)`` of the row with the smallest absolute chainage
        difference (the first such row on ties), or ``None`` when there is
        nothing to compare against (no points, or a missing chainage).
    """
    if road_points is None or pd.isna(chainage):
        return None
    # Ignore reference points whose own chainage is missing.
    distances = (road_points['chainage'] - chainage).abs().dropna()
    if distances.empty:
        return None
    nearest = road_points.loc[distances.idxmin()]
    return nearest['lat'], nearest['lon']


# Split the reference points per road once instead of re-filtering `roads` for every row.
points_by_road = {road_name: points for road_name, points in roads.groupby('road')}

# Fill in coordinates for every row that has no latitude, using the nearest
# reference point (by chainage) on the same base road. The whole road is
# searched, so the result does not depend on the row order of `roads`, and
# rows without a usable match stay missing instead of receiving a placeholder.
for index in combined_df.index[combined_df['lat'].isna()]:
    road = combined_df.at[index, 'base_road']
    if pd.isna(road):
        continue
    coordinates = _nearest_road_coordinates(
        points_by_road.get(road), combined_df.at[index, 'Chainage']
    )
    if coordinates is None:
        continue
    combined_df.at[index, 'lat'], combined_df.at[index, 'lon'] = coordinates

# Write the merged road + bridge table to disk, without the index column.
combined_df.to_csv('../data/traffic_df_with_bridges.csv', index=False)

# Work on a copy so the filtering below leaves combined_df untouched.
df = combined_df.copy()

# Keep rows whose 'Road' value does not contain "L"; rows with a missing
# 'Road' count as not containing it (na=False) and are therefore kept.
# NOTE(review): "L" presumably marks one direction/lane of a road - confirm.
road_contains_l = df["Road"].str.contains("L", na=False)
df_filtered = df[~road_contains_l]

# NOTE(review): unlike the export above, this file keeps the index column - confirm intended.
df_filtered.to_csv('../data/cleaned_df_A4.csv')