In [1]:
import fnmatch # this will be used to identify all the traffic htm files
import os
import pandas as pd
import time


# Wall-clock timestamp taken at the start of the run; the elapsed time is derived from it
start_time = time.time()

In [2]:
# General settings

# Folder that holds the htm files with the traffic data
directory_traffic_files = '../data/traffic files'

# Display floats with a thousands separator and 2 decimal places
pd.set_option("display.float_format", "{:,.2f}".format)

# Empty dataframe that the data of every traffic file gets appended to
appended_data_traffic = pd.DataFrame()


In [3]:
# Getting all traffic files from the traffic files directory.
# os.listdir() returns entries in arbitrary order, so the matches are sorted to keep
# the processing order (and therefore the row order of the final dataframe) reproducible.

pattern = '*.traffic.htm' # file name pattern that identifies the traffic files
traffic_files_list = sorted(fnmatch.filter(os.listdir(directory_traffic_files), pattern))

Checking which files we have in the directory

In [4]:
# Number of traffic files found in the directory
len(traffic_files_list)

8

In [5]:
# Names of the traffic files that will be processed
traffic_files_list

['N1.traffic.htm',
 'N102.traffic.htm',
 'N104.traffic.htm',
 'N105.traffic.htm',
 'N2.traffic.htm',
 'N204.traffic.htm',
 'N207.traffic.htm',
 'N208.traffic.htm']

Going through every file in the directory to collect its traffic data and append it to a common dataframe

In [6]:
# Final names for each column of the combined dataframe
# (the 26 columns of the source table followed by the derived 'Road' column)

column_names = [
    'Link no', 'Name',
    'Start location-LRP', 'Start location-Offset', 'Start location-Chainage',
    'End location-LRP', 'End location-Offset', 'End location-Chainage',
    'Length-(Km)',
    'Traffic Data-Heavy Truck', 'Traffic Data-Medium Truck', 'Traffic Data-Small Truck',
    'Traffic Data-Large Bus', 'Traffic Data-Medium Bus', 'Traffic Data-Micro Bus',
    'Traffic Data-Utility', 'Traffic Data-Car', 'Traffic Data-Auto Rickshaw',
    'Traffic Data-Motor Cycle', 'Traffic Data-Bi-Cycle', 'Traffic Data-Cycle Rickshaw',
    'Traffic Data-Cart',
    'Total-Motorized', 'Total-Non Motorized', 'Total-Total AADT',
    'Traffic-(AADT)', 'Road',
]

# Position of the traffic table among the tables parsed from each htm file
TRAFFIC_TABLE_INDEX = 4
# Number of leading rows of that table that do not hold link data and are dropped
HEADER_ROWS = 3


def read_traffic_table(file_path: str) -> pd.DataFrame:
    """Read the traffic table of one htm file and return it with the final column names.

    Parameters
    ----------
    file_path : str
        Path of the htm file to parse.

    Returns
    -------
    pandas.DataFrame
        The data rows of the traffic table, labelled with ``column_names``; the last
        column, 'Road', holds the part of the link number before the first '-'.

    Raises
    ------
    IndexError
        If the file holds fewer than ``TRAFFIC_TABLE_INDEX + 1`` tables.
    ValueError
        If the table width does not match ``column_names``.
    """
    with open(file_path, 'r') as f:
        # The open handle is passed directly: handing literal HTML text to read_html is deprecated
        tables = pd.read_html(f)

    # Drop the leading non-data rows; .copy() detaches the slice so that adding a
    # column below neither touches the parsed table nor triggers SettingWithCopyWarning
    table = tables[TRAFFIC_TABLE_INDEX].iloc[HEADER_ROWS:].copy()

    # The first column holds the link number (e.g. 'N1-1L'); the road is the part before the '-'
    table["Road"] = table.iloc[:, 0].str.split('-').str[0]

    # Every column is relabelled at once, so the header row of the source table is not needed
    table.columns = column_names
    return table


# Going through each file of the traffic files directory. The frames are collected in a
# list and concatenated once: DataFrame.append was removed in pandas 2.0, and rebuilding the
# result here (instead of growing an existing frame) keeps the cell safe to re-run.

traffic_frames = [
    read_traffic_table(os.path.join(directory_traffic_files, elem))
    for elem in traffic_files_list
]

# pd.concat raises on an empty list, so fall back to an empty frame with the final columns
if traffic_frames:
    appended_data_traffic = pd.concat(traffic_frames, ignore_index=True)
else:
    appended_data_traffic = pd.DataFrame(columns=column_names)
    

In [7]:
# Previewing the first rows of the combined dataframe

appended_data_traffic.head()

Unnamed: 0,Link no,Name,Start location-LRP,Start location-Offset,Start location-Chainage,End location-LRP,End location-Offset,End location-Chainage,Length-(Km),Traffic Data-Heavy Truck,...,Traffic Data-Auto Rickshaw,Traffic Data-Motor Cycle,Traffic Data-Bi-Cycle,Traffic Data-Cycle Rickshaw,Traffic Data-Cart,Total-Motorized,Total-Non Motorized,Total-Total AADT,Traffic-(AADT),Road
0,N1-1L,Jatrabari - Int.with Z1101 (Left) (Left),LRPS,0,0.0,LRPS,822,0.822,0.822,402.0,...,2980.0,398.0,232.0,889.0,0.0,18236.0,1121.0,19357.0,19357.0,N1
1,N1-1R,Jatrabari - Int.with Z1101 (Left) (Right),LRPS,0,0.0,LRPS,822,0.822,0.822,660.0,...,2508.0,436.0,213.0,1088.0,0.0,20236.0,1301.0,21537.0,21537.0,N1
2,N1-2L,Int.with Z1101 - Signboard (Left) R111 (Left),LRPS,822,0.822,LRPS,4175,4.175,3.353,660.0,...,2508.0,436.0,213.0,1088.0,0.0,20236.0,1301.0,21537.0,21537.0,N1
3,N1-2R,Int.with Z1101 - Signboard (Left) R111 (Right),LRPS,822,0.822,LRPS,4175,4.175,3.353,402.0,...,2980.0,398.0,232.0,889.0,0.0,18236.0,1121.0,19357.0,19357.0,N1
4,N1-3L,Signboard - Shimrail (Left)R110 (Left),LRPS,4175,4.175,LRPS,7181,7.181,3.006,91.0,...,2266.0,1087.0,75.0,1198.0,0.0,16288.0,1273.0,17561.0,17561.0,N1


In [8]:
# Previewing the last rows of the combined dataframe

appended_data_traffic.tail()

Unnamed: 0,Link no,Name,Start location-LRP,Start location-Offset,Start location-Chainage,End location-LRP,End location-Offset,End location-Chainage,Length-(Km),Traffic Data-Heavy Truck,...,Traffic Data-Auto Rickshaw,Traffic Data-Motor Cycle,Traffic Data-Bi-Cycle,Traffic Data-Cycle Rickshaw,Traffic Data-Cart,Total-Motorized,Total-Non Motorized,Total-Total AADT,Traffic-(AADT),Road
168,N208-1,Moulvibazar-Int.with Z2002,LRPS,0,0.0,LRPS,3190,3.19,3.19,15.0,...,3194.0,1123.0,147.0,486.0,0.0,7514.0,633.0,8147.0,8147.0,N208
169,N208-2,Int.with Z2002-Rajnagar (Int.with R281),LRPS,3190,3.19,LRP012,1661,13.461,10.271,15.0,...,3194.0,1123.0,147.0,486.0,0.0,7514.0,633.0,8147.0,8147.0,N208
170,N208-3,Rajnagar(Int.with R281)-Daudabad (Int.with Z2832),LRP012,1661,13.461,LRP052,385,51.05,37.589,0.0,...,2233.0,582.0,88.0,73.0,0.0,5328.0,161.0,5489.0,5489.0,N208
171,N208-4,Daudabad (Int.with Z2832)-Royal City Chottor(int.with N210),LRP052,385,51.05,LRP054,677,53.342,2.292,0.0,...,2233.0,582.0,88.0,73.0,0.0,5328.0,161.0,5489.0,5489.0,N208
172,N208-5,Royal City Chottor(int.with N210)-Int.with N2,LRP054,677,53.342,LRP058,900,57.495,4.153,39.0,...,2974.0,1231.0,695.0,822.0,47.0,9488.0,1564.0,11052.0,11052.0,N208


In [9]:
# Size of the combined dataframe as (rows, columns)

appended_data_traffic.shape

(173, 27)

In [10]:
# Data type of every column of the combined dataframe

appended_data_traffic.dtypes

Link no                        object
Name                           object
Start location-LRP             object
Start location-Offset          object
Start location-Chainage        object
End location-LRP               object
End location-Offset            object
End location-Chainage          object
Length-(Km)                    object
Traffic Data-Heavy Truck       object
Traffic Data-Medium Truck      object
Traffic Data-Small Truck       object
Traffic Data-Large Bus         object
Traffic Data-Medium Bus        object
Traffic Data-Micro Bus         object
Traffic Data-Utility           object
Traffic Data-Car               object
Traffic Data-Auto Rickshaw     object
Traffic Data-Motor Cycle       object
Traffic Data-Bi-Cycle          object
Traffic Data-Cycle Rickshaw    object
Traffic Data-Cart              object
Total-Motorized                object
Total-Non Motorized            object
Total-Total AADT               object
Traffic-(AADT)                 object
Road        

From the output above we can see that many columns that should be numeric are stored as `object`, so we need to convert their data types.

In [11]:
# creating function to change data type of columns

def change_data_type(dataframe: pd.DataFrame, column) -> pd.DataFrame:
    """Convert one or more columns of ``dataframe`` to a numeric data type.

    The conversion is written back into ``dataframe`` itself, and that same object is
    returned so the call can also be used in an assignment.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Dataframe that holds the column(s) to convert; it is modified in place.
    column : label or list of labels
        A single column label, or a list of column labels to convert one by one.

    Returns
    -------
    pandas.DataFrame
        The ``dataframe`` object that was passed in.

    Raises
    ------
    KeyError
        If a requested column is not in ``dataframe``.
    ValueError
        If a value cannot be parsed as a number (``pd.to_numeric`` default behaviour).
    """
    # A list cannot be a column label itself, so it unambiguously means "several columns"
    columns = column if isinstance(column, list) else [column]
    for name in columns:
        dataframe[name] = pd.to_numeric(dataframe[name])

    return dataframe


In [12]:
# Converting the columns of the final dataframe that hold numbers to a numeric data type

numerical_cols = [
    'Start location-Offset', 'Start location-Chainage',
    'End location-Offset', 'End location-Chainage',
    'Length-(Km)',
    'Traffic Data-Heavy Truck', 'Traffic Data-Medium Truck', 'Traffic Data-Small Truck',
    'Traffic Data-Large Bus', 'Traffic Data-Medium Bus', 'Traffic Data-Micro Bus',
    'Traffic Data-Utility', 'Traffic Data-Car', 'Traffic Data-Auto Rickshaw',
    'Traffic Data-Motor Cycle', 'Traffic Data-Bi-Cycle', 'Traffic Data-Cycle Rickshaw',
    'Traffic Data-Cart',
    'Total-Motorized', 'Total-Non Motorized', 'Total-Total AADT', 'Traffic-(AADT)',
]

# One conversion per column; the helper hands back the dataframe it was given
for numeric_col in numerical_cols:
    appended_data_traffic = change_data_type(appended_data_traffic, numeric_col)

In [13]:
# Data type of every column, to confirm the result of the numeric conversion

appended_data_traffic.dtypes

Link no                         object
Name                            object
Start location-LRP              object
Start location-Offset            int64
Start location-Chainage        float64
End location-LRP                object
End location-Offset              int64
End location-Chainage          float64
Length-(Km)                    float64
Traffic Data-Heavy Truck       float64
Traffic Data-Medium Truck      float64
Traffic Data-Small Truck       float64
Traffic Data-Large Bus         float64
Traffic Data-Medium Bus        float64
Traffic Data-Micro Bus         float64
Traffic Data-Utility           float64
Traffic Data-Car               float64
Traffic Data-Auto Rickshaw     float64
Traffic Data-Motor Cycle       float64
Traffic Data-Bi-Cycle          float64
Traffic Data-Cycle Rickshaw    float64
Traffic Data-Cart              float64
Total-Motorized                float64
Total-Non Motorized            float64
Total-Total AADT               float64
Traffic-(AADT)           

In [14]:
# Adding a column that identifies the segment as "<start location-LRP>-<end location-LRP>".
# Series.str.cat joins the two columns in one vectorized step; a row with a missing LRP
# gets a missing Segment instead of aborting the whole cell.

appended_data_traffic['Segment'] = appended_data_traffic['Start location-LRP'].str.cat(
    appended_data_traffic['End location-LRP'], sep='-'
)


# Rearranging the column order: the created Road and Segment columns are placed right after
# the first column so they are easier to identify

created_cols = ['Road', 'Segment']
other_cols = [col for col in appended_data_traffic.columns if col not in created_cols]
cols = other_cols[:1] + created_cols + other_cols[1:]
appended_data_traffic = appended_data_traffic.reindex(columns=cols) # reordering columns

appended_data_traffic.head(10)

Unnamed: 0,Link no,Road,Segment,Name,Start location-LRP,Start location-Offset,Start location-Chainage,End location-LRP,End location-Offset,End location-Chainage,...,Traffic Data-Car,Traffic Data-Auto Rickshaw,Traffic Data-Motor Cycle,Traffic Data-Bi-Cycle,Traffic Data-Cycle Rickshaw,Traffic Data-Cart,Total-Motorized,Total-Non Motorized,Total-Total AADT,Traffic-(AADT)
0,N1-1L,N1,LRPS-LRPS,Jatrabari - Int.with Z1101 (Left) (Left),LRPS,0,0.0,LRPS,822,0.82,...,1851.0,2980.0,398.0,232.0,889.0,0.0,18236.0,1121.0,19357.0,19357.0
1,N1-1R,N1,LRPS-LRPS,Jatrabari - Int.with Z1101 (Left) (Right),LRPS,0,0.0,LRPS,822,0.82,...,2608.0,2508.0,436.0,213.0,1088.0,0.0,20236.0,1301.0,21537.0,21537.0
2,N1-2L,N1,LRPS-LRPS,Int.with Z1101 - Signboard (Left) R111 (Left),LRPS,822,0.82,LRPS,4175,4.17,...,2608.0,2508.0,436.0,213.0,1088.0,0.0,20236.0,1301.0,21537.0,21537.0
3,N1-2R,N1,LRPS-LRPS,Int.with Z1101 - Signboard (Left) R111 (Right),LRPS,822,0.82,LRPS,4175,4.17,...,1851.0,2980.0,398.0,232.0,889.0,0.0,18236.0,1121.0,19357.0,19357.0
4,N1-3L,N1,LRPS-LRPS,Signboard - Shimrail (Left)R110 (Left),LRPS,4175,4.17,LRPS,7181,7.18,...,1690.0,2266.0,1087.0,75.0,1198.0,0.0,16288.0,1273.0,17561.0,17561.0
5,N1-3R,N1,LRPS-LRPS,Signboard - Shimrail (Left)R110 (Right),LRPS,4175,4.17,LRPS,7181,7.18,...,1609.0,2157.0,1035.0,72.0,1140.0,0.0,15445.0,1212.0,16657.0,16657.0
6,N1-4L,N1,LRPS-LRP009,Shimrail - Katchpur (Left)N2 (Left),LRPS,7181,7.18,LRP009,260,8.76,...,1579.0,3154.0,1162.0,211.0,1077.0,0.0,16001.0,1288.0,17289.0,17289.0
7,N1-4R,N1,LRPS-LRP009,Shimrail - Katchpur (Left)N2 (Right),LRPS,7181,7.18,LRP009,260,8.76,...,1948.0,2693.0,1143.0,517.0,1531.0,0.0,15996.0,2048.0,18044.0,18044.0
8,N1-5L,N1,LRP009-LRP012,Katchpur - Madanpur (Left)N105 (Left),LRP009,260,8.76,LRP012,439,11.94,...,1579.0,3154.0,1154.0,211.0,1077.0,0.0,22591.0,1288.0,23879.0,23879.0
9,N1-5R,N1,LRP009-LRP012,Katchpur - Madanpur (Left)N105 (Right),LRP009,260,8.76,LRP012,439,11.94,...,2105.0,3080.0,1143.0,517.0,1531.0,0.0,22541.0,2048.0,24589.0,24589.0


In [15]:
# Reporting the wall-clock time elapsed since start_time was recorded
print("Process finished --- %s seconds ---" % (time.time() - start_time))

Process finished --- 6.5789690017700195 seconds ---
