In [1]:
import fnmatch # this will be used to identify all the traffic htm files
import io
import os
import time

import pandas as pd


# wall-clock timestamp taken at start-up; the last cell subtracts it from the current time to report the run time
start_time = time.time()

In [2]:
# General settings

# folder that holds the htm pages with the traffic data
directory_traffic_files = '../data/traffic files'

# render floats with a thousands separator and 2 decimal places
pd.set_option("display.float_format", "{:,.2f}".format)

# empty dataframe that will hold the combined traffic data of all files
appended_data_traffic = pd.DataFrame()


In [3]:
# Getting all traffic files from traffic files directory

pattern = '*.traffic.htm' # desired string to be found in files

# os.listdir returns the entries in arbitrary order, so the matches are sorted to make the
# order in which files are loaded (and therefore the row order of the combined dataframe)
# the same on every machine and every run.
traffic_files_list = sorted(fnmatch.filter(os.listdir(directory_traffic_files), pattern))

Checking the files we have in the directory

In [4]:
# checking how many traffic files there are in the directory
len(traffic_files_list)

8

In [5]:
# checking the names of the traffic files that were found
traffic_files_list

['N1.traffic.htm',
 'N102.traffic.htm',
 'N104.traffic.htm',
 'N105.traffic.htm',
 'N2.traffic.htm',
 'N204.traffic.htm',
 'N207.traffic.htm',
 'N208.traffic.htm']

Going through every file in the directory to collect the traffic data and append it to a common dataframe

In [6]:
# final names for each column (the 26 columns of the source table plus the derived 'Road' column)

column_names = [
    'Link no', 'Name',
    'Start location-LRP', 'Start location-Offset', 'Start location-Chainage',
    'End location-LRP', 'End location-Offset', 'End location-Chainage',
    'Length-(Km)',
    'Traffic Data-Heavy Truck', 'Traffic Data-Medium Truck', 'Traffic Data-Small Truck',
    'Traffic Data-Large Bus', 'Traffic Data-Medium Bus', 'Traffic Data-Micro Bus',
    'Traffic Data-Utility', 'Traffic Data-Car', 'Traffic Data-Auto Rickshaw',
    'Traffic Data-Motor Cycle', 'Traffic Data-Bi-Cycle', 'Traffic Data-Cycle Rickshaw',
    'Traffic Data-Cart',
    'Total-Motorized', 'Total-Non Motorized', 'Total-Total AADT',
    'Traffic-(AADT)',
    'Road',
]

TRAFFIC_TABLE_INDEX = 4  # position of the table we are interested in among the tables of each html file
HEADER_ROW_COUNT = 3     # leading rows that are not data: two unnecessary rows plus the original header row


def _tidy_traffic_table(raw_table, column_names):
    """Turn the raw table parsed from one traffic file into a clean dataframe.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        The traffic table exactly as returned by ``pd.read_html``.
    column_names : list of str
        Final column labels: one per column of ``raw_table`` plus a last one
        for the derived road column.

    Returns
    -------
    pandas.DataFrame
        A copy of the data rows of ``raw_table`` with an extra last column that
        holds the road name, labelled with ``column_names``.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of columns.
    """
    # dropping the unnecessary leading rows (not columns); .copy() avoids writing into a slice
    tidy = raw_table.iloc[HEADER_ROW_COUNT:, :].copy()

    # the road name is the part of the link number in front of the first '-' (e.g. 'N1-1L' -> 'N1')
    tidy["Road"] = tidy.iloc[:, 0].str.split('-').str[0]

    # assign all final labels at once instead of patching the column index in place
    tidy.columns = column_names
    return tidy


# going through each file on the traffic files directory

traffic_frames = []
for elem in traffic_files_list:
    file_path = os.path.join(directory_traffic_files, elem)

    with open(file_path, 'r') as f:
        # the markup is wrapped in StringIO because passing literal html to read_html is deprecated
        dfs = pd.read_html(io.StringIO(f.read()))

    traffic_frames.append(_tidy_traffic_table(dfs[TRAFFIC_TABLE_INDEX], column_names))

# Build the common dataframe in a single concat: DataFrame.append no longer exists in pandas 2,
# and rebuilding the frame here (instead of growing it) keeps the cell safe to re-run.
if traffic_frames:
    appended_data_traffic = pd.concat(traffic_frames, ignore_index=True)
else:
    appended_data_traffic = pd.DataFrame(columns=column_names)
    

In [7]:
# checking structure of the dataframe (first rows)

appended_data_traffic.head()

Unnamed: 0,Link no,Name,Start location-LRP,Start location-Offset,Start location-Chainage,End location-LRP,End location-Offset,End location-Chainage,Length-(Km),Traffic Data-Heavy Truck,...,Traffic Data-Auto Rickshaw,Traffic Data-Motor Cycle,Traffic Data-Bi-Cycle,Traffic Data-Cycle Rickshaw,Traffic Data-Cart,Total-Motorized,Total-Non Motorized,Total-Total AADT,Traffic-(AADT),Road
0,N1-1L,Jatrabari - Int.with Z1101 (Left) (Left),LRPS,0,0.0,LRPS,822,0.822,0.822,402.0,...,2980.0,398.0,232.0,889.0,0.0,18236.0,1121.0,19357.0,19357.0,N1
1,N1-1R,Jatrabari - Int.with Z1101 (Left) (Right),LRPS,0,0.0,LRPS,822,0.822,0.822,660.0,...,2508.0,436.0,213.0,1088.0,0.0,20236.0,1301.0,21537.0,21537.0,N1
2,N1-2L,Int.with Z1101 - Signboard (Left) R111 (Left),LRPS,822,0.822,LRPS,4175,4.175,3.353,660.0,...,2508.0,436.0,213.0,1088.0,0.0,20236.0,1301.0,21537.0,21537.0,N1
3,N1-2R,Int.with Z1101 - Signboard (Left) R111 (Right),LRPS,822,0.822,LRPS,4175,4.175,3.353,402.0,...,2980.0,398.0,232.0,889.0,0.0,18236.0,1121.0,19357.0,19357.0,N1
4,N1-3L,Signboard - Shimrail (Left)R110 (Left),LRPS,4175,4.175,LRPS,7181,7.181,3.006,91.0,...,2266.0,1087.0,75.0,1198.0,0.0,16288.0,1273.0,17561.0,17561.0,N1


In [8]:
# checking structure of the dataframe (last rows)

appended_data_traffic.tail()

Unnamed: 0,Link no,Name,Start location-LRP,Start location-Offset,Start location-Chainage,End location-LRP,End location-Offset,End location-Chainage,Length-(Km),Traffic Data-Heavy Truck,...,Traffic Data-Auto Rickshaw,Traffic Data-Motor Cycle,Traffic Data-Bi-Cycle,Traffic Data-Cycle Rickshaw,Traffic Data-Cart,Total-Motorized,Total-Non Motorized,Total-Total AADT,Traffic-(AADT),Road
168,N208-1,Moulvibazar-Int.with Z2002,LRPS,0,0.0,LRPS,3190,3.19,3.19,15.0,...,3194.0,1123.0,147.0,486.0,0.0,7514.0,633.0,8147.0,8147.0,N208
169,N208-2,Int.with Z2002-Rajnagar (Int.with R281),LRPS,3190,3.19,LRP012,1661,13.461,10.271,15.0,...,3194.0,1123.0,147.0,486.0,0.0,7514.0,633.0,8147.0,8147.0,N208
170,N208-3,Rajnagar(Int.with R281)-Daudabad (Int.with Z2832),LRP012,1661,13.461,LRP052,385,51.05,37.589,0.0,...,2233.0,582.0,88.0,73.0,0.0,5328.0,161.0,5489.0,5489.0,N208
171,N208-4,Daudabad (Int.with Z2832)-Royal City Chottor(i...,LRP052,385,51.05,LRP054,677,53.342,2.292,0.0,...,2233.0,582.0,88.0,73.0,0.0,5328.0,161.0,5489.0,5489.0,N208
172,N208-5,Royal City Chottor(int.with N210)-Int.with N2,LRP054,677,53.342,LRP058,900,57.495,4.153,39.0,...,2974.0,1231.0,695.0,822.0,47.0,9488.0,1564.0,11052.0,11052.0,N208


In [9]:
# checking size of the dataframe as (number of rows, number of columns)

appended_data_traffic.shape

(173, 27)

In [10]:
# checking data types of dataframe as loaded from the htm files

appended_data_traffic.dtypes

Link no                        object
Name                           object
Start location-LRP             object
Start location-Offset          object
Start location-Chainage        object
End location-LRP               object
End location-Offset            object
End location-Chainage          object
Length-(Km)                    object
Traffic Data-Heavy Truck       object
Traffic Data-Medium Truck      object
Traffic Data-Small Truck       object
Traffic Data-Large Bus         object
Traffic Data-Medium Bus        object
Traffic Data-Micro Bus         object
Traffic Data-Utility           object
Traffic Data-Car               object
Traffic Data-Auto Rickshaw     object
Traffic Data-Motor Cycle       object
Traffic Data-Bi-Cycle          object
Traffic Data-Cycle Rickshaw    object
Traffic Data-Cart              object
Total-Motorized                object
Total-Non Motorized            object
Total-Total AADT               object
Traffic-(AADT)                 object
Road        

The output above shows that many columns that should be numeric are stored as objects, so we need to convert all of them to numeric data types.

In [11]:
# creating function to change data type of columns

def change_data_type(dataframe, column):
    """Convert one column of ``dataframe`` to a numeric dtype.

    The column is overwritten on the dataframe that is passed in, and that
    same dataframe object is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column to convert.
    column : str
        Label of the column to convert.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` itself, with ``column`` converted by ``pd.to_numeric``.
    """
    numeric_values = pd.to_numeric(dataframe[column])
    dataframe[column] = numeric_values
    return dataframe


In [12]:
# Changing data types of columns in final dataframe

numerical_cols = [
    'Start location-Offset',
    'Start location-Chainage',
    'End location-Offset',
    'End location-Chainage',
    'Length-(Km)',
    'Traffic Data-Heavy Truck',
    'Traffic Data-Medium Truck',
    'Traffic Data-Small Truck',
    'Traffic Data-Large Bus',
    'Traffic Data-Medium Bus',
    'Traffic Data-Micro Bus',
    'Traffic Data-Utility',
    'Traffic Data-Car',
    'Traffic Data-Auto Rickshaw',
    'Traffic Data-Motor Cycle',
    'Traffic Data-Bi-Cycle',
    'Traffic Data-Cycle Rickshaw',
    'Traffic Data-Cart',
    'Total-Motorized',
    'Total-Non Motorized',
    'Total-Total AADT',
    'Traffic-(AADT)',
]

# converting one column at a time; change_data_type hands back the dataframe it was given
for numeric_column in numerical_cols:
    appended_data_traffic = change_data_type(appended_data_traffic, numeric_column)

In [13]:
# checking data types of dataframe after converting the numeric columns

appended_data_traffic.dtypes

Link no                         object
Name                            object
Start location-LRP              object
Start location-Offset            int64
Start location-Chainage        float64
End location-LRP                object
End location-Offset              int64
End location-Chainage          float64
Length-(Km)                    float64
Traffic Data-Heavy Truck       float64
Traffic Data-Medium Truck      float64
Traffic Data-Small Truck       float64
Traffic Data-Large Bus         float64
Traffic Data-Medium Bus        float64
Traffic Data-Micro Bus         float64
Traffic Data-Utility           float64
Traffic Data-Car               float64
Traffic Data-Auto Rickshaw     float64
Traffic Data-Motor Cycle       float64
Traffic Data-Bi-Cycle          float64
Traffic Data-Cycle Rickshaw    float64
Traffic Data-Cart              float64
Total-Motorized                float64
Total-Non Motorized            float64
Total-Total AADT               float64
Traffic-(AADT)           

In [14]:
# adding a column to be the reference of the segment (Start location-LRP - End location-LRP)

lrp_endpoints = appended_data_traffic[['Start location-LRP', 'End location-LRP']]
appended_data_traffic['Segment'] = lrp_endpoints.agg('-'.join, axis='columns')


# rearranging columns order: the created Road and Segment columns are placed right after the
# first column so they are easier to identify

cols = [label for label in appended_data_traffic.columns.tolist() if label not in ('Road', 'Segment')]
cols[1:1] = ['Road', 'Segment']
appended_data_traffic = appended_data_traffic.reindex(columns=cols) # reordering columns

appended_data_traffic.head(10)

Unnamed: 0,Link no,Road,Segment,Name,Start location-LRP,Start location-Offset,Start location-Chainage,End location-LRP,End location-Offset,End location-Chainage,...,Traffic Data-Car,Traffic Data-Auto Rickshaw,Traffic Data-Motor Cycle,Traffic Data-Bi-Cycle,Traffic Data-Cycle Rickshaw,Traffic Data-Cart,Total-Motorized,Total-Non Motorized,Total-Total AADT,Traffic-(AADT)
0,N1-1L,N1,LRPS-LRPS,Jatrabari - Int.with Z1101 (Left) (Left),LRPS,0,0.0,LRPS,822,0.82,...,1851.0,2980.0,398.0,232.0,889.0,0.0,18236.0,1121.0,19357.0,19357.0
1,N1-1R,N1,LRPS-LRPS,Jatrabari - Int.with Z1101 (Left) (Right),LRPS,0,0.0,LRPS,822,0.82,...,2608.0,2508.0,436.0,213.0,1088.0,0.0,20236.0,1301.0,21537.0,21537.0
2,N1-2L,N1,LRPS-LRPS,Int.with Z1101 - Signboard (Left) R111 (Left),LRPS,822,0.82,LRPS,4175,4.17,...,2608.0,2508.0,436.0,213.0,1088.0,0.0,20236.0,1301.0,21537.0,21537.0
3,N1-2R,N1,LRPS-LRPS,Int.with Z1101 - Signboard (Left) R111 (Right),LRPS,822,0.82,LRPS,4175,4.17,...,1851.0,2980.0,398.0,232.0,889.0,0.0,18236.0,1121.0,19357.0,19357.0
4,N1-3L,N1,LRPS-LRPS,Signboard - Shimrail (Left)R110 (Left),LRPS,4175,4.17,LRPS,7181,7.18,...,1690.0,2266.0,1087.0,75.0,1198.0,0.0,16288.0,1273.0,17561.0,17561.0
5,N1-3R,N1,LRPS-LRPS,Signboard - Shimrail (Left)R110 (Right),LRPS,4175,4.17,LRPS,7181,7.18,...,1609.0,2157.0,1035.0,72.0,1140.0,0.0,15445.0,1212.0,16657.0,16657.0
6,N1-4L,N1,LRPS-LRP009,Shimrail - Katchpur (Left)N2 (Left),LRPS,7181,7.18,LRP009,260,8.76,...,1579.0,3154.0,1162.0,211.0,1077.0,0.0,16001.0,1288.0,17289.0,17289.0
7,N1-4R,N1,LRPS-LRP009,Shimrail - Katchpur (Left)N2 (Right),LRPS,7181,7.18,LRP009,260,8.76,...,1948.0,2693.0,1143.0,517.0,1531.0,0.0,15996.0,2048.0,18044.0,18044.0
8,N1-5L,N1,LRP009-LRP012,Katchpur - Madanpur (Left)N105 (Left),LRP009,260,8.76,LRP012,439,11.94,...,1579.0,3154.0,1154.0,211.0,1077.0,0.0,22591.0,1288.0,23879.0,23879.0
9,N1-5R,N1,LRP009-LRP012,Katchpur - Madanpur (Left)N105 (Right),LRP009,260,8.76,LRP012,439,11.94,...,2105.0,3080.0,1143.0,517.0,1531.0,0.0,22541.0,2048.0,24589.0,24589.0


## Numerical Operations

We want to do two things. First, we want to compute the share of each vehicle type in the traffic data, so that the number of vehicles generated per tick can be made proportional to it. The main idea is that if, for example, 300 trucks need to be generated in 30 minutes and the tick time is 1 minute, then 10 trucks must be generated each minute. Thus, for each vehicle type we divide the total number of vehicles of that type on the roads by the total number of vehicles. Second, we will use groupby to calculate the AADT values of each road and look at how many trucks and other vehicles of high economic importance pass, to get a better understanding of the traffic.

Let us start with the proportions.

### Proportions

In [15]:
#appended_data_traffic.to_csv("to-be-deleted-traffic-data.csv")

In [16]:
# vehicle types that take part in the proportions
list_of_columns = [
    'Traffic Data-Heavy Truck',
    'Traffic Data-Medium Truck',
    'Traffic Data-Small Truck',
    'Traffic Data-Large Bus',
    'Traffic Data-Medium Bus',
    'Traffic Data-Micro Bus',
    'Traffic Data-Utility',
    'Traffic Data-Car',
]

# total of every column, reused for the grand total and for the individual shares
column_sums = [appended_data_traffic[column].sum() for column in list_of_columns]
total = sum(column_sums)

# share of each column in the grand total, as a percentage
proportions_wrt_list_of_columns = [(column_sum / total) * 100 for column_sum in column_sums]

# percentages as text with 2 decimals, stored in a dictionary keyed by column name
formatted_proportions = ['%.2f' % share for share in proportions_wrt_list_of_columns]
dictionary_proportions = dict(zip(list_of_columns, formatted_proportions))

# Some vehicle types are excluded on purpose; Car is kept because it can also be
# economically relevant (people driving to work, for example).
dictionary_proportions

{'Traffic Data-Heavy Truck': '3.12',
 'Traffic Data-Medium Truck': '36.59',
 'Traffic Data-Small Truck': '11.84',
 'Traffic Data-Large Bus': '14.80',
 'Traffic Data-Medium Bus': '6.49',
 'Traffic Data-Micro Bus': '12.93',
 'Traffic Data-Utility': '3.14',
 'Traffic Data-Car': '11.10'}

The proportions are done. The trucks will be created with respect to these in the model. <span style="color:red">After implementation write few lines here</span>.

### Groupsums

As discussed, to understand the vulnerability of different roads and their segments one important idea is to sum with respect to roads first to get an idea of which roads are interesting.

However, we will first adjust the Traffic-(AADT) value so that it excludes every vehicle type that is not in the list above.

In [17]:
# work on a copy so that the adjustment of 'Traffic-(AADT)' leaves appended_data_traffic untouched
appended_data_traffic2 = appended_data_traffic.copy()

In [18]:
# vehicle types that are left out of the adjusted AADT value
excluded_columns = ['Traffic Data-Auto Rickshaw', 'Traffic Data-Motor Cycle', 'Traffic Data-Bi-Cycle',
                    'Traffic Data-Cycle Rickshaw', 'Traffic Data-Cart']

# skipna=False behaves like a chain of '+': a missing count makes the whole row NaN
to_subtract = appended_data_traffic2[excluded_columns].sum(axis='columns', skipna=False)

# The adjusted value is derived from the untouched dataframe instead of being decremented in
# place, so re-running this cell does not subtract the same counts a second time.
appended_data_traffic2['Traffic-(AADT)'] = appended_data_traffic['Traffic-(AADT)'] - to_subtract
# 'Total-Total AADT' still holds the original value, which shows how much difference this made.
# Open question: how reasonable is it to exclude bicycles etc.? They may matter in the Bangladesh context.

In [19]:
# Sum the numeric columns per road and rank the roads by 'Traffic-(AADT)', highest first.
# numeric_only=True is passed explicitly: recent pandas versions no longer drop the text
# columns (Link no, Name, ...) silently and would concatenate their strings instead.
appended_data_traffic_road = (
    appended_data_traffic2
    .groupby('Road')
    .sum(numeric_only=True)
    .sort_values(by='Traffic-(AADT)', ascending=False)
)
appended_data_traffic_road

Unnamed: 0_level_0,Start location-Offset,Start location-Chainage,End location-Offset,End location-Chainage,Length-(Km),Traffic Data-Heavy Truck,Traffic Data-Medium Truck,Traffic Data-Small Truck,Traffic Data-Large Bus,Traffic Data-Medium Bus,...,Traffic Data-Car,Traffic Data-Auto Rickshaw,Traffic Data-Motor Cycle,Traffic Data-Bi-Cycle,Traffic Data-Cycle Rickshaw,Traffic Data-Cart,Total-Motorized,Total-Non Motorized,Total-Total AADT,Traffic-(AADT)
Road,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N1,406764,16346.47,413294,17037.13,690.66,28876.0,386871.0,110946.0,153477.0,67567.0,...,107130.0,218803.0,53619.0,18238.0,58342.0,14.0,1274822.0,76594.0,1351416.0,1002400.0
N2,32370,3645.26,32948,3931.77,286.52,7858.0,55655.0,24451.0,35468.0,10727.0,...,25581.0,79545.0,18111.0,6618.0,19254.0,78.0,299373.0,25950.0,325323.0,201717.0
N105,4029,151.1,4029,199.98,48.88,2094.0,26330.0,8606.0,532.0,459.0,...,6500.0,15195.0,4013.0,844.0,5350.0,0.0,71774.0,6194.0,77968.0,52566.0
N102,15223,408.3,15223,491.68,83.38,2371.0,10926.0,6764.0,4837.0,1342.0,...,2952.0,43223.0,6369.0,1711.0,6390.0,0.0,86044.0,8101.0,94145.0,36452.0
N104,13543,228.69,13543,278.32,49.63,739.0,9181.0,5074.0,4608.0,3962.0,...,2163.0,45133.0,8617.0,3346.0,11660.0,32.0,85504.0,15038.0,100542.0,31754.0
N208,5913,121.04,6813,178.54,57.49,69.0,3406.0,2484.0,184.0,2928.0,...,3882.0,13828.0,4641.0,1165.0,1940.0,47.0,35172.0,3152.0,38324.0,16703.0
N207,2466,70.23,2466,138.21,67.98,124.0,1692.0,1248.0,792.0,568.0,...,1544.0,10376.0,1868.0,564.0,600.0,0.0,20152.0,1164.0,21316.0,7908.0
N204,5439,25.45,5439,59.87,34.42,0.0,352.0,380.0,62.0,76.0,...,184.0,2238.0,756.0,254.0,460.0,0.0,4542.0,714.0,5256.0,1548.0


In [20]:
# Sum the numeric columns per (road, segment) and rank the result by 'Traffic-(AADT)', highest first.
# numeric_only=True is passed explicitly: recent pandas versions no longer drop the text
# columns (Link no, Name, ...) silently and would concatenate their strings instead.
# However, watch out for LRPS-LRPS (discuss this)
appended_data_traffic_segment = (
    appended_data_traffic2
    .groupby(['Road', 'Segment'])
    .sum(numeric_only=True)
    .sort_values(by='Traffic-(AADT)', ascending=False)
)
appended_data_traffic_segment

Unnamed: 0_level_0,Unnamed: 1_level_0,Start location-Offset,Start location-Chainage,End location-Offset,End location-Chainage,Length-(Km),Traffic Data-Heavy Truck,Traffic Data-Medium Truck,Traffic Data-Small Truck,Traffic Data-Large Bus,Traffic Data-Medium Bus,...,Traffic Data-Car,Traffic Data-Auto Rickshaw,Traffic Data-Motor Cycle,Traffic Data-Bi-Cycle,Traffic Data-Cycle Rickshaw,Traffic Data-Cart,Total-Motorized,Total-Non Motorized,Total-Total AADT,Traffic-(AADT)
Road,Segment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
N1,LRPS-LRPS,9994,9.99,24356,24.36,14.36,2300.00,29244.00,9639.00,16118.00,6455.00,...,12217.00,15399.00,3790.00,1037.00,6292.00,0.00,108677.00,7329.00,116006.00,89488.00
N1,LRP076-LRP076,43690,486.66,67244,510.22,23.55,1932.00,32574.00,4329.00,11061.00,2784.00,...,6771.00,11709.00,3327.00,672.00,1074.00,0.00,82899.00,1746.00,84645.00,67863.00
N1,LRP043-LRP043,42564,296.18,76344,329.96,33.78,2121.00,29139.00,5337.00,9792.00,3594.00,...,6525.00,4758.00,990.00,210.00,759.00,0.00,68823.00,969.00,69792.00,63075.00
N1,LRP009-LRP012,520,17.53,878,23.87,6.35,327.00,9384.00,6699.00,6176.00,5199.00,...,3684.00,6234.00,2297.00,728.00,2608.00,0.00,45132.00,3336.00,48468.00,36601.00
N1,LRP013-LRP013,6822,31.87,15040,40.09,8.22,327.00,9384.00,6699.00,6176.00,5199.00,...,3684.00,6234.00,2297.00,728.00,2608.00,0.00,45132.00,3336.00,48468.00,36601.00
N1,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N1,LRP420-LRP433,900,416.04,521,428.91,12.86,24.00,455.00,137.00,129.00,200.00,...,142.00,2476.00,419.00,97.00,1042.00,0.00,4681.00,1139.00,5820.00,1786.00
N1,LRP433-LRP467,521,428.91,130,462.25,33.35,24.00,455.00,137.00,129.00,200.00,...,142.00,2476.00,419.00,97.00,1042.00,0.00,4681.00,1139.00,5820.00,1786.00
N1,LRP413-LRP420,300,408.43,900,416.04,7.61,24.00,455.00,137.00,129.00,200.00,...,142.00,2476.00,419.00,97.00,1042.00,0.00,4681.00,1139.00,5820.00,1786.00
N204,LRP022-LRPE,5439,25.45,0,34.42,8.97,0.00,176.00,190.00,31.00,38.00,...,92.00,1119.00,378.00,127.00,230.00,0.00,2271.00,357.00,2628.00,774.00


The rest is looking at the data and analyzing. This must be further discussed.

## Verification of the Data

First we will check whether the segmentation makes sense in the data

NOTE TO ALEX : I THINK WE SHOULD USE ONLY N1-1R (1R is the segment number, and not use both LRP names) 

In [21]:
# reporting the time elapsed since start_time was recorded in the first cell
elapsed_seconds = time.time() - start_time
print("Process finished --- %s seconds ---" % elapsed_seconds)

Process finished --- 2.3034493923187256 seconds ---
