In [62]:
import pandas as pd
import geopandas as gpd
import numpy as np
import shapely.wkt as wkt
import math
from shapely.geometry import Point, LineString, Polygon
from matplotlib import pyplot as plt

In [63]:
# Read in the aggregated cluster data (agg_clusters.csv) as rd_df.
# The path is relative to the notebook's working directory.
rd_df = pd.read_csv('data/cleaned_data/agg_clusters.csv')

`rd_df` contains many columns that I don't need for analysis but may need later for visualization. First, I drop all the columns that aren't going to be used for analysis, keeping the result in a separate DataFrame so that `rd_df` itself stays intact.

In [64]:
# Columns excluded from the modelling frame. rd_df itself is left untouched
# (drop returns a new frame), so these remain available for visualization.
# Each label is listed exactly once.
unused_columns = [
    # unnamed columns (presumably index columns left over from earlier CSV
    # round-trips -- TODO confirm)
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0_1', 'Unnamed: 0_2',
    # remaining fields not used for analysis
    'Pd_Sf_Type', 'Pd_Fc_Type', 'Cr_Slope', 'Truck', 'Trk_Netwrk', 'Path_Width',
    'Statn_Num', 'StationID', 'shape', 'bike_acc_ids', 'car_acc_ids',
    'Route_ID', 'From_Measure', 'To_Measure', 'Route_System', 'Route_Number',
    'Route_Direction', 'Rd_Seg_ID', 'Mile_Count', 'Speed', 'Trk_Permit',
    'St_Name', 'Fm_St_Name', 'To_St_Name', 'Length', 'Shape_Length',
    'road_shape', 'geometry', 'road_line', 'Toll_Road', 'MHS',
    'AADT_Year', 'AADT_Deriv', 'Fd_Aid_Rd',
]

model_df = rd_df.drop(columns=unused_columns)

# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
model_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74122 entries, 0 to 74121
Data columns (total 43 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   car_acc_score   74122 non-null  float64
 1   bike_acc_score  74122 non-null  float64
 2   Facility        65737 non-null  float64
 3   F_Class         72378 non-null  float64
 4   Urban_Area      72374 non-null  float64
 5   Urban_Type      72374 non-null  float64
 6   F_F_Class       72378 non-null  float64
 7   Jurisdictn      71241 non-null  object 
 8   NHS             72303 non-null  float64
 9   Control         72020 non-null  float64
 10  Num_Lanes       72301 non-null  float64
 11  Opp_Lanes       72301 non-null  float64
 12  Surface_Tp      72071 non-null  float64
 13  Surface_Wd      72071 non-null  float64
 14  Shldr_Rt_W      72078 non-null  float64
 15  Shldr_Rt_T      72078 non-null  float64
 16  Shldr_Lt_W      72071 non-null  float64
 17  Shldr_Lt_T      72071 non-null 

In [65]:
# Cast the car and bike accident score columns from float to int.
for score_col in ('car_acc_score', 'bike_acc_score'):
    model_df[score_col] = model_df[score_col].astype(int)

In [66]:
# String-coded columns: replace missing entries with the literal 'NA'.
# NOTE(review): pandas.read_csv treats the string 'NA' as missing by default,
# so these values are read back as nulls unless keep_default_na=False is used.

list_to_convert = ['Jurisdictn', 'MPO', 'RPA', 'RTA']

# Fill all of the listed columns in a single vectorised step.
model_df[list_to_convert] = model_df[list_to_convert].fillna(value='NA')

In [67]:
# A large number of these columns are IDs that stand for qualitative values,
# so nulls are replaced with a sentinel and the column is stored as integers.
# -1 also makes sense as a fill value for fields like number of lanes.

def fill_and_convert(data, column, fill_value=-1):
    """Fill nulls in ``data[column]`` and cast the column to ``int``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; the named column is overwritten on this object.
    column : str
        Name of the column to clean.
    fill_value : int, default -1
        Sentinel written in place of null values before the cast.

    Returns
    -------
    bool
        Always ``True`` (kept so existing callers keep working).

    Notes
    -----
    ``astype(int)`` truncates any fractional part, so columns holding
    non-integer measurements lose precision.
    """
    data[column] = data[column].fillna(value=fill_value).astype(int)
    return True

In [68]:
# Columns to fill with -1 and store as integers via fill_and_convert.
list_to_convert = ['Facility', 'F_Class', 'Urban_Area', 'Urban_Type', 'F_F_Class', 'NHS', 'Control', 'Num_Lanes',
                   'Opp_Lanes', 'Surface_Tp', 'Surface_Wd', 'Shldr_Rt_W', 'Shldr_Rt_T', 'Shldr_Lt_W', 'Shldr_Lt_T',
                   'Shldr_UL_W', 'Shldr_UL_T', 'Med_Width', 'Med_Type', 'Curb', 'Lt_Sidewlk', 'Rt_Sidewlk',
                   'Operation', 'Speed_Lim', 'Op_Dir_SL', 'T_Exc_Type', 'T_Exc_Time', 'Truck_Rte', 'ROW_Width',
                   'Struct_Cnd', 'Terrain', 'City', 'Mun_Type', 'County', 'Hwy_Dist', 'City_Maint']

for col in list_to_convert:
    fill_and_convert(model_df, col)

# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
model_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74122 entries, 0 to 74121
Data columns (total 43 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   car_acc_score   74122 non-null  int32  
 1   bike_acc_score  74122 non-null  int32  
 2   Facility        74122 non-null  int32  
 3   F_Class         74122 non-null  int32  
 4   Urban_Area      74122 non-null  int32  
 5   Urban_Type      74122 non-null  int32  
 6   F_F_Class       74122 non-null  int32  
 7   Jurisdictn      74122 non-null  object 
 8   NHS             74122 non-null  int32  
 9   Control         74122 non-null  int32  
 10  Num_Lanes       74122 non-null  int32  
 11  Opp_Lanes       74122 non-null  int32  
 12  Surface_Tp      74122 non-null  int32  
 13  Surface_Wd      74122 non-null  int32  
 14  Shldr_Rt_W      74122 non-null  int32  
 15  Shldr_Rt_T      74122 non-null  int32  
 16  Shldr_Lt_W      74122 non-null  int32  
 17  Shldr_Lt_T      74122 non-null 

In [69]:
# AADT is a traffic volume, so filling its gaps with a negative sentinel
# (as was done for the coded columns) is not appropriate.
# A smarter way to fill in missing values may come later; for now a separate
# data set is built that contains only the rows with a non-null AADT.
model_df.loc[:, 'AADT'].describe()


count     61950.000000
mean      13968.546893
std       26294.292929
min          10.000000
25%        1154.000000
50%        3720.000000
75%       15131.000000
max      224649.000000
Name: AADT, dtype: float64

In [70]:
# Remove the traffic volume column (the only one not null-filled above) and
# write the remaining data to disk. With to_csv's defaults the row index is
# written as the first CSV column.

road_qualities = model_df.drop('AADT', axis='columns')
road_qualities.to_csv(r'data/cleaned_data/road_qualities.csv')

In [75]:
# Build the traffic data set: both accident scores plus AADT, keeping only the
# rows where none of these values is null. The last line shows the row count.

traffic_columns = ['car_acc_score', 'bike_acc_score', 'AADT']
traffic = model_df.loc[:, traffic_columns].dropna()
len(traffic)

61950

In [78]:
# Export the traffic subset. With to_csv's defaults the row index kept by
# dropna is written as the first CSV column.
traffic.to_csv(r'data/cleaned_data/traffic.csv')