## Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [2]:
# Load the raw dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, so a single column cannot end up holding a mix of numbers and strings.
# NOTE(review): the saved output of this cell looks like the tail of a DtypeWarning
# (mixed types) - confirm the warning is gone after re-running.
raw_df = pd.read_csv('df.csv', low_memory=False)

  raw_df = pd.read_csv('df.csv')


### Removing columns from the Users Table

In [3]:
# Columns that come from the users table; none of them are kept.
drop_user_cols = [
    'vehicle_id',
    'vehicle_number',
    'seat_position',
    'user_category',
    'sex',
    'birth_year',
    'trip_purpose',
    'safety_device_1',
    'safety_device_2',
    'safety_device_3',
    'pedestrian_location',
    'pedestrian_action',
    'pedestrian_alone',
    'user_id',
]

# Remove them from the raw frame
raw_df = raw_df.drop(labels=drop_user_cols, axis='columns')

### Removing rows with missing location and date info

In [4]:
# Keep only the rows that have a 'date'; the rows without one are all missing lat/long data as well.
required_cols = ['date']
raw_df = raw_df.dropna(axis=0, how='any', subset=required_cols)

### Cleaning the hour_minute column

In [5]:
# The hour is the part of 'hour_minute' before the first ':'
hour_values = raw_df['hour_minute'].str.split(':').str[0].astype(int)

# Swap the original 'hour_minute' column for the new 'hour' column
raw_df = raw_df.drop(columns=['hour_minute'])
raw_df['hour'] = hour_values

# Reorder so that 'hour' becomes the third column
other_cols = [name for name in raw_df.columns if name != 'hour']
cols = other_cols[:2] + ['hour'] + other_cols[2:]
raw_df = raw_df[cols]

### Create a copy of the df to clean

In [6]:
# Work on an independent (deep) copy so that raw_df is left untouched by the cleaning steps
clean_df = raw_df.copy(deep=True)

## Cleaning the columns from the Places dataset (Angie)
### Cleaning the road_category column

In [7]:
# Collapse the numeric road_category codes into four broader groups
road_category_groups = {
    'Major Roads': (1, 2),
    'Secondary Roads': (3, 7),
    'Local & Access Roads': (4, 6),
    'Other / Off-Network': (5, 9),
}

# Invert the groups into a code -> label lookup
road_category_mapping = {
    code: label
    for label, codes in road_category_groups.items()
    for code in codes
}

clean_df['road_category'] = clean_df['road_category'].map(road_category_mapping)

In [8]:
# Sanity check: count each category. NaN is included because .map() turns any code that is
# missing from road_category_mapping into NaN, and value_counts() hides NaN by default.
clean_df['road_category'].value_counts(dropna=False)

road_category
Local & Access Roads    103511
Secondary Roads          93345
Major Roads              35309
Other / Off-Network       1797
Name: count, dtype: int64

### Cleaning the road_layout column

In [9]:
# Recategorize the road_layout column.
# Unknown values (-1) are imputed with 'Two Way' (most common value).
road_layout_mapping = {
    **dict.fromkeys((-1, 2), 'Two Way'),
    1: 'One Way',
    **dict.fromkeys((3, 4), 'Multi Lane'),
}

clean_df['road_layout'] = clean_df['road_layout'].map(road_layout_mapping)

In [10]:
# Sanity check: count each layout. NaN is included because .map() turns any code that is
# missing from road_layout_mapping into NaN, and value_counts() hides NaN by default.
clean_df['road_layout'].value_counts(dropna=False)

road_layout
Two Way       158702
One Way        42681
Multi Lane     32579
Name: count, dtype: int64

### Cleaning the num_lanes column

In [11]:
# Convert num_lanes from object to int

def clean_to_int(x):
    """Convert a raw cell value to an int, returning -1 when it cannot be parsed.

    The value is turned into text and stripped of surrounding whitespace before
    parsing. Whole numbers written with a decimal part (e.g. 2.0 or ' 2.0') are
    accepted too, so a value that was read as a float is not mistaken for an
    invalid entry.

    Parameters
    ----------
    x : any
        Raw cell value (number, string, None, NaN, ...).

    Returns
    -------
    int
        The parsed integer, or -1 if the value is not a whole number.
    """
    try:
        text = str(x).strip()
        try:
            return int(text)
        except ValueError:
            # Not a plain integer literal: accept it only if it is a whole-valued number.
            # NaN and infinity are not whole numbers, so they fall through to -1.
            number = float(text)
            return int(number) if number.is_integer() else -1
    except (TypeError, ValueError):
        # Only parsing failures are treated as "unknown"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return -1

# Parse every num_lanes value into an int (-1 marks values that could not be parsed)
lanes_as_int = clean_df['num_lanes'].apply(clean_to_int)

# Lane counts below 1 (0 or -1) are replaced with 2 (most common value)
lanes_as_int[lanes_as_int < 1] = 2

clean_df['num_lanes'] = lanes_as_int

In [12]:
# Sanity check: distribution of lane counts after cleaning
clean_df.num_lanes.value_counts()

num_lanes
2     153827
4      25437
1      25185
3      17434
6       5470
5       3334
8       1720
7        658
10       451
9        279
12        89
11        78
Name: count, dtype: int64

### Cleaning the reserved_lane column

In [13]:
# Recategorize the reserved_lane column.
# Unknown values (-1) are imputed with 'None' (most common value).
reserved_lane_groups = {
    'None': (-1, 0),
    'Cycle Lane': (1, 2),
    'Reserved Lane': (3,),
}

# Invert the groups into a code -> label lookup
reserved_lane_mapping = {
    code: label
    for label, codes in reserved_lane_groups.items()
    for code in codes
}

clean_df['reserved_lane'] = clean_df['reserved_lane'].map(reserved_lane_mapping)

In [14]:
# Sanity check: count each lane type. NaN is included because .map() turns any code that is
# missing from reserved_lane_mapping into NaN, and value_counts() hides NaN by default.
clean_df['reserved_lane'].value_counts(dropna=False)

reserved_lane
None             208999
Cycle Lane        17079
Reserved Lane      7884
Name: count, dtype: int64

### Cleaning the road_profile column

In [15]:
# Recategorize the road_profile column.
# Unknown values (-1) are imputed with 'Flat' (most common value).
road_profile_mapping = {
    code: ('Flat' if code in (-1, 1) else 'Slope / Near Slope')
    for code in (-1, 1, 2, 3, 4)
}

clean_df['road_profile'] = clean_df['road_profile'].map(road_profile_mapping)

In [16]:
# Sanity check: count each profile. NaN is included because .map() turns any code that is
# missing from road_profile_mapping into NaN, and value_counts() hides NaN by default.
clean_df['road_profile'].value_counts(dropna=False)

road_profile
Flat                  189884
Slope / Near Slope     44078
Name: count, dtype: int64

### Cleaning the road_shape column

In [17]:
# Recategorize the road_shape column.
# Unknown values (-1) are imputed with 'Straight' (most common value).
road_shape_mapping = {
    **dict.fromkeys((-1, 1), 'Straight'),
    **dict.fromkeys((2, 3, 4), 'Curved'),
}

clean_df['road_shape'] = clean_df['road_shape'].map(road_shape_mapping)

In [18]:
# Sanity check: count each shape. NaN is included because .map() turns any code that is
# missing from road_shape_mapping into NaN, and value_counts() hides NaN by default.
clean_df['road_shape'].value_counts(dropna=False)

road_shape
Straight    189643
Curved       44319
Name: count, dtype: int64

### Cleaning the surface_condition column

In [19]:
# Recategorize the surface_condition column.
# Unknown values (-1) are imputed with 'Normal' (most common value).
surface_condition_mapping = {
    -1: 'Normal',
    1: 'Normal',
    # codes 2 through 9 all collapse into one group
    **{code: 'Wet / Slippery' for code in range(2, 10)},
}

clean_df['surface_condition'] = clean_df['surface_condition'].map(surface_condition_mapping)

In [20]:
# Sanity check: count each condition. NaN is included because .map() turns any code that is
# missing from surface_condition_mapping into NaN, and value_counts() hides NaN by default.
clean_df['surface_condition'].value_counts(dropna=False)

surface_condition
Normal            186385
Wet / Slippery     47577
Name: count, dtype: int64

### Cleaning the infrastructure column

In [21]:
# Recategorize the infrastructure column.
# Unknown values (-1) are imputed with 'None' (most common value).
infrastructure_mapping = {
    **dict.fromkeys((-1, 0), 'None'),
    **dict.fromkeys((1, 2), 'Tunnel / Bridge'),
    **dict.fromkeys((3, 4, 5, 6), 'Intersections'),
    **dict.fromkeys((7, 8, 9), 'Other'),
}

clean_df['infrastructure'] = clean_df['infrastructure'].map(infrastructure_mapping)

In [22]:
# Sanity check: count each infrastructure group. NaN is included because .map() turns any code
# that is missing from infrastructure_mapping into NaN, and value_counts() hides NaN by default.
clean_df['infrastructure'].value_counts(dropna=False)

infrastructure
None               197635
Intersections       19655
Other               10206
Tunnel / Bridge      6466
Name: count, dtype: int64

### Cleaning the road_location column

In [23]:
# Recategorize the road_location column.
# Unknown values (-1) are imputed with 'Road' (most common value).
road_location_groups = {
    'Road': (-1, 0, 1),
    'Reserved Lanes': (2, 3, 6),
    'Cyclist / Pedestrian': (4, 5),
    'Other': (8,),
}

# Invert the groups into a code -> label lookup
road_location_mapping = {
    code: label
    for label, codes in road_location_groups.items()
    for code in codes
}

clean_df['road_location'] = clean_df['road_location'].map(road_location_mapping)

In [24]:
# Sanity check: count each location group. NaN is included because .map() turns any code that
# is missing from road_location_mapping into NaN, and value_counts() hides NaN by default.
clean_df['road_location'].value_counts(dropna=False)

road_location
Road                    196430
Reserved Lanes           20629
Cyclist / Pedestrian      9540
Other                     7363
Name: count, dtype: int64

### Cleaning the speed_limit column

In [None]:
# Round 'speed_limit' to the nearest 10.
# NOTE(review): Series.round() sends exact halves to the nearest even value, so 25 -> 20 while
# 35 -> 40, and 5 -> 0 (which is then treated as missing below) - confirm this is acceptable.
clean_df['speed_limit'] = ((clean_df['speed_limit'] / 10).round(0) * 10).astype(int)

# Speed limits strictly between 130 and 200 are set to 130, assuming these are highways.
clean_df.loc[(clean_df['speed_limit'] > 130) & (clean_df['speed_limit'] < 200), 'speed_limit'] = 130

# Median of the plausible limits only (1 to 130). Values below 1 are placeholders for missing
# data and are left out so that they cannot pull the median down.
plausible_speed = (clean_df['speed_limit'] >= 1) & (clean_df['speed_limit'] <= 130)
median_speed = clean_df.loc[plausible_speed, 'speed_limit'].median()

# Whatever is still above 130 at this point (i.e. 200 or more) is assumed to be an input
# error and is imputed with the median speed.
clean_df.loc[(clean_df['speed_limit'] > 130), 'speed_limit'] = median_speed

# Impute missing speed limits (values below 1) with 50.
clean_df.loc[(clean_df['speed_limit'] < 1), 'speed_limit'] = 50

In [26]:
# Sanity check: distribution of speed limits after cleaning
clean_df.speed_limit.value_counts()

speed_limit
50     129294
80      32138
30      27124
70      15679
90      15591
110      7781
130      4087
20        898
60        669
10        374
40        292
100        34
120         1
Name: count, dtype: int64

### Dropping irrelevant columns from the Places dataset

In [27]:
# Columns from the places table that are not kept
drop_places_cols = [
    'lane_type',
    'numerical_index_road',
    'alphanumeric_index_road',
    'road_ref_1', 'road_ref_2',
    'width_central_reservation', 'width_carriageway',
]

# Remove them from the cleaned frame
clean_df = clean_df.drop(labels=drop_places_cols, axis='columns')

Notes:
* `road_layout` & `num_lanes` are redundant information. 

Columns dropped with justification:
* `lane_type`: this column was named incorrectly. It was originally 'voie', which describes the road number, and won't impact our model.
* `numerical_index_road`: doesn't affect road characteristics and risk
* `alphanumeric_index_road`: doesn't affect road characteristics and risk
* `road_ref_1`: doesn't affect road characteristics and risk
* `road_ref_2`: doesn't affect road characteristics and risk
* `width_central_reservation`: too many null values
* `width_carriageway`: too many null values

## Cleaning the columns from the Characteristics dataset (Yae)

Note from Angie to Yae: you can delete this once you've read it.
These are the remaining columns you need to do:

'light_conditions','department', 'commune', 'urban_area', 'intersection_type', 
'weather', 'collision_type', 'road_address', 'latitude', 'longitude',
'accident_uid'

Below are the original and new column names from when I merged the data, for your reference:

    'lum': 'light_conditions',
    'dep': 'department',
    'com': 'commune',
    'agg': 'urban_area',
    'int': 'intersection_type',
    'atm': 'weather',
    'col': 'collision_type',
    'adr': 'road_address',
    'lat': 'latitude',
    'long': 'longitude',
    'Accident_Id': 'accident_uid'

## Export clean_df to csv for exploratory data analysis

In [28]:
# Write the cleaned frame to CSV (without the index column) for exploratory data analysis
clean_df.to_csv(path_or_buf='clean_df.csv', index=False)