In [None]:
# Data source: https://www.kaggle.com/daveianhickey/2000-16-traffic-flow-england-scotland-wales

In [None]:
import os  # for checking file size

import pandas as pd  # for dataframe
import tkinter  # for selecting folder
import tkinter.filedialog  # for selecting folder
import tkinter.messagebox # for displaying error message
import shutil  # for concatenating files
import re  # for selecting correct files
from itertools import groupby  # check if all headers are equal
import folium # for drawing map
from folium.plugins import FastMarkerCluster # for plotting many values
import ntpath # for setting filename os-independently
# for mapping accident locations to police forces
import json
from turfpy.measurement import boolean_point_in_polygon
from geojson import Point, MultiPolygon, Feature
import datetime # for converting to datetime
import matplotlib # for viewing plotting settings

# Folder that receives every generated file; a later cell creates it beside
# the selected data folder.
output_foldername = "output"
# File name used for the raw concatenation of all input CSV files.
output_filename = "output.csv"
# Message intended for the case that no data folder has been selected.
error_msg = 'You have to specify a data path first!'

In [None]:
# Ask the user for the folder that contains the accident CSV files.
root = tkinter.Tk()
root.withdraw()  # hide the empty root window; only the dialog should be visible
try:
    data_path = tkinter.filedialog.askdirectory(mustexist=True)
finally:
    # Always tear the Tk instance down, even if the dialog itself fails.
    root.destroy()

# The dialog yields an empty value when it is cancelled; fail here with a clear
# message instead of letting a later cell fail on an empty path.
if not data_path:
    raise RuntimeError(error_msg)

In [None]:
# Make the data folder the working directory; a later cell writes a CSV using a
# relative file name.
os.chdir(data_path)

# Input files are recognised by a four-digit number (presumably a year) in their
# name. The previous pattern required a non-zero last digit and therefore
# skipped names such as "..._2010.csv".
regex = r'[1-9][0-9]{3}'
with os.scandir(data_path) as it:
    # os.scandir yields entries in arbitrary order; sort so that the load and
    # concatenation order is the same on every run.
    data_file_paths = sorted(
        entry.path
        for entry in it
        if entry.is_file() and entry.name.endswith(".csv") and re.search(regex, entry.name)
    )
if not data_file_paths:
    raise RuntimeError("No files found in folder " + data_path)

In [None]:
# Derive the output location from the first input file: `tmp` is the folder
# holding the data files, `foldername` is that folder's parent.
tmp = os.path.split(data_file_paths[0])[0]
foldername = os.path.split(tmp)[0]

# The output folder lives in `foldername`, i.e. beside the data folder.
output_path = os.path.join(foldername, output_foldername)
os.makedirs(output_path, exist_ok=True)

# Full path of the concatenated CSV inside the output folder.
concatenated_path = os.path.join(output_path, output_filename)

In [None]:
def check_headers_equal():
    """Verify that all files in ``data_file_paths`` start with the same line.

    Only the first line of every file is read, and the lines are compared as
    raw text. Nothing is returned.

    Raises:
        RuntimeError: if at least two files have a different first line.
    """
    header_lines = []
    for path in data_file_paths:
        with open(path, 'r') as handle:
            header_lines.append(handle.readline())

    # Identical lines collapse into a single set member, so more than one
    # member means at least one header differs.
    if len(set(header_lines)) > 1:
        raise RuntimeError("Headers are not equal")

In [None]:
# Load every input file into one dataframe. The header check runs first so that
# files with differing first lines are rejected before they are combined.
check_headers_equal()
print("Converting files to dataframe...")
accident_frames = [
    pd.read_csv(file, index_col='Accident_Index', parse_dates=True)
    for file in data_file_paths
]
# reset_index turns 'Accident_Index' back into an ordinary column and gives the
# combined frame a fresh 0..n-1 index.
accident_data = pd.concat(accident_frames).reset_index()
# The files were loaded into a dataframe, not written to csv; say so.
print("Finished converting files to dataframe")
print('--------------------------------------------------------------------------------')

In [None]:
# Write one raw byte-for-byte concatenation of all input files, unless an
# up-to-date one already exists.
length = len(data_file_paths)
total_filesize = sum(os.stat(file).st_size for file in data_file_paths)

# Only stat the target when it exists: on the first run there is no file yet,
# and that is exactly the case in which it has to be created.
if os.path.isfile(concatenated_path):
    concatenated_filesize = os.stat(concatenated_path).st_size
else:
    concatenated_filesize = None

if concatenated_filesize != total_filesize:
    # NOTE(review): the files are copied verbatim, so every file's header line
    # is repeated inside the result. The size comparison above relies on that.
    with open(concatenated_path, 'wb') as wfd:
        for index, f in enumerate(data_file_paths, start=1):
            print("Concatenating file", index,
                    "of", length, "files...")
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd)
    print("Finished concatenating files")
else:
    print("No concatenation necessary")

In [None]:
# Number of missing values per column, listed from most to fewest.
missing_table = accident_data.isna().sum()
sorted_missing = missing_table.sort_values(ascending=False)
print(sorted_missing)

In [None]:
# Keep only the two coordinate columns and drop every accident for which
# either coordinate is missing.
coordinate_columns = ['Latitude', 'Longitude']
accident_loc_data = (
    accident_data
    .loc[:, coordinate_columns]
    .dropna(subset=coordinate_columns)
)

In [None]:
# Convert both coordinate columns to float in a single step. Assigning columns
# on a frame that was just derived by indexing another frame is the
# chained-assignment pattern pandas warns about; astype returns a new frame.
locations = accident_loc_data[['Latitude', 'Longitude']].astype(float)
# Plain nested list of [latitude, longitude] pairs for the map plugin.
locationlist = locations.values.tolist()
print(len(locationlist))

In [None]:
# Map centre, and the box (relative to that centre) the viewport is restricted to.
london_lat, london_lng = 51.5074, -0.1278
min_lat, min_lon = london_lat - 2, london_lng - 9
max_lat, max_lon = london_lat + 10, london_lng + 1
my_map = folium.Map(
    # Without an explicit location folium falls back to its default centre and
    # zoom level, so the London coordinates and zoom_start were not applied.
    location=[london_lat, london_lng],
    zoom_start=5,
    min_zoom=5,
    max_bounds=True,
    min_lat=min_lat,
    max_lat=max_lat,
    min_lon=min_lon,
    max_lon=max_lon
    )

# Save the empty base map (no markers yet) into the output folder.
map_output_path = os.path.join(output_path, 'uk_map.html')
my_map.save(map_output_path)

In [None]:
# Collect the non-empty .geojson files from the 'police districts' folder,
# which is expected beside the data folder.
police_districts_folder = foldername + os.path.sep + 'police districts'
with os.scandir(police_districts_folder) as it:
    district_boundaries_filelist = [
        entry.path
        for entry in it
        if entry.is_file()
        and os.stat(entry.path).st_size > 0
        and entry.name.endswith(".geojson")
    ]

In [None]:
# Put every accident position on the map as one clustered marker layer;
# FastMarkerCluster is the plugin imported for plotting many values.
marker_layer = FastMarkerCluster(locationlist)
my_map.add_child(marker_layer)

# Write the map with markers to its own HTML file in the output folder.
map_output_path = os.path.sep.join((output_path, 'accident_map_boundaries.html'))
my_map.save(map_output_path)

In [None]:
# Number of accidents per speed limit, ordered by the speed limit value.
accident_data['Speed_limit'].value_counts().sort_index()

In [None]:
# map coordinates to police forces
file_list = list()
with_police_forces = accident_data.copy()
# Placeholder value; replaced by the force name once an accident is matched.
with_police_forces['police_frce'] = 'pf1'


def _iter_positions(coordinates):
    """Yield every [x, y] position from arbitrarily nested GeoJSON coordinates."""
    if coordinates and isinstance(coordinates[0], (int, float)):
        yield coordinates
    else:
        for part in coordinates:
            yield from _iter_positions(part)


def search_accident(data):
    """Label each accident in ``with_police_forces`` with the area that contains it.

    For every feature in ``data['features']`` the accidents lying inside the
    feature's geometry get ``feature['properties']['PFA20NM']`` written to the
    'police_frce' column. Accidents without coordinates, or outside every
    feature, keep the placeholder value.

    Args:
        data: parsed GeoJSON feature collection. Each feature needs a
            'geometry' with 'type' and 'coordinates' and a 'PFA20NM' property.
            NOTE(review): assumes the boundaries use longitude/latitude degrees
            like the accident columns. Confirm against the boundary file.

    Returns:
        None. ``with_police_forces`` is updated in place.
    """
    placeholder = 'pf1'
    longitudes = pd.to_numeric(with_police_forces['Longitude'], errors='coerce')
    latitudes = pd.to_numeric(with_police_forces['Latitude'], errors='coerce')

    for feature in data['features']:
        geometry = feature['geometry']
        coordinates = geometry['coordinates']
        # A Polygon's coordinates are one level shallower than a MultiPolygon's;
        # wrap only when needed so both geometry types end up well-formed.
        if geometry['type'] != 'MultiPolygon':
            coordinates = [coordinates]
        polygon = Feature(geometry=MultiPolygon(coordinates, precision=15))
        force_name = feature['properties']['PFA20NM']

        positions = list(_iter_positions(coordinates))
        if not positions:
            continue
        xs = [position[0] for position in positions]
        ys = [position[1] for position in positions]

        # Cheap vectorised pre-filter: only accidents inside the geometry's
        # bounding box that have no force yet go through the exact (and slow)
        # point-in-polygon test. Missing coordinates never pass `between`.
        candidates = (
            (with_police_forces['police_frce'] == placeholder)
            & longitudes.between(min(xs), max(xs))
            & latitudes.between(min(ys), max(ys))
        )
        candidate_lon = longitudes[candidates]
        candidate_lat = latitudes[candidates]

        # Iterate over rows (not column names), and build the point in GeoJSON
        # order: longitude first, then latitude.
        matched = [
            label
            for label, lon, lat in zip(candidate_lon.index, candidate_lon, candidate_lat)
            if boolean_point_in_polygon(
                Feature(geometry=Point([float(lon), float(lat)])), polygon)
        ]
        with_police_forces.loc[matched, 'police_frce'] = force_name


# Only the first boundary file is used.
if not district_boundaries_filelist:
    raise RuntimeError(
        "No police district boundary files found in " + police_districts_folder)
with open(district_boundaries_filelist[0], 'r') as f:
    data = json.load(f)
# NOTE(review): the exact test runs in pure Python per accident and can take a
# long time on the full dataset.
search_accident(data)

with_police_forces.to_csv("withAddedColumn.csv")

In [None]:
def full_date(row):
    """Combine a row's 'Date' and 'Time' values into one ``datetime``.

    The date is parsed as day/month/year and the time as hour:minute. If the
    combined text cannot be parsed (for example when the time is missing), the
    time falls back to 00:00.

    Args:
        row: mapping-like object with 'Date' and 'Time' entries. It is not
            modified.

    Returns:
        datetime.datetime: the combined timestamp.

    Raises:
        ValueError: if the date itself does not match '%d/%m/%Y'.
        TypeError: if the 'Date' value is not a string.
    """
    date_text = row['Date']
    try:
        return datetime.datetime.strptime(date_text + ' ' + str(row['Time']), '%d/%m/%Y %H:%M')
    except (TypeError, ValueError):
        # Catch parse failures only. A bare `except` would also swallow
        # KeyboardInterrupt, making a long row-wise apply hard to stop.
        return datetime.datetime.strptime(date_text + ' 00:00', '%d/%m/%Y %H:%M')

In [None]:
# Build one timestamp per accident from its 'Date' and 'Time' values, then
# store it as a proper pandas datetime column.
combined_timestamps = accident_data.apply(full_date, axis=1)
accident_data['date_time'] = pd.to_datetime(combined_timestamps)

In [None]:
# Bar chart: number of accidents for each hour of the day.
hour_of_day = accident_data['date_time'].dt.hour
accidents_per_hour = accident_data['date_time'].groupby(hour_of_day).count()
accidents_per_hour.plot.bar(
    xlabel="Time of day", ylabel="Number of accidents", title="Accidents per hour")


In [None]:
# Bar chart: number of accidents for each value of 'Day_of_Week'.
day_of_week = accident_data['Day_of_Week']
accidents_per_weekday = day_of_week.groupby(day_of_week).count()
accidents_per_weekday.plot.bar(
    xlabel="Day of Week", ylabel="Number of accidents", title="Accidents per day of week")

In [None]:
# Bar chart: number of accidents for each calendar month (all years pooled).
month_of_year = accident_data['date_time'].dt.month
accidents_per_month = accident_data['date_time'].groupby(month_of_year).count()
accidents_per_month.plot.bar(
    xlabel="Month", ylabel="Number of accidents", title="Accidents per month")

In [None]:
# Bar chart: number of accidents for each value of 'Weather_Conditions'.
weather = accident_data['Weather_Conditions']
accidents_per_weather = weather.groupby(weather).count()
accidents_per_weather.plot.bar(
    xlabel="Weather condition", ylabel="Number of accidents", title="Accidents per weather condition")