In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = ':https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F43428%2F77110%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240428%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240428T164558Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3e9717327bb24c619c3021e442a440868fcf142d5d79d1cae046a688b6dfee33cb5673399632dd8b6c1af607e96256655a56e57b5a5a901983b4cc15032906b454563fffb4e20ba747550af2a71f1325331fff9677c0f9465a6347f0fb62554811d4493ec1d38c56d30db8e93ce93b87a51ba74264b8c88efc025eeaa6d11f094aba0785a661d335771a6ccc51ec0204c0c30650d195ebedc37ee6172794c1b760b8dfe3d42c835730b7b66c4662486d0364d9865573371448023fe8bbab050874f29677dd8461404d50a71f84688ea4307e672e2d37c62f3507cb86068c651aa138bf946dbc603990d88226b16a75fe6beca9e4b7561f51c0b9b3ca2dba855e'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading , 136559115 bytes compressed
Downloaded and uncompressed: 
Data source import complete.


### Get the amount of accidents per LSOA , so we can analyze dangerous regions
* Could aggregate at more granular level - per junction/area (e.g. based on LatLong rounding), or by road number !

* For a similar project, see our Anyway/Public knowledge / Datahack hackathon project:
    * https://github.com/hasadna/anyway
    * https://github.com/ddofer?tab=repositories
    
    
* We will ignore "slight" ('fender bender') accidents for now, but any model would benefit from them , and they could still be of interest.

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
print(os.listdir("../input"))

['Vehicle_Information.csv', 'Accident_Information.csv']


In [3]:
cols_keep = ['Accident_Severity', 'Date','Time', 'Latitude','Longitude',
             'Local_Authority_(District)', 'Local_Authority_(Highway)',
            'LSOA_of_Accident_Location', 'Number_of_Casualties', "1st_Road_Number","2nd_Road_Number"]

In [4]:
df = pd.read_csv('../input/Accident_Information.csv',usecols=cols_keep, #nrows=12345,
                 parse_dates=[['Date', 'Time']],keep_date_col=True)
df.shape

  df = pd.read_csv('../input/Accident_Information.csv',usecols=cols_keep, #nrows=12345,


(1917274, 12)

In [5]:
df["Date_Time"] = pd.to_datetime(df["Date_Time"],infer_datetime_format=True,errors="coerce")

  df["Date_Time"] = pd.to_datetime(df["Date_Time"],infer_datetime_format=True,errors="coerce")


In [6]:
# we see that some cases lack a time of events - creating a bad date format. we'll fix these

df.loc[df['Date_Time'].isna(), 'Date_Time'] = df["Date"]
df.loc[df["Date_Time"].isna()]

Unnamed: 0,Date_Time,1st_Road_Number,2nd_Road_Number,Accident_Severity,Date,Latitude,Local_Authority_(District),Local_Authority_(Highway),Longitude,LSOA_of_Accident_Location,Number_of_Casualties,Time


In [7]:
df.drop(["Date","Time"],axis=1,inplace=True)
df.set_index("Date_Time",inplace=True)
df.index = pd.to_datetime(df.index)

In [8]:
df["serious_accident"] = df.Accident_Severity != "Slight"

In [9]:
df.nunique()

1st_Road_Number                  7126
2nd_Road_Number                  7479
Accident_Severity                   3
Latitude                      1245780
Local_Authority_(District)        416
Local_Authority_(Highway)         207
Longitude                     1332583
LSOA_of_Accident_Location       35542
Number_of_Casualties               52
serious_accident                    2
dtype: int64

In [10]:
df.columns

Index(['1st_Road_Number', '2nd_Road_Number', 'Accident_Severity', 'Latitude',
       'Local_Authority_(District)', 'Local_Authority_(Highway)', 'Longitude',
       'LSOA_of_Accident_Location', 'Number_of_Casualties',
       'serious_accident'],
      dtype='object')

In [11]:
df.describe()

Unnamed: 0,1st_Road_Number,2nd_Road_Number,Latitude,Longitude,Number_of_Casualties
count,1917272.0,1899834.0,1917129.0,1917128.0,1917274.0
mean,1001.601,377.6881,52.56794,-1.419765,1.3479
std,1816.643,1296.353,1.449316,1.403532,0.8213597
min,0.0,0.0,49.91294,-7.516225,1.0
25%,0.0,0.0,51.48663,-2.341349,1.0
50%,124.0,0.0,52.25458,-1.374138,1.0
75%,720.0,0.0,53.45975,-0.2111217,1.0
max,9999.0,9999.0,60.75754,1.76201,93.0


In [12]:
df.index.dtype

dtype('<M8[ns]')

In [13]:
df.head()

Unnamed: 0_level_0,1st_Road_Number,2nd_Road_Number,Accident_Severity,Latitude,Local_Authority_(District),Local_Authority_(Highway),Longitude,LSOA_of_Accident_Location,Number_of_Casualties,serious_accident
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2005-01-04 17:42:00,3218.0,0.0,Serious,51.489096,Kensington and Chelsea,Kensington and Chelsea,-0.19117,E01002849,1,True
2005-01-05 17:36:00,450.0,0.0,Slight,51.520075,Kensington and Chelsea,Kensington and Chelsea,-0.211708,E01002909,1,False
2005-01-06 00:15:00,0.0,0.0,Slight,51.525301,Kensington and Chelsea,Kensington and Chelsea,-0.206458,E01002857,1,False
2005-01-07 10:35:00,3220.0,0.0,Slight,51.482442,Kensington and Chelsea,Kensington and Chelsea,-0.173862,E01002840,1,False
2005-01-10 21:13:00,0.0,0.0,Slight,51.495752,Kensington and Chelsea,Kensington and Chelsea,-0.156618,E01002863,1,False


## Targets
* based on : https://www.kaggle.com/yesterdog/eda-of-1-6-mil-traffic-accidents-in-london
* Accidents by LSOA (region), by road, by latlong (rounded)...

In [14]:
# Identifying the worst districts to travel.
### https://stackoverflow.com/questions/19384532/how-to-count-number-of-rows-per-group-and-other-statistics-in-pandas-group-by
### https://stackoverflow.com/questions/32012012/pandas-resample-timeseries-with-groupby/39186403#39186403

lsoa_wise = df.groupby( 'LSOA_of_Accident_Location').resample("M").agg({"Number_of_Casualties":"sum","serious_accident":"sum",
                                                                        "Accident_Severity":"count",

#                                                                         "Latitude":scipy.stats.mode,"Longitude":scipy.stats.mode
#                                                                         "Latitude":"mean","Longitude":"mean" # we get missing latLong when no accidents occured, and their locations can change unless we use mode!
                                                                       })
lsoa_wise.rename(columns={"Accident_Severity":"Accident_counts"},inplace=True)
lsoa_wise["percent_seriousAccidents"] = 100*lsoa_wise["serious_accident"]/lsoa_wise["Accident_counts"].round(2)
lsoa_wise.loc[lsoa_wise['percent_seriousAccidents'].isna(), 'percent_seriousAccidents'] = 0
print(lsoa_wise.shape)
lsoa_wise.head()

(4546807, 4)


Unnamed: 0_level_0,Unnamed: 1_level_0,Number_of_Casualties,serious_accident,Accident_counts,percent_seriousAccidents
LSOA_of_Accident_Location,Date_Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E01000001,2005-01-31,1,1,1,100.0
E01000001,2005-02-28,0,0,0,0.0
E01000001,2005-03-31,3,1,2,50.0
E01000001,2005-04-30,0,0,0,0.0
E01000001,2005-05-31,2,1,1,100.0


In [15]:
lsoa_wise.describe()

Unnamed: 0,Number_of_Casualties,serious_accident,Accident_counts,percent_seriousAccidents
count,4546807.0,4546807.0,4546807.0,4546807.0
mean,0.5286736,0.05787446,0.391363,4.232627
std,1.175227,0.2502898,0.7764849,18.8695
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.0
max,96.0,11.0,41.0,100.0


In [16]:
lsoa_wise.to_csv("uk_accidents_lsoa_monthly.csv.gz",compression="gzip")

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  # Importing train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Encode the categorical column
encoder = LabelEncoder()
df['Local_Authority_(District)_encoded'] = encoder.fit_transform(df['Local_Authority_(District)'])

# Define the features and target variable
X = df[['Local_Authority_(District)_encoded', 'Number_of_Casualties']]  # Simplified feature set
y = df['serious_accident']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train logistic regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predictions
y_pred = log_reg.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))




Accuracy: 0.8494556075680327
Classification Report:
               precision    recall  f1-score   support

       False       0.85      1.00      0.92    325722
        True       0.52      0.00      0.00     57733

    accuracy                           0.85    383455
   macro avg       0.69      0.50      0.46    383455
weighted avg       0.80      0.85      0.78    383455

