## Predict Accident Risk Score

#### Install Python Packages

In [None]:
# !pip install category_encoders
# !pip install xgboost
# !pip install lightgbm

#### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import category_encoders as enc
from datetime import datetime
from tqdm import tqdm_notebook

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
import re
import math
import gc

warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [None]:
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

In [None]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

#### Utility Functions

In [None]:
def show_all(df):
    """Display ``df`` with the pandas row and column display limits raised to 1000.

    The limits are set through ``pd.option_context``, so they only apply
    while the frame is being displayed.

    ``display`` is not imported in this file; it is assumed to be the
    built-in provided by the IPython/Jupyter session.
    """
    display_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*display_limits):
        display(df)

def join_df(left, right, left_on, right_on=None, suffix='_y'):
    """Left-join ``right`` onto ``left`` and return the merged frame.

    Parameters
    ----------
    left, right : pd.DataFrame
        Frames to merge; every row of ``left`` is kept.
    left_on : label or list of labels
        Join key(s) in ``left``.
    right_on : label or list of labels, optional
        Join key(s) in ``right``; defaults to ``left_on``.
    suffix : str
        Appended to overlapping column names coming from ``right``;
        columns of ``left`` keep their names.
    """
    right_key = left_on if right_on is None else right_on
    return left.merge(
        right,
        how='left',
        left_on=left_on,
        right_on=right_key,
        suffixes=("", suffix),
    )

def get_feature_importance(clf, dataframe, feature_list, top_n=10):
    """Return the ``top_n`` most important features of a fitted model.

    Pairs the column names of ``dataframe[feature_list]`` with
    ``clf.feature_importances_`` and returns a frame with the columns
    ``Feature`` and ``Importance``, sorted by importance in descending order.
    """
    ranking = pd.DataFrame({
        'Feature': dataframe[feature_list].columns,
        'Importance': clf.feature_importances_,
    })
    ranking = ranking.sort_values('Importance', ascending=False)
    return ranking.head(top_n)

def generate_date_features(df, fldname, drop=False, time=False):
    """Expand a datetime column of ``df`` into calendar feature columns.

    ``df`` is modified in place and also returned. If ``df[fldname]`` is not
    already a datetime column it is parsed with ``pd.to_datetime`` and the
    parsed values are written back to ``df[fldname]``.

    New columns are named ``<prefix>_<Attr>`` where ``<prefix>`` is
    ``fldname`` with a trailing ``date``/``Date`` removed:

    * Year, Month, Week, Day, Dayofweek, Dayofyear and the
      Is_month/quarter/year start/end flags;
    * Hour, Minute, Second when ``time`` is true;
    * ``Is_weekend`` (Dayofweek is 5 or 6);
    * ``Elapsed``: whole seconds since 1970-01-01 (UTC for tz-aware data).

    Parameters
    ----------
    df : pd.DataFrame
    fldname : str
        Name of the date/datetime column.
    drop : bool
        Drop ``fldname`` from ``df`` after the features are created.
    time : bool
        Also emit the Hour/Minute/Second columns.
    """
    fld = df[fldname]
    # Recognises naive and timezone-aware datetimes through the public API
    # instead of the private ``pd.core.dtypes`` path.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # ``infer_datetime_format`` is deprecated from pandas 2.0 (format
        # inference became the default); only pass it to older releases so
        # their behaviour is unchanged.
        pandas_major = int(pd.__version__.split('.')[0])
        parse_kwargs = {} if pandas_major >= 2 else {'infer_datetime_format': True}
        df[fldname] = fld = pd.to_datetime(fld, **parse_kwargs)

    targ_pre = re.sub("[Dd]ate$", '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
            'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']

    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # ``Series.dt.week`` was removed in pandas 2.0; isocalendar() is
            # its replacement. Cast back from the nullable UInt32 result to
            # the plain dtype the old accessor produced.
            week = fld.dt.isocalendar().week
            values = week.astype('float64') if week.isna().any() else week.astype('int64')
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + '_' + n] = values

    df[targ_pre + "_Is_weekend"] = df[targ_pre + "_Dayofweek"].isin([5, 6])

    # Subtracting the epoch is independent of the column's time resolution,
    # unlike casting to int64 and assuming nanoseconds.
    epoch = pd.Timestamp(0, tz='UTC') if fld.dt.tz is not None else pd.Timestamp(0)
    df[targ_pre + "_Elapsed"] = (fld - epoch) // pd.Timedelta(seconds=1)

    if drop:
        df.drop(fldname, axis=1, inplace=True)
    return df

def get_area_code(text: str):
    """Return the area code of a postcode string.

    The area code is the first whitespace-separated token followed by the
    first character of the second token (``"OX3 9UP"`` -> ``"OX39"``). A
    string with a single token is returned as that token, and a blank string
    yields ``""``.

    Splitting on any run of whitespace (rather than on a single space) keeps
    doubled, leading or trailing spaces from producing empty tokens, which
    would otherwise raise ``IndexError`` or yield a wrong code.
    """
    tokens = text.split()
    if not tokens:
        return ""
    if len(tokens) > 1:
        return tokens[0] + tokens[1][0]
    return tokens[0]

def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range strictly contains the column's min and max. Float columns are cast
    to the first of float16/float32 that does, falling back to float64.
    Columns of any other dtype are left untouched.

    NOTE: float16/float32 hold fewer significant digits than float64, so the
    downcast can round the stored values.

    When ``verbose`` is true the resulting memory footprint and the relative
    reduction are printed.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in tqdm_notebook(df.columns):
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith("int"):
            # Narrowest integer type wins; the column is left alone when
            # none of the candidates strictly contains its range.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither narrower float fits (this includes columns whose
                # min/max compare false, e.g. NaN).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        reduction_pct = 100 * (start_mem - end_mem) / start_mem
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction_pct
            )
        )
    return df

In [None]:
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between two 1-D array-likes."""
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(residuals ** 2)))


def run_clf(clf, fit_params, train_data, test_data, features, TARGET_COL, folds = 5, fold_col=None):
    """Fit ``clf`` once per fold and collect out-of-fold and test predictions.

    For each fold id in ``range(folds)`` the estimator is fitted on the rows
    whose fold column differs from that id, evaluated on the rows that match
    it, and used to predict ``test_data``. Test predictions are averaged over
    the folds.

    Parameters
    ----------
    clf : estimator
        Object with ``fit(X, y, eval_set=..., **fit_params)`` and
        ``predict(X)``. The same object is refitted on every fold.
    fit_params : dict
        Extra keyword arguments forwarded to ``clf.fit``.
    train_data, test_data : pd.DataFrame
        ``train_data`` must contain ``features``, ``TARGET_COL`` and the fold
        column; ``test_data`` must contain ``features``.
    features : list
        Column names used as model inputs.
    TARGET_COL : str
        Name of the target column in ``train_data``.
    folds : int
        Number of fold ids to iterate over (``0 .. folds - 1``).
    fold_col : str, optional
        Name of the fold-id column. Defaults to the module-level
        ``FOLD_COL`` so existing calls keep working.

    Returns
    -------
    tuple
        ``(test_predictions, model_scores, clf, oofs)``: fold-averaged test
        predictions, the per-fold validation RMSEs, the estimator as left
        after the last fold, and the out-of-fold predictions in the row
        order of ``train_data``.
    """
    if fold_col is None:
        fold_col = FOLD_COL

    oofs = np.zeros(len(train_data))
    test_predictions = np.zeros(len(test_data))
    model_scores = []
    y = train_data[TARGET_COL]
    fold_ids = train_data[fold_col].to_numpy()
    xtest = test_data[features]

    for fold in range(folds):

        print(f'\n------------- Fold {fold + 1} -------------')

        is_valid = fold_ids == fold
        # Row *positions* of the validation fold. Using index labels here
        # would write to the wrong slots of ``oofs`` (or raise) whenever
        # train_data's index is not exactly 0..n-1.
        val_pos = np.flatnonzero(is_valid)

        xtrain = train_data.loc[~is_valid, features].reset_index(drop=True)
        xvalid = train_data.loc[is_valid, features].reset_index(drop=True)
        ytrain = train_data.loc[~is_valid, TARGET_COL]
        yvalid = train_data.loc[is_valid, TARGET_COL]

        ############# Fitting and Predicting ################

        _ = clf.fit(xtrain
                    , ytrain
                    , eval_set=[(xtrain, ytrain), (xvalid, yvalid)]
                    , **fit_params
                    )

        preds_val = clf.predict(xvalid)
        preds_test = clf.predict(xtest)

        # RMSE is computed with numpy: the ``squared=False`` argument of
        # sklearn's mean_squared_error is not available in recent releases.
        fold_score = np.round(_rmse(yvalid, preds_val), 5)
        model_scores.append(fold_score)
        print(f'\nRMSE for validation set is {fold_score}\n')

        oofs[val_pos] = preds_val
        test_predictions += preds_test / folds

    print(f'\nCV RMSE score: {np.round(np.mean(model_scores), 5)}')

    oofs_score = np.round(_rmse(y, oofs), 5)
    print(f'\nOOF RMSE score: {oofs_score}')

    return test_predictions, model_scores, clf, oofs

#### Reading Data

In [None]:
# Folder holding the input CSV files, and the full path of each file in it.
DIRECTORY_PATH = '/content/data'
TRAIN_CSV = f'{DIRECTORY_PATH}/train.csv'
TEST_CSV = f'{DIRECTORY_PATH}/test.csv'
ROADS_CSV = f'{DIRECTORY_PATH}/roads_network.csv'
POPULATION_CSV = f'{DIRECTORY_PATH}/population.csv'
SUBMISSION_CSV = f'{DIRECTORY_PATH}/sample_submission.csv'

In [None]:
# Accident records: the training rows and the test rows.
df_train, df_test = (pd.read_csv(path) for path in (TRAIN_CSV, TEST_CSV))

# Auxiliary tables: road network and population statistics.
df_roads, df_population = (pd.read_csv(path) for path in (ROADS_CSV, POPULATION_CSV))

# Sample submission file.
df_submission = pd.read_csv(SUBMISSION_CSV)

In [None]:
df_train.shape, df_test.shape, df_submission.shape, df_roads.shape, df_population.shape

((478741, 27), (121259, 27), (49772, 2), (91566, 8), (8035, 10))

In [None]:
show_all(df_train.head())

Unnamed: 0,Accident_ID,Police_Force,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),1st_Road_Class,1st_Road_Number,Road_Type,Speed_limit,2nd_Road_Class,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,state,postcode,country
0,1,34,2,1,19/12/12,7,13:20,344,E10000032,4,395,Single carriageway,30,-1,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,Ol or diesel,,1,Yes,England,OX3 9UP,United Kingdom
1,2,5,2,1,02/11/12,4,7:53,102,E09000026,3,13,One way street,30,-1,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Raining without high winds,Dry,,,1,No,England,S35 4EZ,United Kingdom
2,3,1,2,1,02/11/12,4,16:00,531,E10000016,6,8,Roundabout,40,6,0,None within 50 metres,Zebra crossing,Daylight: Street light present,Fine without high winds,Dry,,,1,No,England,BN21 2XR,United Kingdom
3,4,1,1,1,06/05/12,1,16:50,7,E08000035,6,13,Single carriageway,30,6,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,Roadworks,,1,Yes,England,TA20 3PT,United Kingdom
4,5,46,1,1,30/06/12,3,13:25,519,E10000031,3,24,Dual carriageway,30,6,0,None within 50 metres,Zebra crossing,Daylight: Street light present,Fine without high winds,Dry,,,1,No,England,DN20 0QF,United Kingdom


In [None]:
show_all(df_test.head())

Unnamed: 0,Accident_ID,Police_Force,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),1st_Road_Class,1st_Road_Number,Road_Type,Speed_limit,2nd_Road_Class,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,state,postcode,country
0,14,13,2,0,06/10/13,6,13:28,218,E10000032,4,6358,Single carriageway,60,6,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Snowing without high winds,Dry,,,2,Yes,England,HX2 8WH,United Kingdom
1,17,13,2,0,22/04/13,7,9:30,157,E10000034,6,29,Single carriageway,30,-1,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,1,No,England,RM8 1DD,United Kingdom
2,21,13,2,0,27/09/13,3,19:10,155,E09000012,3,5376,Roundabout,40,3,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Wet/Damp,,,1,Yes,England,SE23 1NH,United Kingdom
3,23,13,2,0,13/03/13,4,9:19,26,E10000016,4,1252,Single carriageway,30,-1,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,1,Yes,England,HU10 7QS,United Kingdom
4,28,14,2,0,13/06/13,1,14:59,6,E08000012,4,1202,Single carriageway,30,3,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,1,Yes,England,BD23 5JL,United Kingdom


In [None]:
df_submission.head()

Unnamed: 0,postcode,Accident_risk_index
0,AB10 1AU,0
1,AB10 1PG,0
2,AB10 1TT,0
3,AB10 1YP,0
4,AB10 6LQ,0


In [None]:
df_population.head()

Unnamed: 0,postcode,Rural Urban,Variable: All usual residents; measures: Value,Variable: Males; measures: Value,Variable: Females; measures: Value,Variable: Lives in a household; measures: Value,Variable: Lives in a communal establishment; measures: Value,Variable: Schoolchild or full-time student aged 4 and over at their non term-time address; measures: Value,Variable: Area (Hectares); measures: Value,Variable: Density (number of persons per hectare); measures: Value
0,AL1 1,Total,5453,2715,2738,5408,45,75,225.63,24.2
1,AL1 2,Total,6523,3183,3340,6418,105,77,286.59,22.8
2,AL1 3,Total,4179,2121,2058,4100,79,46,97.12,43.0
3,AL1 4,Total,9799,4845,4954,9765,34,285,244.75,40.0
4,AL1 5,Total,10226,5129,5097,10211,15,133,200.93,50.9


In [None]:
## Cleaning Population data
# Replace the verbose census headers with short names. The assignment is
# positional, so the list must stay in the same order as the CSV columns.
df_population.columns = ['areacode', 'rural_urban', 'residents', 'males', 'females', 'household_pop',
                         'communal_pop', 'children', 'area', 'pop_density']
df_population = df_population.drop(columns='rural_urban')
# Strip the space so the key has the same shape as get_area_code's output
# ("AL1 1" -> "AL11"). The vectorised, literal (non-regex) replace leaves a
# missing postcode as NaN instead of raising like a per-row ``x.replace`` would.
df_population['areacode'] = df_population['areacode'].str.replace(" ", "", regex=False)

In [None]:
df_population.head()

Unnamed: 0,areacode,residents,males,females,household_pop,communal_pop,children,area,pop_density
0,AL11,5453,2715,2738,5408,45,75,225.63,24.2
1,AL12,6523,3183,3340,6418,105,77,286.59,22.8
2,AL13,4179,2121,2058,4100,79,46,97.12,43.0
3,AL14,9799,4845,4954,9765,34,285,244.75,40.0
4,AL15,10226,5129,5097,10211,15,133,200.93,50.9


In [None]:
## Adding features -> population data
# Feature-name prefix -> head-count column it is derived from.
population_groups = {
    'male': 'males',
    'female': 'females',
    'child': 'children',
    'household_pop': 'household_pop',
    'communal_pop': 'communal_pop',
}

# Share of each group among all residents.
for prefix, source in population_groups.items():
    df_population[f'{prefix}_ratio'] = df_population[source] / df_population['residents']

# Head count of each group per unit of area.
for prefix, source in population_groups.items():
    df_population[f'{prefix}_density'] = df_population[source] / df_population['area']

# Males per female.
df_population['sex_ratio'] = df_population['males'] / df_population['females']

In [None]:
show_all(df_population.head())

Unnamed: 0,areacode,residents,males,females,household_pop,communal_pop,children,area,pop_density,male_ratio,female_ratio,child_ratio,household_pop_ratio,communal_pop_ratio,male_density,female_density,child_density,household_pop_density,communal_pop_density,sex_ratio
0,AL11,5453,2715,2738,5408,45,75,225.63,24.2,0.497891,0.502109,0.013754,0.991748,0.008252,12.032974,12.134911,0.332403,23.968444,0.199442,0.9916
1,AL12,6523,3183,3340,6418,105,77,286.59,22.8,0.487966,0.512034,0.011804,0.983903,0.016097,11.106459,11.65428,0.268677,22.394361,0.366377,0.952994
2,AL13,4179,2121,2058,4100,79,46,97.12,43.0,0.507538,0.492462,0.011007,0.981096,0.018904,21.838962,21.19028,0.473641,42.215815,0.813427,1.030612
3,AL14,9799,4845,4954,9765,34,285,244.75,40.0,0.494438,0.505562,0.029085,0.99653,0.00347,19.79571,20.241062,1.164454,39.897855,0.138917,0.977998
4,AL15,10226,5129,5097,10211,15,133,200.93,50.9,0.501565,0.498435,0.013006,0.998533,0.001467,25.526303,25.367043,0.661922,50.818693,0.074653,1.006278


In [None]:
# Notebook-wide constants.
ID_COL = 'Accident_ID'  # row identifier; listed in unwanted_cols, so never a model feature
TARGET_COL = 'Number_of_Casualties'  # column used as the target for the target encoding
RANDOM_STATE = 19920803  # seed passed to the KFold splitters
FOLD_COL = 'Kfold'  # training-frame column that stores each row's validation-fold id
N_TRAILS = 10  # NOTE(review): not referenced in the visible part of the notebook; name looks like a typo of N_TRIALS - confirm before renaming
N_FOLDS = 10  # number of KFold splits

In [None]:
## Combining train and test datasets
# Tag every row with its origin *before* concatenating, instead of inferring
# it afterwards from Number_of_Casualties == 0: the split then no longer
# depends on a sentinel value stored in the target column.
df_full = pd.concat([df_train.assign(is_train=1), df_test.assign(is_train=0)], axis=0)
# Rows without a recorded time fall back to a fixed default of 16:00.
df_full['Time'] = df_full['Time'].fillna('16:00')
df_full.shape

(600000, 28)

In [None]:
# Derive the join key for the population table from each accident's postcode.
df_full['areacode'] = df_full['postcode'].map(get_area_code)

In [None]:
# Left-join the per-areacode population features onto every accident row;
# accidents whose areacode is absent from df_population get NaN in the joined columns.
df_full = join_df(df_full, df_population, left_on='areacode')
df_full.shape

(600000, 48)

In [None]:
show_all(df_full.head())

Unnamed: 0,Accident_ID,Police_Force,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),1st_Road_Class,1st_Road_Number,Road_Type,Speed_limit,2nd_Road_Class,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,state,postcode,country,is_train,areacode,residents,males,females,household_pop,communal_pop,children,area,pop_density,male_ratio,female_ratio,child_ratio,household_pop_ratio,communal_pop_ratio,male_density,female_density,child_density,household_pop_density,communal_pop_density,sex_ratio
0,1,34,2,1,19/12/12,7,13:20,344,E10000032,4,395,Single carriageway,30,-1,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,Ol or diesel,,1,Yes,England,OX3 9UP,United Kingdom,1,OX39,8929.0,4268.0,4661.0,8855.0,74.0,142.0,2575.85,3.5,0.477993,0.522007,0.015903,0.991712,0.008288,1.656929,1.8095,0.055127,3.4377,0.028728,0.915683
1,2,5,2,1,02/11/12,4,7:53,102,E09000026,3,13,One way street,30,-1,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Raining without high winds,Dry,,,1,No,England,S35 4EZ,United Kingdom,1,S354,6360.0,3038.0,3322.0,6341.0,19.0,43.0,265.07,24.0,0.477673,0.522327,0.006761,0.997013,0.002987,11.461123,12.532539,0.162221,23.921983,0.071679,0.914509
2,3,1,2,1,02/11/12,4,16:00,531,E10000016,6,8,Roundabout,40,6,0,None within 50 metres,Zebra crossing,Daylight: Street light present,Fine without high winds,Dry,,,1,No,England,BN21 2XR,United Kingdom,1,BN212,9050.0,4301.0,4749.0,8492.0,558.0,115.0,327.07,27.7,0.475249,0.524751,0.012707,0.938343,0.061657,13.15009,14.519828,0.351607,25.963861,1.706057,0.905664
3,4,1,1,1,06/05/12,1,16:50,7,E08000035,6,13,Single carriageway,30,6,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,Roadworks,,1,Yes,England,TA20 3PT,United Kingdom,1,TA203,2690.0,1326.0,1364.0,2685.0,5.0,43.0,6483.99,0.4,0.492937,0.507063,0.015985,0.998141,0.001859,0.204504,0.210364,0.006632,0.414097,0.000771,0.972141
4,5,46,1,1,30/06/12,3,13:25,519,E10000031,3,24,Dual carriageway,30,6,0,None within 50 metres,Zebra crossing,Daylight: Street light present,Fine without high winds,Dry,,,1,No,England,DN20 0QF,United Kingdom,1,DN200,6875.0,3338.0,3537.0,6754.0,121.0,82.0,8404.23,0.8,0.485527,0.514473,0.011927,0.9824,0.0176,0.397181,0.420859,0.009757,0.803643,0.014398,0.943738


In [None]:
## Generating features from Timestamp
# Parse "dd/mm/yy HH:MM" built from the separate Date and Time columns.
timestamp_text = df_full['Date'] + ' ' + df_full['Time']
df_full['Accident_ts'] = pd.to_datetime(timestamp_text, format='%d/%m/%y %H:%M')

# Earliest accident per postcode (taken over the combined frame) and the
# whole days elapsed since it.
first_seen = df_full.groupby('postcode')['Accident_ts'].transform('min')
df_full['First_Date'] = first_seen
df_full['Days_Since_First_Accident'] = (df_full['Accident_ts'] - first_seen).dt.days

df_full = generate_date_features(df_full, fldname='Accident_ts', time=True)
df_full.shape

(600000, 68)

In [None]:
df_full['postcode'].nunique(), df_full['areacode'].nunique()

(99689, 9421)

In [None]:
show_all(df_full[df_full['postcode']=='OX3 9UP'])

Unnamed: 0,Accident_ID,Police_Force,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),1st_Road_Class,1st_Road_Number,Road_Type,Speed_limit,2nd_Road_Class,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,state,postcode,country,is_train,areacode,residents,males,females,household_pop,communal_pop,children,area,pop_density,male_ratio,female_ratio,child_ratio,household_pop_ratio,communal_pop_ratio,male_density,female_density,child_density,household_pop_density,communal_pop_density,sex_ratio,Accident_ts,First_Date,Days_Since_First_Accident,Accident_ts_Year,Accident_ts_Month,Accident_ts_Week,Accident_ts_Day,Accident_ts_Dayofweek,Accident_ts_Dayofyear,Accident_ts_Is_month_end,Accident_ts_Is_month_start,Accident_ts_Is_quarter_end,Accident_ts_Is_quarter_start,Accident_ts_Is_year_end,Accident_ts_Is_year_start,Accident_ts_Hour,Accident_ts_Minute,Accident_ts_Second,Accident_ts_Is_weekend,Accident_ts_Elapsed
0,1,34,2,1,19/12/12,7,13:20,344,E10000032,4,395,Single carriageway,30,-1,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,Ol or diesel,,1,Yes,England,OX3 9UP,United Kingdom,1,OX39,8929.0,4268.0,4661.0,8855.0,74.0,142.0,2575.85,3.5,0.477993,0.522007,0.015903,0.991712,0.008288,1.656929,1.8095,0.055127,3.4377,0.028728,0.915683,2012-12-19 13:20:00,2012-01-14 19:20:00,339,2012,12,51,19,2,354,False,False,False,False,False,False,13,20,0,False,1355923200
91385,114321,1,1,1,26/04/12,4,13:30,728,S12000041,6,5,Single carriageway,30,6,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Wet/Damp,,,1,Yes,Cymru / Wales,OX3 9UP,United Kingdom,1,OX39,8929.0,4268.0,4661.0,8855.0,74.0,142.0,2575.85,3.5,0.477993,0.522007,0.015903,0.991712,0.008288,1.656929,1.8095,0.055127,3.4377,0.028728,0.915683,2012-04-26 13:30:00,2012-01-14 19:20:00,102,2012,4,17,26,3,117,False,False,False,False,False,False,13,30,0,False,1335447000
121414,151919,21,2,3,17/02/12,1,14:40,325,E06000029,4,353,Dual carriageway,70,-1,0,None within 50 metres,No physical crossing within 50 meters,Darkness: Street lights present and lit,Fine without high winds,Dry,,,2,Yes,England,OX3 9UP,United Kingdom,1,OX39,8929.0,4268.0,4661.0,8855.0,74.0,142.0,2575.85,3.5,0.477993,0.522007,0.015903,0.991712,0.008288,1.656929,1.8095,0.055127,3.4377,0.028728,0.915683,2012-02-17 14:40:00,2012-01-14 19:20:00,33,2012,2,7,17,4,48,False,False,False,False,False,False,14,40,0,False,1329489600
180653,226233,34,2,2,14/01/12,6,19:20,347,E08000032,3,308,Single carriageway,60,6,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,2,Yes,England,OX3 9UP,United Kingdom,1,OX39,8929.0,4268.0,4661.0,8855.0,74.0,142.0,2575.85,3.5,0.477993,0.522007,0.015903,0.991712,0.008288,1.656929,1.8095,0.055127,3.4377,0.028728,0.915683,2012-01-14 19:20:00,2012-01-14 19:20:00,0,2012,1,2,14,5,14,False,False,False,False,False,False,19,20,0,True,1326568800
210141,263194,62,2,1,22/09/12,4,8:42,778,E10000017,6,335,Single carriageway,30,6,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,1,Yes,Cymru / Wales,OX3 9UP,United Kingdom,1,OX39,8929.0,4268.0,4661.0,8855.0,74.0,142.0,2575.85,3.5,0.477993,0.522007,0.015903,0.991712,0.008288,1.656929,1.8095,0.055127,3.4377,0.028728,0.915683,2012-09-22 08:42:00,2012-01-14 19:20:00,251,2012,9,38,22,5,266,False,False,False,False,False,False,8,42,0,True,1348303320
337835,423353,93,4,3,26/05/12,3,8:53,922,W06000015,3,325,Single carriageway,70,-1,0,None within 50 metres,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,2,Yes,Alba / Scotland,OX3 9UP,United Kingdom,1,OX39,8929.0,4268.0,4661.0,8855.0,74.0,142.0,2575.85,3.5,0.477993,0.522007,0.015903,0.991712,0.008288,1.656929,1.8095,0.055127,3.4377,0.028728,0.915683,2012-05-26 08:53:00,2012-01-14 19:20:00,132,2012,5,21,26,5,147,False,False,False,False,False,False,8,53,0,True,1338022380
525113,229950,95,2,0,20/06/13,5,17:25,928,E06000039,3,485,Dual carriageway,30,6,0,None within 50 metres,No physical crossing within 50 meters,Darkness: Street lights present and lit,Raining with high winds,Dry,,,1,No,Alba / Scotland,OX3 9UP,United Kingdom,0,OX39,8929.0,4268.0,4661.0,8855.0,74.0,142.0,2575.85,3.5,0.477993,0.522007,0.015903,0.991712,0.008288,1.656929,1.8095,0.055127,3.4377,0.028728,0.915683,2013-06-20 17:25:00,2012-01-14 19:20:00,522,2013,6,25,20,3,171,False,False,False,False,False,False,17,25,0,False,1371749100


In [None]:
## Filling NA values for categorical columns by mode
for cat_col in ('Road_Surface_Conditions', 'Special_Conditions_at_Site'):
    most_frequent = df_full[cat_col].mode()[0]
    df_full[cat_col] = df_full[cat_col].fillna(most_frequent)

In [None]:
## Checking NA values in dataframe
show_all(df_full.isna().sum())

Accident_ID                                        0
Police_Force                                       0
Number_of_Vehicles                                 0
Number_of_Casualties                               0
Date                                               0
Day_of_Week                                        0
Time                                               0
Local_Authority_(District)                         0
Local_Authority_(Highway)                          0
1st_Road_Class                                     0
1st_Road_Number                                    0
Road_Type                                          0
Speed_limit                                        0
2nd_Road_Class                                     0
2nd_Road_Number                                    0
Pedestrian_Crossing-Human_Control                  0
Pedestrian_Crossing-Physical_Facilities            0
Light_Conditions                                   0
Weather_Conditions                            

In [None]:
def time_of_day(hour):
    """Map an hour of the day to a time-of-day bucket label.

    Buckets (lower bound inclusive, upper bound exclusive):
    "1" for 5-10, "2" for 10-15, "3" for 15-19, "4" for 19-23 and "5" for
    everything else.
    """
    if 5 <= hour < 10:
        return "1"
    if 10 <= hour < 15:
        return "2"
    if 15 <= hour < 19:
        return "3"
    if 19 <= hour < 23:
        return "4"
    return "5"

In [None]:
# Bucket the accident hour into the labels returned by time_of_day.
df_full['day_group'] = df_full['Accident_ts_Hour'].apply(time_of_day)

In [None]:
# Numeric columns that originate from df_population; they are NaN for rows
# whose areacode found no match in the left join.
na_cols = ['residents', 'males', 'females', 'household_pop',
            'communal_pop', 'children', 'area', 'pop_density', 'male_ratio',
            'female_ratio', 'child_ratio', 'household_pop_ratio',
            'communal_pop_ratio', 'male_density', 'female_density', 'child_density',
            'household_pop_density', 'communal_pop_density', 'sex_ratio'
           ]

In [None]:
## filling NA values by median
column_medians = df_full[na_cols].median()
df_full[na_cols] = df_full[na_cols].fillna(column_medians)

In [None]:
df_full.columns

Index(['Accident_ID', 'Police_Force', 'Number_of_Vehicles',
       'Number_of_Casualties', 'Date', 'Day_of_Week', 'Time',
       'Local_Authority_(District)', 'Local_Authority_(Highway)',
       '1st_Road_Class', '1st_Road_Number', 'Road_Type', 'Speed_limit',
       '2nd_Road_Class', '2nd_Road_Number',
       'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards',
       'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident',
       'state', 'postcode', 'country', 'is_train', 'areacode', 'residents',
       'males', 'females', 'household_pop', 'communal_pop', 'children', 'area',
       'pop_density', 'male_ratio', 'female_ratio', 'child_ratio',
       'household_pop_ratio', 'communal_pop_ratio', 'male_density',
       'female_density', 'child_density', 'household_pop_density',
       'communal_pop_density', 

In [None]:
# Columns that must never be used as model inputs (identifier, target,
# fold id, train/test flag and a few dropped raw columns).
unwanted_cols = [ID_COL, 
                 TARGET_COL, 
                 FOLD_COL, 
                 'is_train', 
                 '2nd_Road_Number',
                 'country',
                 'Accident_ts_Second',
                 'Accident_ts_Year'
                 ]

# Raw date/time columns; excluded from the final feature list.
date_cols = ['Date', 
             'Time', 
             'Accident_ts', 
             'First_Date',
             ]

# Numeric columns that are aggregated per category in the aggregate-feature cell.
num_cols = [
 'Police_Force',
 'Number_of_Vehicles',
 '1st_Road_Number',
 'Speed_limit',
 'Days_Since_First_Accident',
 ]

# Columns treated as categorical: they feed the count features and the
# target encoding, and their raw values are excluded from the final feature list.
object_cols = [
 'Day_of_Week',
 'Local_Authority_(District)',
 'Local_Authority_(Highway)',
 '1st_Road_Class',
 'Road_Type',
 '2nd_Road_Class',
 'Pedestrian_Crossing-Human_Control',
 'Pedestrian_Crossing-Physical_Facilities',
 'Light_Conditions',
 'Weather_Conditions',
 'Road_Surface_Conditions',
 'Special_Conditions_at_Site',
 'Carriageway_Hazards',
 'Urban_or_Rural_Area',
 'Did_Police_Officer_Attend_Scene_of_Accident',
 'state',
 'postcode',
 'day_group',
 'areacode'
 ]

len(num_cols), len(object_cols)

(5, 19)

In [None]:
## Adding aggregate feats
# Categorical columns whose groups are summarised.
group_keys = [
    'Day_of_Week',
    'Local_Authority_(District)',
    'Local_Authority_(Highway)',
    '1st_Road_Class',
    'Road_Type',
    'postcode',
    'day_group',
]
# Output-column suffix -> pandas aggregation, in column-creation order.
aggregations = {'min': 'min', 'max': 'max', 'avg': 'mean', 'median': 'median', 'sum': 'sum'}

for ocol in tqdm_notebook(group_keys):
    print(ocol)
    for ncol in num_cols:
        for suffix, how in aggregations.items():
            df_full[f"{ocol}_{ncol}_{suffix}"] = df_full.groupby([ocol])[ncol].transform(how)
        # Deviation of each row from its group's mean.
        df_full[f"{ocol}_{ncol}_diff"] = df_full[ncol] - df_full[f"{ocol}_{ncol}_avg"]

  0%|          | 0/7 [00:00<?, ?it/s]

Day_of_Week
Local_Authority_(District)
Local_Authority_(Highway)
1st_Road_Class
Road_Type
postcode
day_group


In [None]:
show_all(df_full.shape)

(600000, 279)

In [None]:
# Per-postcode totals: number of accident rows and number of distinct dates.
by_postcode = df_full.groupby(['postcode'])
df_full['accidents_postcode_count'] = by_postcode['Accident_ID'].transform('count')
df_full['days_postcode_unique'] = by_postcode['Date'].transform('nunique')

# For every categorical value: distinct postcodes and non-null postcode rows
# that share the value.
for col in tqdm_notebook(object_cols):
    postcodes_by_value = df_full.groupby([col])['postcode']
    df_full[f'{col}_unq'] = postcodes_by_value.transform('nunique')
    df_full[f'{col}_cnt'] = postcodes_by_value.transform('count')

  0%|          | 0/19 [00:00<?, ?it/s]

In [None]:
## Splitting into train and test datasets
# Take explicit copies: later cells assign new columns (fold ids, target
# encodings) to these frames, and writing into a filtered slice of df_full is
# a chained assignment that pandas flags with SettingWithCopyWarning.
train_df = df_full[df_full['is_train']==1].copy()
test_df = df_full[df_full['is_train']==0].copy()
train_df.shape, test_df.shape

((478741, 319), (121259, 319))

In [None]:
## Adding Kfolds to train data
kf = model_selection.KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
# KFold.split yields *positional* row indices. Collect the fold ids in a
# positional array and attach it as a column, rather than writing through the
# label-based .loc, which is only correct when the index is exactly 0..n-1.
# The array starts as NaN so the column keeps its float dtype.
fold_ids = np.full(len(train_df), np.nan)
for fold, (_, valid_pos) in enumerate(kf.split(X=train_df)):
    fold_ids[valid_pos] = fold
train_df[FOLD_COL] = fold_ids

In [None]:
train_df[FOLD_COL].value_counts()

0.0    47875
5.0    47874
2.0    47874
8.0    47874
1.0    47874
7.0    47874
3.0    47874
6.0    47874
4.0    47874
9.0    47874
Name: Kfold, dtype: int64

#### Target encoding - Categorical columns

In [None]:
%%time
# Target Encoding
X = train_df[object_cols]
y = train_df[TARGET_COL].values

enc_train = np.zeros(X.shape)
smoothing = 0.3

Folds = model_selection.KFold(n_splits=N_FOLDS, random_state=RANDOM_STATE, shuffle=True)

for train_idx, valid_idx in Folds.split(X):
    encoder = enc.TargetEncoder(cols=object_cols, smoothing=smoothing)
  
    encoder.fit(X.loc[train_idx], y[train_idx])
    enc_train[valid_idx, :] = encoder.transform(X.iloc[valid_idx], y[valid_idx])

encoder.fit(X, y)
enc_test = encoder.transform(test_df[object_cols]).values

for idx, col in enumerate(object_cols):
    col = 'tE_' + col 
    train_df[col] = enc_train[:,idx]
    test_df[col] = enc_test[:, idx]

CPU times: user 1min 56s, sys: 2.11 s, total: 1min 58s
Wall time: 1min 57s


In [None]:
## final feature set
# Everything in train_df except bookkeeping, raw categorical and raw date columns.
excluded_cols = set(unwanted_cols) | set(object_cols) | set(date_cols)
features = [col for col in train_df.columns if col not in excluded_cols]
len(features)

308

In [None]:
# reduce_memory_usage downcasts columns on the frame it receives and returns
# that same object, so the *_low_mem names refer to the (now modified)
# train_df / test_df rather than to separate copies.
train_df_low_mem = reduce_memory_usage(train_df)
test_df_low_mem = reduce_memory_usage(test_df)

  0%|          | 0/339 [00:00<?, ?it/s]

Mem. usage decreased to 401.01 Mb (67.5% reduction)


  0%|          | 0/338 [00:00<?, ?it/s]

Mem. usage decreased to 97.25 Mb (68.4% reduction)


In [None]:
# Extra keyword arguments for the estimator's fit(); run_clf forwards its
# fit_params argument as **fit_params.
# NOTE(review): recent LightGBM / XGBoost releases may no longer accept
# `verbose` / `early_stopping_rounds` as fit() keywords - confirm against the
# installed versions.
fit_params = {'verbose': 500, 'early_stopping_rounds': 200}

In [None]:
show_all(train_df_low_mem[features].head())

Unnamed: 0,Police_Force,Number_of_Vehicles,1st_Road_Number,Speed_limit,residents,males,females,household_pop,communal_pop,children,area,pop_density,male_ratio,female_ratio,child_ratio,household_pop_ratio,communal_pop_ratio,male_density,female_density,child_density,household_pop_density,communal_pop_density,sex_ratio,Days_Since_First_Accident,Accident_ts_Month,Accident_ts_Week,Accident_ts_Day,Accident_ts_Dayofweek,Accident_ts_Dayofyear,Accident_ts_Is_month_end,Accident_ts_Is_month_start,Accident_ts_Is_quarter_end,Accident_ts_Is_quarter_start,Accident_ts_Is_year_end,Accident_ts_Is_year_start,Accident_ts_Hour,Accident_ts_Minute,Accident_ts_Is_weekend,Accident_ts_Elapsed,Day_of_Week_Police_Force_min,Day_of_Week_Police_Force_max,Day_of_Week_Police_Force_avg,Day_of_Week_Police_Force_median,Day_of_Week_Police_Force_sum,Day_of_Week_Police_Force_diff,Day_of_Week_Number_of_Vehicles_min,Day_of_Week_Number_of_Vehicles_max,Day_of_Week_Number_of_Vehicles_avg,Day_of_Week_Number_of_Vehicles_median,Day_of_Week_Number_of_Vehicles_sum,Day_of_Week_Number_of_Vehicles_diff,Day_of_Week_1st_Road_Number_min,Day_of_Week_1st_Road_Number_max,Day_of_Week_1st_Road_Number_avg,Day_of_Week_1st_Road_Number_median,Day_of_Week_1st_Road_Number_sum,Day_of_Week_1st_Road_Number_diff,Day_of_Week_Speed_limit_min,Day_of_Week_Speed_limit_max,Day_of_Week_Speed_limit_avg,Day_of_Week_Speed_limit_median,Day_of_Week_Speed_limit_sum,Day_of_Week_Speed_limit_diff,Day_of_Week_Days_Since_First_Accident_min,Day_of_Week_Days_Since_First_Accident_max,Day_of_Week_Days_Since_First_Accident_avg,Day_of_Week_Days_Since_First_Accident_median,Day_of_Week_Days_Since_First_Accident_sum,Day_of_Week_Days_Since_First_Accident_diff,Local_Authority_(District)_Police_Force_min,Local_Authority_(District)_Police_Force_max,Local_Authority_(District)_Police_Force_avg,Local_Authority_(District)_Police_Force_median,Local_Authority_(District)_Police_Force_sum,Local_Authority_(District)_Police_Force_diff,Local_Authority_(District)_Number_of_Veh
icles_min,Local_Authority_(District)_Number_of_Vehicles_max,Local_Authority_(District)_Number_of_Vehicles_avg,Local_Authority_(District)_Number_of_Vehicles_median,Local_Authority_(District)_Number_of_Vehicles_sum,Local_Authority_(District)_Number_of_Vehicles_diff,Local_Authority_(District)_1st_Road_Number_min,Local_Authority_(District)_1st_Road_Number_max,Local_Authority_(District)_1st_Road_Number_avg,Local_Authority_(District)_1st_Road_Number_median,Local_Authority_(District)_1st_Road_Number_sum,Local_Authority_(District)_1st_Road_Number_diff,Local_Authority_(District)_Speed_limit_min,Local_Authority_(District)_Speed_limit_max,Local_Authority_(District)_Speed_limit_avg,Local_Authority_(District)_Speed_limit_median,Local_Authority_(District)_Speed_limit_sum,Local_Authority_(District)_Speed_limit_diff,Local_Authority_(District)_Days_Since_First_Accident_min,Local_Authority_(District)_Days_Since_First_Accident_max,Local_Authority_(District)_Days_Since_First_Accident_avg,Local_Authority_(District)_Days_Since_First_Accident_median,Local_Authority_(District)_Days_Since_First_Accident_sum,Local_Authority_(District)_Days_Since_First_Accident_diff,Local_Authority_(Highway)_Police_Force_min,Local_Authority_(Highway)_Police_Force_max,Local_Authority_(Highway)_Police_Force_avg,Local_Authority_(Highway)_Police_Force_median,Local_Authority_(Highway)_Police_Force_sum,Local_Authority_(Highway)_Police_Force_diff,Local_Authority_(Highway)_Number_of_Vehicles_min,Local_Authority_(Highway)_Number_of_Vehicles_max,Local_Authority_(Highway)_Number_of_Vehicles_avg,Local_Authority_(Highway)_Number_of_Vehicles_median,Local_Authority_(Highway)_Number_of_Vehicles_sum,Local_Authority_(Highway)_Number_of_Vehicles_diff,Local_Authority_(Highway)_1st_Road_Number_min,Local_Authority_(Highway)_1st_Road_Number_max,Local_Authority_(Highway)_1st_Road_Number_avg,Local_Authority_(Highway)_1st_Road_Number_median,Local_Authority_(Highway)_1st_Road_Number_sum,Local_Authority_(Highway)_1st_Road_Number_diff,Lo
cal_Authority_(Highway)_Speed_limit_min,Local_Authority_(Highway)_Speed_limit_max,Local_Authority_(Highway)_Speed_limit_avg,Local_Authority_(Highway)_Speed_limit_median,Local_Authority_(Highway)_Speed_limit_sum,Local_Authority_(Highway)_Speed_limit_diff,Local_Authority_(Highway)_Days_Since_First_Accident_min,Local_Authority_(Highway)_Days_Since_First_Accident_max,Local_Authority_(Highway)_Days_Since_First_Accident_avg,Local_Authority_(Highway)_Days_Since_First_Accident_median,Local_Authority_(Highway)_Days_Since_First_Accident_sum,Local_Authority_(Highway)_Days_Since_First_Accident_diff,1st_Road_Class_Police_Force_min,1st_Road_Class_Police_Force_max,1st_Road_Class_Police_Force_avg,1st_Road_Class_Police_Force_median,1st_Road_Class_Police_Force_sum,1st_Road_Class_Police_Force_diff,1st_Road_Class_Number_of_Vehicles_min,1st_Road_Class_Number_of_Vehicles_max,1st_Road_Class_Number_of_Vehicles_avg,1st_Road_Class_Number_of_Vehicles_median,1st_Road_Class_Number_of_Vehicles_sum,1st_Road_Class_Number_of_Vehicles_diff,1st_Road_Class_1st_Road_Number_min,1st_Road_Class_1st_Road_Number_max,1st_Road_Class_1st_Road_Number_avg,1st_Road_Class_1st_Road_Number_median,1st_Road_Class_1st_Road_Number_sum,1st_Road_Class_1st_Road_Number_diff,1st_Road_Class_Speed_limit_min,1st_Road_Class_Speed_limit_max,1st_Road_Class_Speed_limit_avg,1st_Road_Class_Speed_limit_median,1st_Road_Class_Speed_limit_sum,1st_Road_Class_Speed_limit_diff,1st_Road_Class_Days_Since_First_Accident_min,1st_Road_Class_Days_Since_First_Accident_max,1st_Road_Class_Days_Since_First_Accident_avg,1st_Road_Class_Days_Since_First_Accident_median,1st_Road_Class_Days_Since_First_Accident_sum,1st_Road_Class_Days_Since_First_Accident_diff,Road_Type_Police_Force_min,Road_Type_Police_Force_max,Road_Type_Police_Force_avg,Road_Type_Police_Force_median,Road_Type_Police_Force_sum,Road_Type_Police_Force_diff,Road_Type_Number_of_Vehicles_min,Road_Type_Number_of_Vehicles_max,Road_Type_Number_of_Vehicles_avg,Road_Type_Number_of_Vehicles_median
,Road_Type_Number_of_Vehicles_sum,Road_Type_Number_of_Vehicles_diff,Road_Type_1st_Road_Number_min,Road_Type_1st_Road_Number_max,Road_Type_1st_Road_Number_avg,Road_Type_1st_Road_Number_median,Road_Type_1st_Road_Number_sum,Road_Type_1st_Road_Number_diff,Road_Type_Speed_limit_min,Road_Type_Speed_limit_max,Road_Type_Speed_limit_avg,Road_Type_Speed_limit_median,Road_Type_Speed_limit_sum,Road_Type_Speed_limit_diff,Road_Type_Days_Since_First_Accident_min,Road_Type_Days_Since_First_Accident_max,Road_Type_Days_Since_First_Accident_avg,Road_Type_Days_Since_First_Accident_median,Road_Type_Days_Since_First_Accident_sum,Road_Type_Days_Since_First_Accident_diff,postcode_Police_Force_min,postcode_Police_Force_max,postcode_Police_Force_avg,postcode_Police_Force_median,postcode_Police_Force_sum,postcode_Police_Force_diff,postcode_Number_of_Vehicles_min,postcode_Number_of_Vehicles_max,postcode_Number_of_Vehicles_avg,postcode_Number_of_Vehicles_median,postcode_Number_of_Vehicles_sum,postcode_Number_of_Vehicles_diff,postcode_1st_Road_Number_min,postcode_1st_Road_Number_max,postcode_1st_Road_Number_avg,postcode_1st_Road_Number_median,postcode_1st_Road_Number_sum,postcode_1st_Road_Number_diff,postcode_Speed_limit_min,postcode_Speed_limit_max,postcode_Speed_limit_avg,postcode_Speed_limit_median,postcode_Speed_limit_sum,postcode_Speed_limit_diff,postcode_Days_Since_First_Accident_min,postcode_Days_Since_First_Accident_max,postcode_Days_Since_First_Accident_avg,postcode_Days_Since_First_Accident_median,postcode_Days_Since_First_Accident_sum,postcode_Days_Since_First_Accident_diff,day_group_Police_Force_min,day_group_Police_Force_max,day_group_Police_Force_avg,day_group_Police_Force_median,day_group_Police_Force_sum,day_group_Police_Force_diff,day_group_Number_of_Vehicles_min,day_group_Number_of_Vehicles_max,day_group_Number_of_Vehicles_avg,day_group_Number_of_Vehicles_median,day_group_Number_of_Vehicles_sum,day_group_Number_of_Vehicles_diff,day_group_1st_Road_Number_min,day_group_1st_Road_N
umber_max,day_group_1st_Road_Number_avg,day_group_1st_Road_Number_median,day_group_1st_Road_Number_sum,day_group_1st_Road_Number_diff,day_group_Speed_limit_min,day_group_Speed_limit_max,day_group_Speed_limit_avg,day_group_Speed_limit_median,day_group_Speed_limit_sum,day_group_Speed_limit_diff,day_group_Days_Since_First_Accident_min,day_group_Days_Since_First_Accident_max,day_group_Days_Since_First_Accident_avg,day_group_Days_Since_First_Accident_median,day_group_Days_Since_First_Accident_sum,day_group_Days_Since_First_Accident_diff,accidents_postcode_count,days_postcode_unique,Day_of_Week_unq,Day_of_Week_cnt,Local_Authority_(District)_unq,Local_Authority_(District)_cnt,Local_Authority_(Highway)_unq,Local_Authority_(Highway)_cnt,1st_Road_Class_unq,1st_Road_Class_cnt,Road_Type_unq,Road_Type_cnt,2nd_Road_Class_unq,2nd_Road_Class_cnt,Pedestrian_Crossing-Human_Control_unq,Pedestrian_Crossing-Human_Control_cnt,Pedestrian_Crossing-Physical_Facilities_unq,Pedestrian_Crossing-Physical_Facilities_cnt,Light_Conditions_unq,Light_Conditions_cnt,Weather_Conditions_unq,Weather_Conditions_cnt,Road_Surface_Conditions_unq,Road_Surface_Conditions_cnt,Special_Conditions_at_Site_unq,Special_Conditions_at_Site_cnt,Carriageway_Hazards_unq,Carriageway_Hazards_cnt,Urban_or_Rural_Area_unq,Urban_or_Rural_Area_cnt,Did_Police_Officer_Attend_Scene_of_Accident_unq,Did_Police_Officer_Attend_Scene_of_Accident_cnt,state_unq,state_cnt,postcode_unq,postcode_cnt,day_group_unq,day_group_cnt,areacode_unq,areacode_cnt,tE_Day_of_Week,tE_Local_Authority_(District),tE_Local_Authority_(Highway),tE_1st_Road_Class,tE_Road_Type,tE_2nd_Road_Class,tE_Pedestrian_Crossing-Human_Control,tE_Pedestrian_Crossing-Physical_Facilities,tE_Light_Conditions,tE_Weather_Conditions,tE_Road_Surface_Conditions,tE_Special_Conditions_at_Site,tE_Carriageway_Hazards,tE_Urban_or_Rural_Area,tE_Did_Police_Officer_Attend_Scene_of_Accident,tE_state,tE_postcode,tE_day_group,tE_areacode
0,34,2,395,30,8928.0,4268.0,4660.0,8856.0,74.0,142.0,2576.0,3.5,0.478027,0.521973,0.0159,0.991699,0.008286,1.657227,1.80957,0.055115,3.4375,0.028732,0.915527,339,12,51,19,2,354,False,False,False,False,False,False,13,20,False,1355923200,1,98,24.578125,15.0,2354876,9.421875,1,4,1.704102,2.0,163256,0.296143,0,9832,939.5,325.0,90013777,-544.5,20,70,37.09375,30.0,3555100,-7.105469,0,723,196.875,151.0,18864825,142.125,1,95,29.96875,33.0,22028,4.03125,1,4,1.78125,2.0,1309,0.218994,0,9395,1073.0,416.0,788381,-677.5,20,70,42.96875,30.0,31580,-12.96875,0,656,123.125,86.0,90491,215.875,1,96,30.234375,33.0,544583,3.763672,1,4,1.678711,2.0,30229,0.321533,0,9803,1075.0,447.0,19353340,-679.5,20,70,44.125,40.0,794900,-14.132812,0,717,129.125,93.0,2324922,209.875,1,98,25.828125,21.0,2566902,8.164062,1,4,1.833984,2.0,182199,0.16626,0,9832,1538.0,502.0,152814152,-1143.0,20,70,37.0,30.0,3675770,-6.996094,0,723,200.5,158.0,19924971,138.5,1,98,27.0,21.0,12882090,6.996094,1,4,1.727539,2.0,824343,0.271973,0,9832,900.0,160.0,429330416,-505.0,20,70,36.21875,30.0,17280330,-6.222656,0,727,184.625,140.0,88085100,154.375,1,95,48.5625,34.0,340,-14.570312,1,4,2.142578,2.0,15,-0.142822,5,485,315.25,335.0,2206,79.875,30,70,45.71875,30.0,320,-15.710938,0,522,197.0,132.0,1379,142.0,1,98,27.734375,21.0,5581957,6.269531,1,4,1.769531,2.0,356137,0.230835,0,9832,984.0,328.0,198126563,-589.0,20,70,36.125,30.0,7274630,-6.136719,0,727,177.125,133.0,35660114,161.875,7,7,56123,95812,731,735,15958,18011,56717,99358,96838,477049,84193,255784,99115,569874,97337,504070,95618,464633,96082,467111,95216,448670,1556,1575,99200,576563,90440,380682,96709,492180,98088,523528,1,7,77654,201303,15,74,1.459961,1.5,1.486328,1.482422,1.453125,1.475586,1.462891,1.463867,1.467773,1.466797,1.46875,1.523438,1.462891,1.436523,1.467773,1.461914,2.0,1.46875,1.644531
1,5,2,13,30,6360.0,3038.0,3322.0,6340.0,19.0,43.0,265.0,24.0,0.477783,0.522461,0.00676,0.99707,0.002987,11.460938,12.53125,0.162231,23.921875,0.071655,0.914551,246,11,44,2,4,307,False,False,False,False,False,False,7,53,False,1351842780,1,98,28.71875,22.0,1617099,-23.71875,1,4,1.78418,2.0,100456,0.216187,0,9832,1005.0,354.0,56586116,-992.0,20,70,36.25,30.0,2041550,-6.25,0,720,172.5,130.0,9714457,73.5,1,96,9.195312,6.0,11967,-4.199219,1,4,1.908203,2.0,2482,0.092224,0,7351,1510.0,18.0,1963995,-1497.0,20,70,33.125,30.0,43090,-3.121094,0,712,187.0,132.0,243361,58.9375,1,97,12.476562,6.0,33455,-7.480469,1,4,1.866211,2.0,5003,0.133911,0,9326,1462.0,374.0,3920370,-1449.0,20,70,33.5625,30.0,89950,-3.550781,0,710,219.75,173.0,589021,26.296875,1,98,32.34375,22.0,6479986,-27.359375,1,4,1.775391,2.0,355583,0.224609,0,9832,1174.0,451.0,235051256,-1161.0,20,70,39.6875,30.0,7949210,-9.6875,0,724,159.0,111.0,31837015,87.0625,1,98,25.828125,7.0,696395,-20.828125,1,4,1.970703,2.0,53149,0.029251,0,9754,1435.0,430.0,38706272,-1422.0,20,70,33.53125,30.0,903890,-3.515625,0,712,150.125,119.0,4048893,95.875,1,52,15.585938,12.5,187,-10.585938,1,4,2.0,2.0,24,0.0,0,6434,763.5,20.0,9161,-750.5,30,40,30.828125,30.0,370,-0.833496,0,652,417.5,466.5,5009,-171.375,1,98,27.6875,21.0,2760582,-22.6875,1,4,1.744141,2.0,173902,0.255859,0,9832,956.5,332.0,95374034,-943.5,20,70,36.53125,30.0,3643600,-6.542969,0,723,178.375,133.0,17784527,67.625,12,12,39640,56316,1281,1301,2638,2681,75385,200278,21679,26969,84193,255784,99115,569874,97337,504070,95618,464633,47345,76431,95216,448670,98170,535707,99200,576563,90440,380682,59056,107820,98088,523528,1,12,57207,99703,8,29,1.47168,1.456055,1.490234,1.510742,1.486328,1.475586,1.462891,1.463867,1.467773,1.442383,1.46875,1.463867,1.462891,1.435547,1.447266,1.461914,1.499023,1.462891,1.333008
2,1,2,8,40,9048.0,4300.0,4748.0,8496.0,558.0,115.0,327.0,27.703125,0.475342,0.524902,0.012711,0.938477,0.061646,13.148438,14.523438,0.351562,25.96875,1.706055,0.905762,265,11,44,2,4,307,False,False,False,False,False,False,16,0,False,1351872000,1,98,28.71875,22.0,1617099,-27.71875,1,4,1.78418,2.0,100456,0.216187,0,9832,1005.0,354.0,56586116,-997.0,20,70,36.25,30.0,2041550,3.748047,0,720,172.5,130.0,9714457,92.5,1,63,41.90625,44.0,51980,-40.90625,1,4,1.773438,2.0,2199,0.226562,0,9214,1180.0,480.0,1463433,-1172.0,20,70,36.28125,30.0,44990,3.716797,0,689,154.125,127.0,191047,110.9375,1,98,30.28125,43.0,1028900,-29.28125,1,4,1.831055,2.0,62209,0.169189,0,9693,978.0,22.0,33230716,-970.0,20,70,34.5,30.0,1172570,5.492188,0,722,147.25,120.0,5003804,117.75,1,98,24.40625,14.0,5104189,-23.40625,1,4,1.703125,2.0,356072,0.296875,0,9572,457.5,10.0,95653889,-449.5,20,70,31.359375,30.0,6556040,8.640625,0,727,195.375,154.0,40858288,69.5625,1,98,22.296875,13.0,769775,-21.296875,1,4,1.931641,2.0,66704,0.067932,0,9788,1437.0,445.0,49627859,-1429.0,20,70,34.71875,30.0,1198390,5.289062,0,722,172.875,132.0,5967572,92.125,1,95,31.71875,32.0,444,-30.71875,1,3,1.857422,2.0,26,0.142822,0,2228,570.0,16.0,7979,-562.0,20,60,34.28125,30.0,480,5.714844,0,573,219.75,215.0,3076,45.28125,1,98,28.28125,21.0,5149469,-27.28125,1,4,1.756836,2.0,320018,0.242798,0,9832,954.0,300.0,173765221,-946.0,20,70,36.03125,30.0,6564900,3.953125,0,722,174.5,130.0,31782574,90.5,14,14,39640,56316,1226,1240,25839,33979,77281,209071,26915,34525,79320,247729,99115,569874,26848,33895,95618,464633,96082,467111,95216,448670,98170,535707,99200,576563,90440,380682,59056,107820,98088,523528,1,14,74898,182123,12,73,1.469727,1.410156,1.414062,1.407227,1.503906,1.426758,1.461914,1.452148,1.466797,1.466797,1.46875,1.462891,1.462891,1.436523,1.445312,1.460938,1.400391,1.461914,1.301758
3,1,1,13,30,2690.0,1326.0,1364.0,2684.0,5.0,43.0,6484.0,0.399902,0.49292,0.506836,0.015991,0.998047,0.001859,0.204468,0.210327,0.00663,0.414062,0.000771,0.972168,101,5,18,6,6,127,False,False,False,False,False,False,16,50,True,1336323000,1,98,22.1875,14.0,1071647,-21.1875,1,4,1.708984,2.0,82538,-0.708984,0,9832,870.0,28.0,42016676,-857.0,20,70,36.1875,30.0,1748320,-6.203125,0,722,218.125,184.0,10532312,-117.0625,1,63,6.988281,1.0,38356,-5.988281,1,4,1.859375,2.0,10208,-0.859375,0,9218,463.25,12.0,2542779,-450.25,20,70,30.15625,30.0,165560,-0.156616,0,717,282.0,268.5,1547549,-180.875,1,96,29.4375,36.0,401457,-28.4375,1,4,1.708008,2.0,23299,-0.708496,0,9687,960.0,370.0,13094539,-947.0,20,70,39.1875,30.0,534630,-9.195312,0,721,158.75,119.0,2165176,-57.75,1,98,24.40625,14.0,5104189,-23.40625,1,4,1.703125,2.0,356072,-0.703125,0,9572,457.5,10.0,95653889,-444.5,20,70,31.359375,30.0,6556040,-1.358398,0,727,195.375,154.0,40858288,-94.4375,1,98,27.0,21.0,12882090,-26.0,1,4,1.727539,2.0,824343,-0.728027,0,9832,900.0,160.0,429330416,-887.0,20,70,36.21875,30.0,17280330,-6.222656,0,727,184.625,140.0,88085100,-83.625,1,52,25.328125,22.0,304,-24.328125,1,2,1.75,2.0,21,-0.75,3,5275,1184.0,433.5,14204,-1171.0,30,70,39.15625,30.0,470,-9.164062,0,328,136.875,120.5,1643,-35.90625,1,98,28.28125,21.0,5149469,-27.28125,1,4,1.756836,2.0,320018,-0.757324,0,9832,954.0,300.0,173765221,-941.0,20,70,36.03125,30.0,6564900,-6.046875,0,722,174.5,130.0,31782574,-73.5,12,11,35402,48294,5219,5490,12478,13639,77281,209071,96838,477049,79320,247729,99115,569874,97337,504070,95618,464633,96082,467111,95216,448670,40458,57736,99200,576563,90440,380682,96709,492180,98088,523528,1,12,74898,182123,14,91,1.451172,1.336914,1.439453,1.408203,1.452148,1.425781,1.461914,1.462891,1.467773,1.466797,1.46875,1.458984,1.462891,1.436523,1.466797,1.460938,1.727539,1.461914,1.597656
4,46,1,24,30,6876.0,3338.0,3536.0,6752.0,121.0,82.0,8408.0,0.799805,0.485596,0.514648,0.011925,0.982422,0.017593,0.397217,0.420898,0.009758,0.803711,0.014397,0.943848,141,6,26,30,5,182,True,False,True,False,False,False,13,25,True,1341062700,1,98,28.546875,21.0,3358520,17.453125,1,4,1.767578,2.0,207911,-0.767578,0,9832,930.5,286.0,109447494,-906.5,20,70,36.03125,30.0,4236690,-6.019531,0,722,170.875,128.0,20096702,-29.859375,1,96,40.21875,43.0,59343,5.792969,1,4,1.785156,2.0,2635,-0.785156,0,9084,1038.0,410.5,1532685,-1014.5,20,70,36.5625,30.0,53980,-6.570312,0,697,145.625,118.5,214976,-4.648438,1,98,23.3125,15.0,203907,22.6875,1,4,1.682617,2.0,14717,-0.682129,0,9666,933.0,25.0,8164825,-909.0,20,70,35.34375,30.0,309350,-5.359375,0,717,204.875,164.0,1792478,-63.875,1,98,32.34375,22.0,6479986,13.648438,1,4,1.775391,2.0,355583,-0.775391,0,9832,1174.0,451.0,235051256,-1150.0,20,70,39.6875,30.0,7949210,-9.6875,0,724,159.0,111.0,31837015,-17.96875,1,98,36.8125,32.0,2241158,9.179688,1,4,1.746094,2.0,106288,-0.746094,0,9687,918.5,396.0,55923420,-894.5,20,70,38.8125,30.0,2361770,-8.796875,0,724,145.5,97.0,8859304,-4.539062,1,46,20.078125,17.5,241,25.921875,1,2,1.583008,2.0,19,-0.583496,6,5228,665.5,210.5,7985,-641.5,30,70,34.15625,30.0,410,-4.167969,0,464,275.25,335.0,3303,-134.25,1,98,27.734375,21.0,5581957,18.265625,1,4,1.769531,2.0,356137,-0.769043,0,9832,984.0,328.0,198126563,-960.0,20,70,36.125,30.0,7274630,-6.136719,0,727,177.125,133.0,35660114,-36.15625,12,12,61981,117618,1461,1476,8196,8749,75385,200278,40291,60872,79320,247729,99115,569874,26848,33895,95618,464633,96082,467111,95216,448670,98170,535707,99200,576563,90440,380682,59056,107820,98088,523528,1,12,77654,201303,16,117,1.464844,1.367188,1.462891,1.510742,1.507812,1.427734,1.463867,1.456055,1.46875,1.467773,1.469727,1.463867,1.463867,1.4375,1.447266,1.462891,1.75,1.469727,1.430664


# LightGBM

In [None]:
# LightGBM hyper-parameters (presumably the result of an earlier tuning run),
# grouped by purpose.
# NOTE(review): LightGBM applies row subsampling (`subsample`) only when
# `subsample_freq` > 0, and none is set here -- confirm bagging was intended.
lgb_best_params = dict(
    # Objective and evaluation metric.
    objective='regression',
    metric='rmse',
    # Boosting budget: n_estimators is an upper bound; the fold logs show
    # early stopping ending training after a few hundred rounds.
    n_estimators=10000,
    learning_rate=0.03758663589227534,
    # Tree shape and leaf constraints.
    max_depth=8,
    num_leaves=75,
    min_child_samples=49,
    min_data_per_group=153,
    # Column / row sampling.
    colsample_bytree=0.5265534343795216,
    subsample=0.2965389310542902,
    # L1 / L2 regularisation.
    reg_alpha=1.1566640015760366e-07,
    reg_lambda=0.06328070355543179,
    # Reproducibility and parallelism.
    random_state=19920803,
    n_jobs=-1,
)

In [None]:
# Build the LightGBM regressor and hand it to the notebook's run_clf helper.
# run_clf returns four values, unpacked here as predictions, per-fold scores,
# a fitted model and out-of-fold predictions (names only -- see run_clf for
# the exact contents).
lgb = LGBMRegressor(**lgb_best_params)
lgb_preds, lgb_scores, lgb_model, lgb_oof = run_clf(
    lgb,
    fit_params,
    train_df_low_mem,
    test_df_low_mem,
    features,
    TARGET_COL,
    folds=N_FOLDS,
)


------------- Fold 1 -------------
Training until validation scores don't improve for 200 rounds.
[500]	training's rmse: 0.783412	valid_1's rmse: 0.809796
Early stopping, best iteration is:
[544]	training's rmse: 0.781548	valid_1's rmse: 0.809768

RMSE for validation set is 0.80977


------------- Fold 2 -------------
Training until validation scores don't improve for 200 rounds.
[500]	training's rmse: 0.784255	valid_1's rmse: 0.801998
Early stopping, best iteration is:
[415]	training's rmse: 0.788171	valid_1's rmse: 0.80194

RMSE for validation set is 0.80194


------------- Fold 3 -------------
Training until validation scores don't improve for 200 rounds.
[500]	training's rmse: 0.783541	valid_1's rmse: 0.809427
Early stopping, best iteration is:
[422]	training's rmse: 0.787157	valid_1's rmse: 0.809351

RMSE for validation set is 0.80935


------------- Fold 4 -------------
Training until validation scores don't improve for 200 rounds.
[500]	training's rmse: 0.783702	valid_1's rmse:

In [None]:
## Feature importance
# get_feature_importance keeps the top_n (default 10) features ranked by the
# model's feature_importances_; show_all displays the table untruncated.
lgb_importance = get_feature_importance(lgb_model, train_df_low_mem, features)
show_all(lgb_importance)

Unnamed: 0,Feature,Importance
307,tE_areacode,1235
290,tE_Local_Authority_(District),1151
291,tE_Local_Authority_(Highway),925
305,tE_postcode,922
212,postcode_Speed_limit_diff,910
288,areacode_cnt,900
9,children,801
202,postcode_1st_Road_Number_max,799
122,Local_Authority_(Highway)_Speed_limit_diff,793
14,child_ratio,785


In [None]:
# Display the LightGBM predictions returned by run_clf together with their count.
lgb_preds, len(lgb_preds)

(array([1.54482548, 1.38576927, 1.50998743, ..., 1.64020465, 1.26543187,
        1.23222905]), 121259)

# XGBoost

In [None]:
# XGBoost hyper-parameters (presumably the result of an earlier tuning run),
# grouped by purpose.
# NOTE(review): `tree_method='gpu_hist'` / `predictor='gpu_predictor'` need a
# CUDA-enabled XGBoost build; XGBoost 2.0+ deprecates them in favour of
# `device='cuda'` with `tree_method='hist'` -- confirm against the installed version.
# NOTE(review): `use_label_encoder` is an XGBClassifier option and is presumably
# ignored by XGBRegressor -- confirm before relying on it.
xgb_best_params = dict(
    # Objective and evaluation metric.
    objective='reg:squarederror',
    eval_metric='rmse',
    # Boosting budget: n_estimators is an upper bound; the fold logs show
    # early stopping on validation_1-rmse.
    n_estimators=10000,
    learning_rate=0.014543061344542332,
    # Tree shape.
    max_depth=5,
    # Column / row sampling.
    colsample_bytree=0.34651527365396184,
    subsample=0.40177414453796234,
    # L1 / L2 regularisation.
    reg_alpha=27.255858264128506,
    reg_lambda=11.554739541654067,
    # Hardware selection.
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    # Reproducibility and miscellaneous flags.
    random_state=19920803,
    use_label_encoder=False,
)

In [None]:
# Build the XGBoost regressor and hand it to the notebook's run_clf helper.
# run_clf returns four values, unpacked here as predictions, per-fold scores,
# a fitted model and out-of-fold predictions (names only -- see run_clf for
# the exact contents).
xgb = XGBRegressor(**xgb_best_params)
xgb_preds, xgb_scores, xgb_model, xgb_oof = run_clf(
    xgb,
    fit_params,
    train_df_low_mem,
    test_df_low_mem,
    features,
    TARGET_COL,
    folds=N_FOLDS,
)


------------- Fold 1 -------------
[0]	validation_0-rmse:1.25159	validation_1-rmse:1.25502
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 200 rounds.
[500]	validation_0-rmse:0.806502	validation_1-rmse:0.810194
[1000]	validation_0-rmse:0.804121	validation_1-rmse:0.810025
[1500]	validation_0-rmse:0.801995	validation_1-rmse:0.809998
Stopping. Best iteration:
[1341]	validation_0-rmse:0.802662	validation_1-rmse:0.809985


RMSE for validation set is 0.8099899888038635


------------- Fold 2 -------------
[0]	validation_0-rmse:1.25307	validation_1-rmse:1.24171
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 200 rounds.
[500]	validation_0-rmse:0.807342	validation_1-rmse:0.802403
[1000]	validation_0-rmse:0.804984	validation_1-rmse:0.802262
[1500]	validation_0-rmse:0.802879	validation_1-rmse:0.802

In [None]:
## Feature Importance
# get_feature_importance keeps the top_n (default 10) features ranked by the
# model's feature_importances_; the bare name below renders the table.
xgb_importance = get_feature_importance(xgb_model, train_df_low_mem, features)
xgb_importance

Unnamed: 0,Feature,Importance
0,Police_Force,0.017455
81,Local_Authority_(District)_1st_Road_Number_min,0.015405
1,Number_of_Vehicles,0.01524
72,Local_Authority_(District)_Police_Force_median,0.013545
131,1st_Road_Class_Police_Force_avg,0.012644
34,Accident_ts_Is_year_start,0.012135
137,1st_Road_Class_Number_of_Vehicles_avg,0.012102
142,1st_Road_Class_1st_Road_Number_max,0.011961
3,Speed_limit,0.011519
160,Road_Type_Police_Force_max,0.01126


In [None]:
# Display the XGBoost predictions returned by run_clf together with their count.
xgb_preds, len(xgb_preds)

(array([1.58761175, 1.36721091, 1.54403672, ..., 1.58520295, 1.26018266,
        1.26013513]), 121259)

## Submission

In [None]:
# Blend the two models' predictions with fixed weights
# (0.6 for LightGBM, 0.4 for XGBoost) and store them per accident row.
weighted_lgb = 0.6 * lgb_preds
weighted_xgb = 0.4 * xgb_preds
df_test['preds'] = weighted_lgb + weighted_xgb

In [None]:
# Collapse accident-level predictions to one row per postcode:
#   preds       -> sum of blended predictions in the postcode
#   Accident_ID -> number of accidents in the postcode
# Their ratio (the mean prediction per postcode) is the Accident_risk_index.
per_postcode = df_test.groupby(['postcode']).agg(
    preds=('preds', 'sum'),
    Accident_ID=('Accident_ID', 'count'),
)
sub_df = per_postcode.reset_index()
sub_df['Accident_risk_index'] = sub_df['preds'].div(sub_df['Accident_ID'])
sub_df

Unnamed: 0,postcode,preds,Accident_ID,Accident_risk_index
0,AB10 1AU,1.339561,1,1.339561
1,AB10 1PG,2.751288,2,1.375644
2,AB10 1TT,1.529009,1,1.529009
3,AB10 1YP,4.335528,3,1.445176
4,AB10 6LQ,1.320272,1,1.320272
...,...,...,...,...
49767,ZE2 9LZ,1.333191,1,1.333191
49768,ZE2 9RE,5.427805,4,1.356951
49769,ZE2 9RJ,1.233127,1,1.233127
49770,ZE2 9SB,1.522740,1,1.522740


In [None]:
# Name the submission file after the model blend and the current local time
# (month, day, year, then hour and minute).
MODEL_NAME = 'LGB_XGB'
FILE_TIMESTAMP = format(datetime.now(), '%m%d%Y_%H%M')
OUTPUT_FILE_NAME = f"sub_{MODEL_NAME}_{FILE_TIMESTAMP}.csv"

In [None]:
# Keep only the postcode key and its risk index for the submission.
submission_columns = ['postcode', 'Accident_risk_index']
final_sub = sub_df[submission_columns]
final_sub

Unnamed: 0,postcode,Accident_risk_index
0,AB10 1AU,1.339561
1,AB10 1PG,1.375644
2,AB10 1TT,1.529009
3,AB10 1YP,1.445176
4,AB10 6LQ,1.320272
...,...,...
49767,ZE2 9LZ,1.333191
49768,ZE2 9RE,1.356951
49769,ZE2 9RJ,1.233127
49770,ZE2 9SB,1.522740


In [None]:
# Write the submission to OUTPUT_FILE_NAME as CSV, omitting the DataFrame index.
final_sub.to_csv(OUTPUT_FILE_NAME, index=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>