# Initial EDA

## Imports

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Path things
from pathlib import Path

data_dir = Path("../data")
raw_dir = data_dir / "raw"
processed_dir = data_dir / "processed"
extension_dir = data_dir / "extension"

## Load Data

In [3]:
train = pd.read_csv(raw_dir / "train.csv")
test = pd.read_csv(raw_dir / "test.csv")
sample_submission = pd.read_csv(raw_dir / "sample_submission.csv")

ext_data_2 = pd.read_csv(extension_dir / "synthetic_road_accidents_2k.csv")
ext_data_10 = pd.read_csv(extension_dir / "synthetic_road_accidents_10k.csv")
ext_data_100 = pd.read_csv(extension_dir / "synthetic_road_accidents_100k.csv")

In [19]:
train.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [18]:
ext_data_100.head()

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,rural,2,0.29,70,night,rainy,False,True,evening,False,False,1,0.64
1,highway,1,0.34,25,dim,clear,False,False,morning,False,False,3,0.27
2,rural,2,0.76,70,night,foggy,True,False,evening,True,True,1,0.76
3,rural,3,0.37,70,night,foggy,True,False,morning,False,True,0,0.6
4,highway,3,0.39,45,dim,rainy,False,True,morning,False,False,0,0.17


## EDA

In [20]:
# Assert the columns are the same in the competition data and the 'extended' data
train_cols = train.columns
ext_data_cols = ext_data_100.columns

set(train_cols) - set(ext_data_cols), set(ext_data_cols) - set(train_cols)

({'id'}, set())

They have the same columns. The only difference is that the training data from the competition has an ID column. 
We can simply drop this column.

In [37]:
# Check if there is duplicate data among the training set (considering ID and without ID)
print(f"There are {train[train.duplicated(subset=['id'])].shape[0]} duplicated ids in the training set out of {train.shape[0]} rows.")
print(f"There are {train[train.drop(columns=['id']).duplicated()].shape[0]} duplicated rows in the training set (not considering IDs) out of {train.shape[0]} rows.")

# Check for feature duplicates with different target values
print(f"There are {train[train.drop(columns=['id', 'accident_risk']).duplicated()].shape[0]} duplicated rows in the training set (not considering IDs nor target) out of {train.shape[0]} rows.")

There are 0 duplicated ids in the training set out of 517754 rows.
There are 656 duplicated rows in the training set (not considering IDs) out of 517754 rows.
There are 10774 duplicated rows in the training set (not considering IDs nor target) out of 517754 rows.


In [30]:
# Check duplicates in test set
print(f"There are {test[test.duplicated(subset=['id'])].shape[0]} duplicated ids in the test set out of {test.shape[0]} rows.")
print(f"There are {test[test.drop(columns=['id']).duplicated()].shape[0]} duplicated rows in the test set (not considering IDs) out of {test.shape[0]} rows.")

There are 0 duplicated ids in the test set out of 172585 rows.
There are 1203 duplicated rows in the test set (not considering IDs) out of 172585 rows.


In [34]:
# Check for duplicated data between train and test sets (with and without considering IDs)
print(f"There are {train[train['id'].isin(test['id'])].shape[0]} rows in the training set with IDs that are also in the test set.")
print(f"There are {train[train.drop(columns=['id', 'accident_risk']).isin(test.drop(columns=['id'])).all(axis=1)].shape[0]} rows in the training set that are also in the test set (not considering IDs).")

There are 0 rows in the training set with IDs that are also in the test set.
There are 0 rows in the training set that are also in the test set (not considering IDs).
