In [None]:
import os
# List the entries of the parent directory ("../") to see which files/folders are there
print(os.listdir("../"))

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the accidents dataset from its CSV file into a DataFrame
accidents_csv_path = "/kaggle/input/dataset0/accidents.csv"
accidents = pd.read_csv(accidents_csv_path)

# Preview the first rows to get a feel for the columns and values
accidents.head()

In [None]:
# Print pandas' summary of the DataFrame (see `DataFrame.info`) to inspect columns and dtypes
accidents.info()


In [None]:
# Dimensions of the loaded data as a (number of rows, number of columns) tuple
accidents.shape

In [None]:
# Count the missing ('NaN') entries in every column
accidents.isna().sum()

In [None]:
# Option 1: drop every row that has at least one missing value
accidents_cleaned = accidents.dropna(axis=0, how='any')

In [None]:
# Option 2: impute missing values — here each numeric column's gaps are filled
# with that column's mean; non-numeric columns are left untouched.
numeric_columns = accidents.select_dtypes(include=['float64', 'int64']).columns
column_means = accidents[numeric_columns].mean()
# `fillna` with a Series fills each column named in the Series index with the
# matching value and returns a new DataFrame, so `accidents` itself is unchanged.
accidents_imputed = accidents.fillna(column_means)


In [None]:
# Option 3: impute missing values with scikit-learn.
# NOTE: SimpleImputer(strategy='mean') is still plain mean imputation of the
# numeric columns; it is not a predictive-modeling technique.
from sklearn.impute import SimpleImputer

accidents_imputed_sklearn = accidents.copy()
imputer = SimpleImputer(strategy='mean')
numeric_values_filled = imputer.fit_transform(accidents[numeric_columns])
accidents_imputed_sklearn[numeric_columns] = numeric_values_filled

In [None]:
# Persist each missing-value variant to its own CSV file (row index omitted)
cleaned_outputs = {
    "accidents_cleaned.csv": accidents_cleaned,
    "accidents_imputed.csv": accidents_imputed,
    "accidents_imputed_sklearn.csv": accidents_imputed_sklearn,
}
for output_name, output_frame in cleaned_outputs.items():
    output_frame.to_csv(output_name, index=False)

In [None]:
# Print the column labels currently present in `accidents`
print("Original columns:\n", accidents.columns)


In [None]:
# Columns to retain for the analysis (edit this list to match your requirements)
columns_to_keep = [
    'Accident_Index',
    'Location_Easting_OSGR',
    'Location_Northing_OSGR',
    'Longitude',
    'Latitude',
    'Police_Force',
    'Accident_Severity',
    'Number_of_Vehicles',
    'Number_of_Casualties',
    'Date',
    'Day_of_Week',
    'Time',
    'Local_Authority_(District)',
    'Local_Authority_(Highway)',
    '1st_Road_Class',
    '1st_Road_Number',
    'Road_Type',
    'Speed_limit',
    'Junction_Detail',
    'Junction_Control',
    '2nd_Road_Class',
    '2nd_Road_Number',
    'Pedestrian_Crossing-Human_Control',
    'Pedestrian_Crossing-Physical_Facilities',
    'Light_Conditions',
    'Weather_Conditions',
    'Road_Surface_Conditions',
    'Special_Conditions_at_Site',
    'Carriageway_Hazards',
    'Urban_or_Rural_Area',
    'Did_Police_Officer_Attend_Scene_of_Accident',
    'LSOA_of_Accident_Location',
]

In [None]:
# Keep only the selected columns.
# Previously this cell only printed the columns and never applied
# `columns_to_keep`, so every later step still worked on the unfiltered data.
# Names in `columns_to_keep` that are absent from the data are reported and
# skipped, so a typo in the list does not abort the notebook at this point.
missing_columns = [name for name in columns_to_keep if name not in accidents.columns]
if missing_columns:
    print("Columns listed in columns_to_keep but not found in the dataset:", missing_columns)
present_columns = [name for name in columns_to_keep if name in accidents.columns]
# .copy() makes the result an independent DataFrame, so adding columns to
# `accidents` later does not trigger pandas' SettingWithCopyWarning.
accidents = accidents[present_columns].copy()
print("Column names in the dataset:\n", accidents.columns)

In [None]:
# Check the columns after the selection step: print the column labels of `accidents`
print("Selected columns:\n", accidents.columns)


In [None]:
# Save the current `accidents` DataFrame to a new CSV file.
# index=False leaves the row index out of the written file.
accidents.to_csv("accidents_columns.csv", index=False)

In [None]:

# Report the (rows, columns) shape of `accidents` before duplicate removal,
# as a baseline for the row count after de-duplication
print("Shape of the dataset before removing duplicates:", accidents.shape)


In [None]:
# Remove duplicates: flag rows that repeat an earlier row in full (the first
# occurrence is not flagged) and keep only the unflagged rows
is_repeated_row = accidents.duplicated()
accidents_no_duplicates = accidents[~is_repeated_row]

In [None]:
# Report the (rows, columns) shape of the de-duplicated DataFrame
print("Shape of the dataset after removing duplicates:", accidents_no_duplicates.shape)

In [None]:

# Optionally save the de-duplicated dataset to a new CSV file.
# index=False leaves the row index out of the written file.
accidents_no_duplicates.to_csv("accidents_no_duplicates.csv", index=False)

In [None]:
import cufflinks as cf
import plotly.offline

In [None]:
# Put Cufflinks/Plotly into offline mode before plotting
cf.go_offline()

# Box plot showing the distribution of one numeric variable,
# here the 'Number_of_Vehicles' column
vehicles_per_accident = accidents['Number_of_Vehicles']
vehicles_per_accident.iplot(kind='box', title='Box Plot of Number of Vehicles')


In [None]:
# Outlier thresholds for 'Number_of_Vehicles' using the Interquartile Range (IQR)
# method: values more than 1.5 * IQR below Q1 or above Q3 count as outliers.
vehicle_counts = accidents['Number_of_Vehicles']
Q1, Q3 = vehicle_counts.quantile([0.25, 0.75])  # 25th and 75th percentiles in one call
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

In [None]:
# Remove outliers: keep only rows whose vehicle count lies within
# [lower_bound, upper_bound], both ends inclusive
within_bounds = accidents['Number_of_Vehicles'].between(lower_bound, upper_bound)
accidents_no_outliers = accidents[within_bounds]

In [None]:
# Report how many rows and columns remain after the outlier filter
remaining_shape = accidents_no_outliers.shape
print("Shape of the dataset after removing outliers:", remaining_shape)

# Optional check: redraw the box plot on the filtered data to verify the result
vehicles_no_outliers = accidents_no_outliers['Number_of_Vehicles']
vehicles_no_outliers.iplot(kind='box', title='Box Plot of Number of Vehicles (Without Outliers)')

## Section break — accident severity categories

In [None]:
# Translate a numeric accident-severity code into a readable label
def category(accident_severity):
    """Return the text label for an accident-severity code.

    1 -> 'Fatal', 2 -> 'Serious', 3 -> 'Slight'; any other value -> 'Unknown'.
    """
    severity_labels = ((1, 'Fatal'), (2, 'Serious'), (3, 'Slight'))
    # Compare against each known code in order; the first equal code wins
    for code, label in severity_labels:
        if accident_severity == code:
            return label
    return 'Unknown'

In [None]:
# Label every accident with its severity category in a new 'Accident_Category' column
severity_codes = accidents['Accident_Severity']
accidents['Accident_Category'] = severity_codes.map(category)