In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the traffic-accident records from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/traffic-accidents/traffic_accidents.csv')

# Preview: the leading records, rendered with the notebook's rich display.
print("First 5 rows of the dataset:")
display(df.head())

# Structure: column dtypes and non-null counts (info() writes to stdout itself).
print("\nDataset Information:")
df.info()

# Null audit: count nulls per column, then show only columns that have any.
# An empty Series here means no column contains a null.
print("\nMissing Values:")
missing_values = df.isna().sum()
display(missing_values.loc[missing_values > 0])

# Summary statistics for the numeric columns (describe()'s default selection).
print("\nDescriptive Statistics (Numerical Columns):")
display(df.describe())

# Summary statistics (count / unique / top / freq) for the text-like columns.
print("\nDescriptive Statistics (Categorical Columns):")
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
display(df[categorical_columns].describe())

# Fully duplicated rows: duplicated() marks every repeat after the first
# occurrence, so the sum is the number of redundant rows.
print("\nNumber of Duplicate Rows:")
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

# Overall dimensions as (rows, columns).
print("\nDataset Shape (Rows, Columns):")
print(df.shape)

# Column labels as a plain Python list.
print("\nColumn Names:")
print(list(df.columns))

# Cardinality of every column, computed in one pass and reported line by line
# (nulls are excluded from the count, which is nunique()'s default).
print("\nUnique Values in Each Column:")
for col, distinct_count in df.nunique().items():
    print(f"{col}: {distinct_count} unique values")



First 5 rows of the dataset:


Unnamed: 0,crash_date,traffic_control_device,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,roadway_surface_cond,road_defect,crash_type,...,most_severe_injury,injuries_total,injuries_fatal,injuries_incapacitating,injuries_non_incapacitating,injuries_reported_not_evident,injuries_no_indication,crash_hour,crash_day_of_week,crash_month
0,07/29/2023 01:00:00 PM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,NO INJURY / DRIVE AWAY,...,NO INDICATION OF INJURY,0.0,0.0,0.0,0.0,0.0,3.0,13,7,7
1,08/13/2023 12:11:00 AM,TRAFFIC SIGNAL,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NO INJURY / DRIVE AWAY,...,NO INDICATION OF INJURY,0.0,0.0,0.0,0.0,0.0,2.0,0,1,8
2,12/09/2021 10:30:00 AM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,REAR END,T-INTERSECTION,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NO INJURY / DRIVE AWAY,...,NO INDICATION OF INJURY,0.0,0.0,0.0,0.0,0.0,3.0,10,5,12
3,08/09/2023 07:55:00 PM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,ANGLE,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,INJURY AND / OR TOW DUE TO CRASH,...,NONINCAPACITATING INJURY,5.0,0.0,0.0,5.0,0.0,0.0,19,4,8
4,08/19/2023 02:55:00 PM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,REAR END,T-INTERSECTION,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,NO INJURY / DRIVE AWAY,...,NO INDICATION OF INJURY,0.0,0.0,0.0,0.0,0.0,3.0,14,7,8



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209306 entries, 0 to 209305
Data columns (total 24 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   crash_date                     209306 non-null  object 
 1   traffic_control_device         209306 non-null  object 
 2   weather_condition              209306 non-null  object 
 3   lighting_condition             209306 non-null  object 
 4   first_crash_type               209306 non-null  object 
 5   trafficway_type                209306 non-null  object 
 6   alignment                      209306 non-null  object 
 7   roadway_surface_cond           209306 non-null  object 
 8   road_defect                    209306 non-null  object 
 9   crash_type                     209306 non-null  object 
 10  intersection_related_i         209306 non-null  object 
 11  damage                         209306 non-null  object 
 12  prim_con

Series([], dtype: int64)


Descriptive Statistics (Numerical Columns):


Unnamed: 0,num_units,injuries_total,injuries_fatal,injuries_incapacitating,injuries_non_incapacitating,injuries_reported_not_evident,injuries_no_indication,crash_hour,crash_day_of_week,crash_month
count,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0,209306.0
mean,2.0633,0.382717,0.001859,0.038102,0.221241,0.121516,2.244002,13.373047,4.144024,6.771822
std,0.396012,0.79972,0.047502,0.233964,0.61496,0.450865,1.241175,5.60383,1.966864,3.427593
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,2.0,9.0,2.0,4.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,2.0,14.0,4.0,7.0
75%,2.0,1.0,0.0,0.0,0.0,0.0,3.0,17.0,6.0,10.0
max,11.0,21.0,3.0,7.0,21.0,15.0,49.0,23.0,7.0,12.0



Descriptive Statistics (Categorical Columns):


Unnamed: 0,crash_date,traffic_control_device,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,roadway_surface_cond,road_defect,crash_type,intersection_related_i,damage,prim_contributory_cause,most_severe_injury
count,209306,209306,209306,209306,209306,209306,209306,209306,209306,209306,209306,209306,209306,209306
unique,189087,19,12,6,18,20,6,7,7,2,2,3,40,5
top,12/29/2020 05:00:00 PM,TRAFFIC SIGNAL,CLEAR,DAYLIGHT,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NO INJURY / DRIVE AWAY,Y,"OVER $1,500",UNABLE TO DETERMINE,NO INDICATION OF INJURY
freq,10,123944,164700,134109,64157,77753,204590,155905,171730,117376,199324,147313,58316,154789



Number of Duplicate Rows:
31

Dataset Shape (Rows, Columns):
(209306, 24)

Column Names:
['crash_date', 'traffic_control_device', 'weather_condition', 'lighting_condition', 'first_crash_type', 'trafficway_type', 'alignment', 'roadway_surface_cond', 'road_defect', 'crash_type', 'intersection_related_i', 'damage', 'prim_contributory_cause', 'num_units', 'most_severe_injury', 'injuries_total', 'injuries_fatal', 'injuries_incapacitating', 'injuries_non_incapacitating', 'injuries_reported_not_evident', 'injuries_no_indication', 'crash_hour', 'crash_day_of_week', 'crash_month']

Unique Values in Each Column:
crash_date: 189087 unique values
traffic_control_device: 19 unique values
weather_condition: 12 unique values
lighting_condition: 6 unique values
first_crash_type: 18 unique values
trafficway_type: 20 unique values
alignment: 6 unique values
roadway_surface_cond: 7 unique values
road_defect: 7 unique values
crash_type: 2 unique values
intersection_related_i: 2 unique values
damage: 3 un