In [2]:
# Week 1 - Traffic Accident Severity Project: Data Loading & Dataset Summary

import pandas as pd


# Read the cleaned and the raw accident CSV files into DataFrames.
# Both paths are relative to the notebook's working directory.

CLEAN_CSV_PATH = '../data/cleaned.csv'
RAW_CSV_PATH = '../data/rta.csv'

clean_df = pd.read_csv(CLEAN_CSV_PATH)
raw_df = pd.read_csv(RAW_CSV_PATH)

# Printed only after both reads above have returned.
print("✅ Datasets Loaded Successfully")


# Show the (rows, columns) tuple of each frame so their sizes can be compared.

clean_shape = clean_df.shape
print("\n🔍 Cleaned Dataset Shape:", clean_shape)
raw_shape = raw_df.shape
print("🔍 Raw Dataset Shape:", raw_shape)


# Print the column labels of each dataset as a plain Python list.

for heading, frame in (
    ("\n🧱 Columns in Cleaned Dataset:\n", clean_df),
    ("\n🧱 Columns in Raw Dataset:\n", raw_df),
):
    print(heading, frame.columns.tolist())


# Preview the first five rows of each dataset.

print("\n🔎 Cleaned Dataset Sample:")
clean_preview = clean_df.head(5)
print(clean_preview)

print("\n🔎 Raw Dataset Sample:")
raw_preview = raw_df.head(5)
print(raw_preview)


# Data Types and Missing Values
#
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own here; wrapping it in print() would append a stray
# "None" line after each report.

print("\n🛠 Cleaned Dataset Info:")
clean_df.info()

print("\n🛠 Raw Dataset Info:")
raw_df.info()

# isnull().sum() gives, per column, the number of entries flagged as missing.
print("\n🧼 Missing Values in Cleaned Dataset:")
print(clean_df.isnull().sum())

print("\n🧼 Missing Values in Raw Dataset:")
print(raw_df.isnull().sum())


# Summary statistics for each dataset; include='all' asks describe() to
# report on every column rather than only the numeric ones.

for heading, frame in (
    ("\n📊 Cleaned Dataset Description:", clean_df),
    ("\n📊 Raw Dataset Description:", raw_df),
):
    print(heading)
    print(frame.describe(include='all'))


# Relative frequency of each value in the 'Accident_severity' column,
# scaled from a fraction to a percentage.

for heading, frame in (
    ("\n🎯 Accident Severity Distribution (Cleaned Dataset):", clean_df),
    ("\n🎯 Accident Severity Distribution (Raw Dataset):", raw_df),
):
    print(heading)
    severity_share = frame['Accident_severity'].value_counts(normalize=True)
    print(severity_share.mul(100))


# Compare Unique Values in Common Columns
#
# The shared columns are collected in the cleaned dataset's column order.
# A set intersection would yield an arbitrary order (string hashing varies
# between interpreter sessions), which would shuffle this report on every
# kernel restart and make the output non-reproducible.

common_columns = [col for col in clean_df.columns if col in raw_df.columns]
print(f"\n🔁 Common Columns ({len(common_columns)}):\n", common_columns)

# For each shared column, show how many distinct values each dataset holds.
for col in common_columns:
    print(f"\n🔹 Column: {col}")
    print("  Unique in Cleaned:", clean_df[col].nunique())
    print("  Unique in Raw:   ", raw_df[col].nunique())


✅ Datasets Loaded Successfully

🔍 Cleaned Dataset Shape: (12316, 15)
🔍 Raw Dataset Shape: (12316, 32)

🧱 Columns in Cleaned Dataset:
 ['Age_band_of_driver', 'Sex_of_driver', 'Educational_level', 'Vehicle_driver_relation', 'Driving_experience', 'Lanes_or_Medians', 'Types_of_Junction', 'Road_surface_type', 'Light_conditions', 'Weather_conditions', 'Type_of_collision', 'Vehicle_movement', 'Pedestrian_movement', 'Cause_of_accident', 'Accident_severity']

🧱 Columns in Raw Dataset:
 ['Time', 'Day_of_week', 'Age_band_of_driver', 'Sex_of_driver', 'Educational_level', 'Vehicle_driver_relation', 'Driving_experience', 'Type_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle', 'Defect_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians', 'Road_allignment', 'Types_of_Junction', 'Road_surface_type', 'Road_surface_conditions', 'Light_conditions', 'Weather_conditions', 'Type_of_collision', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Vehicle_movement', 'Casualty_class', 'Sex_of_c