<a href="https://colab.research.google.com/github/Mennakurdi/Data-Engineering-Project/blob/colab-Habiba/Persons_Dataset_Cleaning_%26_Crash_Person_Integration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Integration & Persons Analysis

Research questions addressed in this notebook:

3. How does the distribution of crashes vary by victim type (PEDESTRIAN vs CYCLIST vs MOTORIST) across boroughs?
4. Is there a relationship between the age groups of people involved in crashes and the severity of injuries (injured vs killed)?

In [None]:
# --- Distribution of crashes by Borough ---
# Bar chart of how many rows carry each `borough` value; dropna=False keeps
# missing boroughs as their own bar instead of silently dropping them.
# NOTE(review): in the visible part of this notebook `df_selected` is first
# assigned in a later cell, from person-level columns that do not include
# `borough` — confirm which frame this cell is meant to read.
df_selected['borough'].value_counts(dropna=False).plot(kind='bar', figsize=(8,4), title='Crashes per Borough')



In [None]:
# --- Top contributing factors ---
# Horizontal bar chart of the 10 most frequent values of
# `contributing_factor_vehicle_1` (value_counts sorts by frequency, head(10) keeps the top ten).
# NOTE(review): `contributing_factor_vehicle_1` is not among the columns that the
# later `df_selected` assignment keeps — confirm the intended source frame.
df_selected['contributing_factor_vehicle_1'].value_counts().head(10).plot(kind='barh', title='Top Contributing Factors')

Insights:

Brooklyn and Queens have the most reported crashes, suggesting higher traffic volume or density.

“Driver inattention/distraction” and “Failure to yield right-of-way” are the leading causes.

Borough-wise differences hint at potential focus areas for traffic safety policies.

In [None]:
#  Step 1: import libraries
import pandas as pd
import matplotlib.pyplot as plt

#  Step 2: load persons dataset
# The Socrata endpoint returns only 1,000 rows per request unless `$limit` is
# passed, so `nrows=50000` on its own can never produce more than 1,000 records.
# The sample size is therefore requested from the API; `nrows` stays as a
# client-side cap of the same size.
url_persons = "https://data.cityofnewyork.us/resource/f55k-p6yu.csv?$limit=50000"
df_persons = pd.read_csv(url_persons, nrows=50000)   # load sample for preview

#  Step 3: preview
print("Shape:", df_persons.shape)
df_persons.head()


In [None]:
#  Step 4: inspect columns and data types
# info() prints every column with its dtype and non-null count, which also
# shows at a glance where values are missing.
df_persons.info()


In [None]:
#  Step 5: keep only the person attributes analysed in the following cells
important_cols = [
    'person_type',
    'person_age',
    'person_sex',
    'person_injury',
]
df_selected = df_persons.loc[:, important_cols]

# Preview
df_selected.head()


In [None]:
#  Step 6: count gender distribution
# dropna=False keeps records with a missing sex as their own category
gender_counts = df_selected['person_sex'].value_counts(dropna=False)

#  Step 7: visualize
plt.figure(figsize=(6,4))
ax = gender_counts.plot.bar(color='plum', edgecolor='black')

ax.set_title('Gender Distribution of People in Crashes', fontsize=13)
ax.set_xlabel('Gender (M=Male, F=Female, U=Unknown, NaN=Missing)')
ax.set_ylabel('Number of Records')
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

# Show the raw counts underneath the chart
gender_counts


In [None]:
#  Step 8: plot age distribution
# Only records with a recorded age go into the histogram
known_ages = df_selected['person_age'].dropna()

plt.figure(figsize=(8,5))
ax = known_ages.plot.hist(bins=20, color='lightblue', edgecolor='black')

ax.set_title('Age Distribution of People Involved in Crashes', fontsize=13)
ax.set_xlabel('Age')
ax.set_ylabel('Number of People')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
#  Step 9: count injury types (missing values are excluded by default)
injury_counts = df_selected['person_injury'].value_counts()

#  Step 10: visualize
plt.figure(figsize=(8,5))
ax = injury_counts.plot.bar(color='lightgreen', edgecolor='black')

ax.set_title('Types of Injuries in Crashes', fontsize=13)
ax.set_xlabel('Injury Type')
ax.set_ylabel('Number of People')
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

# Show the raw counts underneath the chart
injury_counts


In [None]:
import matplotlib.pyplot as plt

# Each chart is guarded by a column check so the cell still runs when the
# loaded sample lacks one of the fields.

# --- Person type distribution ---
if 'person_type' in df_persons.columns:
    type_counts = df_persons['person_type'].value_counts()
    ax = type_counts.plot.bar(
        figsize=(8,4),
        color='skyblue',
        edgecolor='black',
        title='Person Type Distribution',
    )
    ax.set_xlabel('Person Type')
    ax.set_ylabel('Count')
    plt.show()

# --- Gender analysis ---
if 'person_sex' in df_persons.columns:
    sex_counts = df_persons['person_sex'].value_counts()
    ax = sex_counts.plot.pie(
        autopct='%1.1f%%',
        figsize=(5,5),
        startangle=90,
        colors=['lightcoral', 'lightblue', 'lightgray'],
    )
    ax.set_title('Gender Distribution in Crashes')
    ax.set_ylabel('')   # drop the default series-name label next to the pie
    plt.show()

# --- Age distribution ---
if 'person_age' in df_persons.columns:
    recorded_ages = df_persons['person_age'].dropna()
    ax = recorded_ages.plot.hist(
        bins=20,
        color='orange',
        edgecolor='black',
        figsize=(8,4),
    )
    ax.set_title('Age Distribution of People Involved in Crashes')
    ax.set_xlabel('Age')
    ax.set_ylabel('Number of People')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


In [None]:
#  Step 1: import libraries
import pandas as pd

#  Step 2: load cleaned versions or raw samples
# Socrata serves only 1,000 rows per request unless `$limit` is given, so the
# sample size is part of the URL; `nrows` remains as a client-side cap of the
# same size.
crash = pd.read_csv('https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=50000', nrows=50000)
persons = pd.read_csv('https://data.cityofnewyork.us/resource/f55k-p6yu.csv?$limit=50000', nrows=50000)

print("Crashes:", crash.shape)
print("Persons:", persons.shape)




## 2. Pre-Integration Cleaning

In this step, we clean both samples **before merging**:

For the **crash** dataset:
- Convert `crash_date` to datetime for later time-based analysis.
- Standardize all text columns (strip spaces, uppercase, treat `"NAN"` strings as missing).
- Remove duplicate collisions using `collision_id`.
- Clean `borough`:
  - Treat inconsistent values like `''`, `NONE`, `NULL`, `N/A`, `UNK`, `UNSPECIFIED` as missing.
  - Fill remaining missing boroughs with `"UNKNOWN"` (instead of dropping rows).
- Ensure injury counts are numeric and replace missing/invalid with 0.

For the **persons** dataset:
- Convert `crash_date` to datetime (to match crash table if we need it).
- Standardize text columns similarly (strip + uppercase, `"NAN"` → NaN).

We **do not** yet drop person-level columns; this will be done after merging.



In [None]:
import pandas as pd
import numpy as np

# `crash` and `persons` are the raw samples loaded in the previous cell
print("Crashes:", crash.shape)
print("Persons:", persons.shape)

# 1️⃣ convert dates (values that cannot be parsed become NaT instead of raising)
crash['crash_date'] = pd.to_datetime(crash['crash_date'], errors='coerce')
persons['crash_date'] = pd.to_datetime(persons['crash_date'], errors='coerce')

# 2️⃣ standardize text columns (strip + uppercase)
# astype(str) turns a real NaN into the string 'nan'; after upper-casing it is
# 'NAN', which the final replace maps back to a proper missing value.
for frame in (crash, persons):
    for c in frame.select_dtypes('object').columns:
        frame[c] = frame[c].astype(str).str.strip().str.upper().replace('NAN', np.nan)

# 3️⃣ remove duplicates (one row per collision_id, first occurrence kept)
before = len(crash)
crash.drop_duplicates(subset=['collision_id'], inplace=True)
print("Removed", before-len(crash), "duplicate crash rows")

# 4️⃣ clean borough: placeholder strings count as missing, then every missing
# value is labelled 'UNKNOWN' so the rows are kept rather than dropped
crash['borough'] = crash['borough'].replace(
    ['', 'NONE', 'NULL', 'N/A', 'UNK', 'UNSPECIFIED'],
    np.nan
).fillna('UNKNOWN')

# 5️⃣ make injury counts numeric; missing or unparseable entries become 0.
# fillna alone would leave a column of strings untouched (step 2 stringifies
# any count column that was read as object), so coerce to numbers first.
for c in ['number_of_persons_injured','number_of_persons_killed']:
    if c in crash.columns:
        crash[c] = pd.to_numeric(crash[c], errors='coerce').fillna(0)

crash.info()

In [None]:
# 3.x Outlier detection using IQR for key numerical columns

import numpy as np

# Keep only those count columns that are actually present in the crash sample
numeric_cols = [
    col
    for col in (
        'number_of_persons_injured', 'number_of_persons_killed',
        'number_of_pedestrians_injured', 'number_of_pedestrians_killed',
        'number_of_cyclist_injured', 'number_of_cyclist_killed',
        'number_of_motorist_injured', 'number_of_motorist_killed',
    )
    if col in crash.columns
]

print("Numeric columns considered for outlier detection:", numeric_cols)

def iqr_outlier_flags(series):
    """Flag values lying outside the 1.5 * IQR fences of a numeric Series.

    Returns a tuple ``(flags, (lower, upper))`` where ``flags`` is a boolean
    Series (True = value is below ``lower`` or above ``upper``) and the two
    bounds are the fences ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    """
    first_q, third_q = series.quantile([0.25, 0.75])
    spread = third_q - first_q
    fences = (first_q - 1.5 * spread, third_q + 1.5 * spread)
    outside = series.lt(fences[0]) | series.gt(fences[1])
    return outside, fences

# Per-column IQR fences and the number of values falling outside them
outlier_summary = {}

for col in numeric_cols:
    # Coerce to numbers and ignore entries that are missing or not parseable
    values = pd.to_numeric(crash[col], errors='coerce').dropna()
    flags, bounds = iqr_outlier_flags(values)
    outlier_summary[col] = dict(
        lower_bound=float(bounds[0]),
        upper_bound=float(bounds[1]),
        outlier_count=int(flags.sum()),
    )

outlier_summary


### Outlier Detection and Treatment

We checked numerical and coordinate columns for potential outliers that could distort the analysis results.
The focus was on:

- `number_of_persons_injured`
- `number_of_persons_killed`
- `latitude`, `longitude`
- `person_age`

We applied simple domain-based rules together with the Interquartile Range (IQR) method. No extreme or unrealistic values (e.g., injuries > 20 or ages > 120) were found.
All remaining values fall within the expected domain ranges, so no outlier removal was necessary.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Check for numeric outliers in crash dataset ---
num_cols = ['number_of_persons_injured','number_of_persons_killed']
for c in num_cols:
    if c not in crash.columns:
        continue
    plt.figure(figsize=(6,3))
    ax = sns.boxplot(x=crash[c])
    ax.set_title(f'Boxplot of {c}')
    plt.show()
    print(crash[c].describe(), "\n")

# --- Check for outliers in person_age (if present) ---
if 'person_age' in persons.columns:
    plt.figure(figsize=(6,3))
    ax = sns.boxplot(x=persons['person_age'].dropna())
    ax.set_title('Boxplot of person_age')
    plt.show()
    print(persons['person_age'].describe())

# --- Quick coordinate sanity check ---
# Latitude must lie within [-90, 90] and longitude within [-180, 180];
# missing coordinates compare as False and are therefore not counted here.
if {'latitude', 'longitude'} <= set(crash.columns):
    invalid_lat = crash[crash['latitude'].abs() > 90]
    invalid_lon = crash[crash['longitude'].abs() > 180]
    print(f"Invalid latitude rows: {len(invalid_lat)}, invalid longitude rows: {len(invalid_lon)}")

# Build merged_df in this cell so the bounding-box check below can use it.
# (The merge originally lived in cell 'nv_E4VT3cFD3'.)
# Left join: every crash row is kept, repeated once per matching person record.
merged_df = crash.merge(
    persons,
    how='left',
    on='collision_id',
    suffixes=('_crash','_person'),
)

# NYC valid bounding box check
# A row is invalid unless BOTH coordinates fall inside the box. between() is
# False for missing values, so rows without coordinates are counted as well.
lat_in_nyc = merged_df['latitude'].between(40.49, 40.92)
lon_in_nyc = merged_df['longitude'].between(-74.27, -73.68)
invalid_coords = merged_df[~(lat_in_nyc & lon_in_nyc)]

print("Invalid coordinate rows:", len(invalid_coords))


In [None]:
# Attach person-level records to their crash. The left join keeps every crash
# row; columns present in both tables get the _crash / _person suffixes.
merge_options = dict(
    on='collision_id',
    how='left',
    suffixes=('_crash','_person'),
)
merged_df = crash.merge(persons, **merge_options)

print("Merged shape →", merged_df.shape)
merged_df.head()


- Explored and cleaned the persons dataset (gender and age distributions).
- Helped define the injury-type logic (pedestrian/cyclist/motorist) used later in the dashboard.
- Assisted with missing-value handling and encoding of categorical features.
