In [2]:
import pandas as pd

# Load the raw traffic-violation records.
df = pd.read_csv("Traffic_Violations.csv")

# Standardize column names: trim surrounding whitespace, lower-case,
# and replace spaces with underscores so later cells can use snake_case keys.
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(" ", "_")

In [3]:
# Parse the stop date; unparseable values become NaT.
df['date_of_stop'] = pd.to_datetime(df['date_of_stop'], errors='coerce')

# Parse the stop time once and keep the full timestamps around: both the
# time-of-day column and the hour feature are derived from them.
parsed_time = pd.to_datetime(df['time_of_stop'], format='%H:%M:%S', errors='coerce')
df['time_of_stop'] = parsed_time.dt.time

# Extract the features for forecasting.
# The hour is taken from the parsed timestamps rather than by re-parsing
# 'time_of_stop': that column now holds datetime.time objects, which
# pd.to_datetime cannot convert, so with errors='coerce' every value would
# turn into NaT and 'hour' would be entirely NaN.
df['hour'] = parsed_time.dt.hour
df['day_of_week'] = df['date_of_stop'].dt.day_name()
df['month'] = df['date_of_stop'].dt.month

# Make both coordinate columns numeric for mapping; values that cannot be
# parsed as numbers are coerced to NaN.
for coord_col in ('latitude', 'longitude'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')

# Flag columns stored as Yes/No or True/False text that should become booleans.
bool_cols = [
    'accident', 'belts', 'personal_injury', 'property_damage', 'fatal',
    'commercial_license', 'hazmat', 'commercial_vehicle', 'alcohol', 'work_zone',
    'search_conducted', 'search_person', 'search_vehicle', 'contraband_found',
    'attributed_to_accident',
]

# Lower-cased text -> boolean. Any value not listed here maps to NaN.
text_to_bool = {'yes': True, 'no': False, 'true': True, 'false': False}

for col in bool_cols:
    # Skip flags that are absent from this dataset.
    if col not in df.columns:
        continue
    lowered_text = df[col].astype(str).str.lower()
    df[col] = lowered_text.map(text_to_bool)


# Categorical columns whose missing entries are replaced with the
# placeholder 'Unknown'.
fill_cols = [
    'gender', 'race', 'driver_city', 'driver_state',
    'vehicle_type', 'make', 'model', 'arrest_type',
]

for col in fill_cols:
    # Skip columns that are absent from this dataset.
    if col not in df.columns:
        continue
    df[col] = df[col].fillna('Unknown')

# Remove rows with a missing value in any essential column, then remove
# exact duplicate rows. Both operations modify `df` in place.
essential_cols = ['date_of_stop', 'latitude', 'longitude']
df.dropna(subset=essential_cols, inplace=True)
df.drop_duplicates(inplace=True)


In [4]:

# Write the cleaned frame to disk (without the index column) and report
# the file name that was used.
output_path = "cleaned_traffic_violations.csv"
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved as '{output_path}'")

Cleaned dataset saved as 'cleaned_traffic_violations.csv'
