The Base data cleaning

In [1]:
import pandas as pd

df = pd.read_csv("Traffic_Violations.csv")

# Standardizeing column name
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

In [2]:
# Converting date and time
df['date_of_stop'] = pd.to_datetime(df['date_of_stop'], errors='coerce')
df['time_of_stop'] = pd.to_datetime(df['time_of_stop'], format='%H:%M:%S', errors='coerce').dt.time

# Extracting the features for forecasting
df['hour'] = pd.to_datetime(df['time_of_stop'], errors='coerce').dt.hour
df['day_of_week'] = df['date_of_stop'].dt.day_name()
df['month'] = df['date_of_stop'].dt.month

# Converting coordinates to numeric for mapping
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# Convert the Yes/No or True/False columns to Boolean
bool_cols = ['accident', 'belts', 'personal_injury', 'property_damage', 'fatal',
             'commercial_license', 'hazmat', 'commercial_vehicle', 'alcohol', 'work_zone',
             'search_conducted', 'search_person', 'search_vehicle', 'contraband_found',
             'attributed_to_accident']
for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False})


# Filling in missing categorical values
fill_cols = ['gender', 'race', 'driver_city', 'driver_state', 'vehicle_type', 'make', 'model', 'arrest_type']
for col in fill_cols:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')

# Dropping invalid or missing essential values
df.dropna(subset=['date_of_stop', 'latitude', 'longitude'], inplace=True)
df.drop_duplicates(inplace=True)


In [3]:

df.to_csv("cleaned_traffic_violations.csv", index=False)
print("Cleaned dataset saved as 'cleaned_traffic_violations.csv'")

Cleaned dataset saved as 'cleaned_traffic_violations.csv'


Cleaning the dataset to make it specific for this feature

Reading the data from the csv and checking to make sure that I am accessing the dataset and identifying the current data types and removing any attributes I do not need.

In [3]:
#I only want to do this section once
import pandas as pd
#Reading from the base cleaned csv & loading the dataframe
data = pd.read_csv("cleaned_traffic_violations.csv")

#Testing if it read from the csv
print(data.head(15)) 
print('\n')

#Before 

#Identifying the current data types
print("Exploring the Data")
print(data.info())
print('\n')

                                   seqid date_of_stop time_of_stop agency  \
0   52282e8c-f2e1-4bb5-8509-2d5e4f8da8ca   2023-05-01     23:11:00    MCP   
1   9be35886-e00c-49c2-8f27-2f6307696a17   2023-11-25     00:20:00    MCP   
2   9be35886-e00c-49c2-8f27-2f6307696a17   2023-11-25     00:20:00    MCP   
3   4d37fa99-0df3-4a56-9ba6-692bce894a34   2023-11-26     09:16:00    MCP   
4   3a723e9a-5dc0-4bc3-9bd9-4555d6ce0e49   2023-11-25     05:45:00    MCP   
5   3a723e9a-5dc0-4bc3-9bd9-4555d6ce0e49   2023-11-25     05:45:00    MCP   
6   66273a8e-980e-413e-8928-56447e3be407   2023-11-25     01:21:00    MCP   
7   78cdf309-9fe8-46de-892e-19e9b1bafacd   2023-11-25     03:22:00    MCP   
8   78cdf309-9fe8-46de-892e-19e9b1bafacd   2023-11-25     03:22:00    MCP   
9   1e24ec33-6a13-483f-9fea-0c2cdc5b2a14   2023-11-24     23:25:00    MCP   
10  1e24ec33-6a13-483f-9fea-0c2cdc5b2a14   2023-11-24     23:25:00    MCP   
11  1e24ec33-6a13-483f-9fea-0c2cdc5b2a14   2023-11-24     23:25:00    MCP   

In [5]:
#for this feature I only require these attributes [latitude, longitude, accident, belts, personal_injury, property_damage, fatal, hazmat, alcohol]
dropping_these_attributes = ["seqid", "date_of_stop", "time_of_stop", "agency", "subagency", "description", "location", "commercial_license", "commercial_vehicle", "work_zone", "search_conducted", "search_disposition", "search_outcome", "search_reason", "search_reason_for_stop", "search_type", "search_arrest_reason", "state", "vehicletype", "year", "make", "model", "color", "violation_type", "charge", "article", "contributed_to_accident", "race", "gender", "driver_city", "driver_state", "dl_state", "arrest_type", "geolocation", "hour", "day_of_week", "month"]
data.drop(columns=dropping_these_attributes, inplace= True, errors= "ignore")

For the attributes I did keep, for the categorical attributes I want to check for uniqueness of categories. 

In [4]:
#I want to test this section multiple times

#Checking the values in my categorical attributes to ensure correct groupings

print("Unique Accident Values")
print(data["accident"].unique())
print('\n')

print("Unique Belts Values")
print(data["belts"].unique())
print('\n')

print("Unique Personal Injury Values")
print(data["personal_injury"].unique())
print('\n')
	
print("Unique Property Damage Values")
print(data["property_damage"].unique())
print('\n')

print("Unique Fatal Values")
print(data["fatal"].unique())
print('\n')

print("Unique HAZMAT Values")
print(data["hazmat"].unique())
print('\n')

print("Unique Alcohol Values")
print(data["alcohol"].unique())
print('\n')

#Checking for missing fields
print('========== Missing Value Counts ==========')
print(data.isnull().sum())
print('\n')

#converting the yes's and no's to 1's and 0's
list_of_yes_no_columns = ["accident", "belts", "personal_injury", "property_damage", "fatal","hazmat", "alcohol"]
data[list_of_yes_no_columns] = data[list_of_yes_no_columns].replace({True: 1, False: 0})

#casting datatypes and downcasting for reduced memory usage
data[list_of_yes_no_columns] = data[list_of_yes_no_columns].astype("int32")

lat_long = ["latitude", "longitude"]
data[lat_long] = data[lat_long].astype("float32")

#After 
#Testing for changes
print("Exploring the Data")
print(data.info())
print('\n')

Unique Accident Values
[False  True]


Unique Belts Values
[False  True]


Unique Personal Injury Values
[False  True]


Unique Property Damage Values
[False  True]


Unique Fatal Values
[False  True]


Unique HAZMAT Values
[False  True]


Unique Alcohol Values
[False  True]


seqid                            0
date_of_stop                     0
time_of_stop                     0
agency                           0
subagency                        0
description                     10
location                         1
latitude                         0
longitude                        0
accident                         0
belts                            0
personal_injury                  0
property_damage                  0
fatal                            0
commercial_license               0
hazmat                           0
commercial_vehicle               0
alcohol                          0
work_zone                        0
search_conducted            621547
search_disposition    

  data[list_of_yes_no_columns] = data[list_of_yes_no_columns].replace({True: 1, False: 0})


Exploring the Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462400 entries, 0 to 1462399
Data columns (total 46 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   seqid                    1462400 non-null  object 
 1   date_of_stop             1462400 non-null  object 
 2   time_of_stop             1462400 non-null  object 
 3   agency                   1462400 non-null  object 
 4   subagency                1462400 non-null  object 
 5   description              1462390 non-null  object 
 6   location                 1462399 non-null  object 
 7   latitude                 1462400 non-null  float32
 8   longitude                1462400 non-null  float32
 9   accident                 1462400 non-null  int32  
 10  belts                    1462400 non-null  int32  
 11  personal_injury          1462400 non-null  int32  
 12  property_damage          1462400 non-null  int32  
 13  fatal                  

For the attributes I am using for this feature, there are no missing or duplicate category values

In [5]:
#I only want to do this section once

#updating the csv with the changes 
data.to_csv("feature_specific_cleaned_traffic_violations.csv", index=False)

In [7]:
data

Unnamed: 0,seqid,date_of_stop,time_of_stop,agency,subagency,description,location,latitude,longitude,accident,...,race,gender,driver_city,driver_state,dl_state,arrest_type,geolocation,hour,day_of_week,month
0,52282e8c-f2e1-4bb5-8509-2d5e4f8da8ca,2023-05-01,23:11:00,MCP,"3rd District, Silver Spring",OPERATING UNREGISTERED MOTOR VEHICLE ON HIGHWAY,BRIGGS CHANEY RD @ COLUMIBA PIKE,0.000000,0.000000,0,...,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)",,Monday,5
1,9be35886-e00c-49c2-8f27-2f6307696a17,2023-11-25,00:20:00,MCP,"6th District, Gaithersburg / Montgomery Village",FAILURE TO DISPLAY REGISTRATION CARD UPON DEMA...,GEORGIA AVE / WEISMAN RD,39.052963,-77.051308,0,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0529625, -77.0513041666667)",,Saturday,11
2,9be35886-e00c-49c2-8f27-2f6307696a17,2023-11-25,00:20:00,MCP,"6th District, Gaithersburg / Montgomery Village",DISPLAYING EXPIRED REGISTRATION PLATE ISSUED B...,GEORGIA AVE / WEISMAN RD,39.052963,-77.051308,0,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0529625, -77.0513041666667)",,Saturday,11
3,4d37fa99-0df3-4a56-9ba6-692bce894a34,2023-11-26,09:16:00,MCP,"4th District, Wheaton",DRIVING VEHICLE WHILE UNDER THE INFLUENCE OF A...,3803 WELLER RD,39.058380,-77.049652,0,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0583783333333, -77.0496516666667)",,Sunday,11
4,3a723e9a-5dc0-4bc3-9bd9-4555d6ce0e49,2023-11-25,05:45:00,MCP,"4th District, Wheaton",RECKLESS DRIVING VEHICLE IN WANTON AND WILLFUL...,OLNEY LAYTONSVILLE RD @ FIELDCREST RD,0.000000,0.000000,0,...,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)",,Saturday,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1462395,d01695a7-b38a-4b3a-a965-295286c371ed,2016-06-27,15:18:00,MCP,"1st District, Rockville",DRIVING VEH. ON HWY. WITH SUSPENDED REGISTRATION,DARNESTOWN RD / GREAT SENECA HWY,39.096512,-77.202515,0,...,WHITE,F,POTOMAC,MD,MD,A - Marked Patrol,"(39.0965116666667, -77.202515)",,Monday,6
1462396,fbe051bd-36f2-4a9a-934c-d53b4d8984aa,2018-12-09,17:23:00,MCP,"3rd District, Silver Spring",DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGI...,SB 29 AT TECH RD,39.058155,-76.968201,0,...,WHITE,F,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0581566666667, -76.9681983333333)",,Sunday,12
1462397,6f4a0eb8-302f-4ab7-9607-3d2bb77278e2,2014-08-03,01:12:00,MCP,"4th District, Wheaton",FAILURE TO EQUIP VEH. WITH REQUIRED REAR STOP ...,GRANDVIEW AVE @ ENNALLS AVE,39.040470,-77.052399,0,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0404716666667, -77.0523966666667)",,Sunday,8
1462398,ec9ff018-90b7-4bc2-8120-7f95066c2682,2019-10-29,21:28:00,MCP,"5th District, Germantown",FAILURE OF VEH. ON HWY. TO DISPLAY LIGHTED LAM...,CRYSTAL ROCK/118,39.180729,-77.262192,0,...,BLACK,M,GERMANTOWN,MD,MD,A - Marked Patrol,"(39.18073, -77.26219)",,Tuesday,10


Model 1: Logic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

In [9]:
features = ["latitude", "longitude"]
violations = ["accident", "belts", "personal_injury", "property_damage", "fatal", "hazmat","alcohol"]

X = data[features].values
y = data[violations].values

#Training the model
models={}
for loc, violation in enumerate(violations):
  logr = LogisticRegression(max_iter=1000)
  logr.fit(X, y [:, loc])
  models[violation] = logr

def probability_function (logr, user_input_values):
  return logr.predict_proba([user_input_values])[0][1]
   

#User Inputs
print("Enter your location details: Latitude and Longitude \n")

latitude_input_value = float(input("Enter your Latitude: "))
longitude_input_value = float(input("Enter your Longitude: "))

user_input_values = [latitude_input_value, longitude_input_value]

result = []
for v in violations:
  p = probability_function(models[v], user_input_values)
  result.append(p)

top_viol = sorted(zip(violations, result), key=lambda p: p[1], reverse=True)[:4]

print("Top Most Likely Traffic Violations Based on Location")
print("\n")

for vio, prob in top_viol:
  print(f"{vio}: {prob:.2f}")

Enter your location details: Latitude and Longitude 

Top Most Likely Traffic Violations Based on Location


belts: 0.04
accident: 0.03
property_damage: 0.03
personal_injury: 0.02


Model 2: Random Forest

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
features = ["latitude", "longitude"]
violations = ["accident", "belts", "personal_injury", "property_damage", "fatal", "hazmat","alcohol"]

X = data[features].values
y = data[violations].values

#Training the model
models={}
for loc, violation in enumerate(violations):
  rfc = RandomForestClassifier(n_estimators= 10, random_state=5)
  rfc.fit(X, y [:, loc])
  models[violation] = rfc

def probability_function (rfc, user_input_values):
  return rfc.predict_proba([user_input_values])[0][1]
   

#User Inputs
print("Enter your location details: Latitude and Longitude \n")

latitude_input_value = float(input("Enter your Latitude: "))
longitude_input_value = float(input("Enter your Longitude: "))

user_input_values = [latitude_input_value, longitude_input_value]

result = []
for v in violations:
  p = probability_function(models[v], user_input_values)
  result.append(p)

top_viol = sorted(zip(violations, result), key=lambda p: p[1], reverse=True)[:4]

print("Top Most Likely Traffic Violations Based on Location")
print("\n")

for vio, prob in top_viol:
  print(f"{vio}: {prob:.2f}")


NameError: name 'data' is not defined

Model 3: Gradient Boosting

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
features = ["latitude", "longitude"]
violations = ["accident", "belts", "personal_injury", "property_damage", "fatal", "hazmat","alcohol"]

X = data[features].values
y = data[violations].values

#Training the model
models={}
for loc, violation in enumerate(violations):
  gbc = GradientBoostingClassifier(n_estimators= 10, learning_rate=0.2)
  gbc.fit(X, y [:, loc])
  models[violation] = gbc

def probability_function (gbc, user_input_values):
  return gbc.predict_proba([user_input_values])[0][1]
   

#User Inputs
print("Enter your location details: Latitude and Longitude \n")

latitude_input_value = float(input("Enter your Latitude: "))
longitude_input_value = float(input("Enter your Longitude: "))

user_input_values = [latitude_input_value, longitude_input_value]

result = []
for v in violations:
  p = probability_function(models[v], user_input_values)
  result.append(p)

top_viol = sorted(zip(violations, result), key=lambda p: p[1], reverse=True)[:4]

print("Top Most Likely Traffic Violations Based on Location")
print("\n")

for vio, prob in top_viol:
  print(f"{vio}: {prob:.2f}")


Enter your location details: Latitude and Longitude 



Models Accuracy, Mean Absoute Error and Root Mean Squre Error

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

#training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=5)

#Setting the models

m1= RandomForestClassifier(n_estimators=10, random_state=5)
m2= GradientBoostingClassifier(n_estimators=10, learning_rate= 0.2)
m3= LogisticRegression(max_iter=10)

models_set = {
    "Random Forest Classifier":m1,
    "Gradient Boosting Classifier":m2,
    "Logistic Regression":m3
}

for v, violation in enumerate(violations):
    print(f"{violation}")
    
    for name, model in models_set.items():
        print(f"{name}")

        model.fit(X_train, y_train[:, v])

        m_pred = model.predict(X_test)

        model_accuracy = accuracy_score(y_test[:, v], m_pred)
        mae = mean_absolute_error(y_test[:, v], m_pred)
        rmse = np.sqrt(mean_squared_error(y_test[:, v], m_pred))

        print(f"Accuracy: {model_accuracy: .2f}")
        print(f"MAE:{mae: .2f}")
        print(f"RSME: {rmse: .2f}")
        print("\n")
    

accident
Random Forest Classifier
Accuracy:  0.98
MAE: 0.02
RSME:  0.13


Gradient Boosting Classifier
Accuracy:  0.97
MAE: 0.03
RSME:  0.17


Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:  0.97
MAE: 0.03
RSME:  0.17


belts
Random Forest Classifier
Accuracy:  0.98
MAE: 0.02
RSME:  0.13


Gradient Boosting Classifier
Accuracy:  0.97
MAE: 0.03
RSME:  0.18


Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:  0.97
MAE: 0.03
RSME:  0.18


personal_injury
Random Forest Classifier
Accuracy:  0.99
MAE: 0.01
RSME:  0.07


Gradient Boosting Classifier
Accuracy:  0.99
MAE: 0.01
RSME:  0.11


Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:  0.99
MAE: 0.01
RSME:  0.11


property_damage
Random Forest Classifier
Accuracy:  0.99
MAE: 0.01
RSME:  0.10


Gradient Boosting Classifier
Accuracy:  0.98
MAE: 0.02
RSME:  0.15


Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:  0.98
MAE: 0.02
RSME:  0.15


fatal
Random Forest Classifier
Accuracy:  1.00
MAE: 0.00
RSME:  0.01


Gradient Boosting Classifier
Accuracy:  1.00
MAE: 0.00
RSME:  0.02


Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:  1.00
MAE: 0.00
RSME:  0.02


hazmat
Random Forest Classifier
Accuracy:  1.00
MAE: 0.00
RSME:  0.01


Gradient Boosting Classifier
Accuracy:  1.00
MAE: 0.00
RSME:  0.01


Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:  1.00
MAE: 0.00
RSME:  0.01


alcohol
Random Forest Classifier
Accuracy:  1.00
MAE: 0.00
RSME:  0.02


Gradient Boosting Classifier
Accuracy:  1.00
MAE: 0.00
RSME:  0.04


Logistic Regression
Accuracy:  1.00
MAE: 0.00
RSME:  0.04




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
import joblib
joblib.dump(models, 'logistic_regression_violation_model.pkl')

['logistic_regression_violation_model.pkl']