Cleaning the dataset

Reading the data from the csv and checking to make sure that I am accessing the dataset and identifying the current data types and removing any attributes I do not need.

In [32]:
#I only want to do this section once

import pandas as pd

#Reading from the csv & loading the dataframe
data = pd.read_csv('Traffic_Violations.csv')

#Testing if it read from the csv
print(data.head(15)) 
print('\n')

#Before 

#Identifying the current data types
print("Exploring the Data")
print(data.info())
print('\n')

#for this feature I only require these attributes [Latitude, Longitude, Accident, Belts, Personal Injury, Property Damage, Fatal, HAZMAT, Alcohol]
dropping_these_attributes = ["SeqID", "Date Of Stop", "Location", "Violation Type,", "Time Of Stop", "Agency", "SubAgency", "Description", "Commercial License", "Commercial Vehicle", "Work Zone", "Search Conducted", "Search Disposition", "Search Outcome", "Search Reason", "Search Reason For Stop", "Search Type", "Search Arrest Reason", "State", "VehicleType", "Year", "Make", "Model", "Color", "Charge", "Article", "Contributed To Accident", "Race", "Gender", "Driver City", "Driver State", "DL State", "Arrest Type", "Geolocation"]
data.drop(columns=dropping_these_attributes, inplace= True, errors= "ignore")

                                   SeqID Date Of Stop Time Of Stop Agency  \
0   52282e8c-f2e1-4bb5-8509-2d5e4f8da8ca   05/01/2023     23:11:00    MCP   
1   9be35886-e00c-49c2-8f27-2f6307696a17   11/25/2023     00:20:00    MCP   
2   9be35886-e00c-49c2-8f27-2f6307696a17   11/25/2023     00:20:00    MCP   
3   4d37fa99-0df3-4a56-9ba6-692bce894a34   11/26/2023     09:16:00    MCP   
4   3a723e9a-5dc0-4bc3-9bd9-4555d6ce0e49   11/25/2023     05:45:00    MCP   
5   3a723e9a-5dc0-4bc3-9bd9-4555d6ce0e49   11/25/2023     05:45:00    MCP   
6   66273a8e-980e-413e-8928-56447e3be407   11/25/2023     01:21:00    MCP   
7   78cdf309-9fe8-46de-892e-19e9b1bafacd   11/25/2023     03:22:00    MCP   
8   78cdf309-9fe8-46de-892e-19e9b1bafacd   11/25/2023     03:22:00    MCP   
9   1e24ec33-6a13-483f-9fea-0c2cdc5b2a14   11/24/2023     23:25:00    MCP   
10  1e24ec33-6a13-483f-9fea-0c2cdc5b2a14   11/24/2023     23:25:00    MCP   
11  1e24ec33-6a13-483f-9fea-0c2cdc5b2a14   11/24/2023     23:25:00    MCP   

For the attributes I did keep, for the categorical attributes I want to check for uniqueness of categories. 

In [33]:
#I want to test this section multiple times

#Checking the values in my categorical attributes to ensure correct groupings

print("Unique Accident Values")
print(data["Accident"].unique())
print('\n')

print("Unique Belts Values")
print(data["Belts"].unique())
print('\n')

print("Unique Personal Injury Values")
print(data["Personal Injury"].unique())
print('\n')
	
print("Unique Property Damage Values")
print(data["Property Damage"].unique())
print('\n')

print("Unique Fatal Values")
print(data["Fatal"].unique())
print('\n')

print("Unique HAZMAT Values")
print(data["HAZMAT"].unique())
print('\n')

print("Unique Alcohol Values")
print(data["Alcohol"].unique())
print('\n')

#Checking for missing fields
print('========== Missing Value Counts ==========')
print(data.isnull().sum())
print('\n')

#converting the yes's and no's to 1's and 0's
list_of_yes_no_columns = ["Accident", "Belts", "Personal Injury", "Property Damage", "Fatal","HAZMAT", "Alcohol"]
data[list_of_yes_no_columns] = data[list_of_yes_no_columns].replace({"Yes": 1, "No": 0})

#casting datatypes and downcasting for reduced memory usage
data[list_of_yes_no_columns] = data[list_of_yes_no_columns].astype("int32")

lat_long = ["Latitude", "Longitude"]
data[lat_long] = data[lat_long].astype("float32")

#After 
#Testing for changes
print("Exploring the Data")
print(data.info())
print('\n')

Unique Accident Values
['No' 'Yes']


Unique Belts Values
['No' 'Yes']


Unique Personal Injury Values
['No' 'Yes']


Unique Property Damage Values
['No' 'Yes']


Unique Fatal Values
['No' 'Yes']


Unique HAZMAT Values
['No' 'Yes']


Unique Alcohol Values
['No' 'Yes']


Latitude           0
Longitude          0
Accident           0
Belts              0
Personal Injury    0
Property Damage    0
Fatal              0
HAZMAT             0
Alcohol            0
Violation Type     0
dtype: int64




  data[list_of_yes_no_columns] = data[list_of_yes_no_columns].replace({"Yes": 1, "No": 0})


Exploring the Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1996597 entries, 0 to 1996596
Data columns (total 10 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Latitude         float32
 1   Longitude        float32
 2   Accident         int32  
 3   Belts            int32  
 4   Personal Injury  int32  
 5   Property Damage  int32  
 6   Fatal            int32  
 7   HAZMAT           int32  
 8   Alcohol          int32  
 9   Violation Type   object 
dtypes: float32(2), int32(7), object(1)
memory usage: 83.8+ MB
None




For the attributes I am using for this feature, there are no missing or duplicate category values

In [34]:
#I only want to do this section once

#updating the csv with the changes 
data.to_csv("Traffic_Violations.csv", index= False)

Model 1: Logic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
features = ["Latitude", "Longitude"]
violations = ["Accident", "Belts", "Personal Injury", "Property Damage", "Fatal", "HAZMAT","Alcohol"]

X = data[features].values
y = data[violations].values

#Training the model
models={}
for loc, violation in enumerate(violations):
  logr = LogisticRegression(max_iter=1000)
  logr.fit(X, y [:, loc])
  models[violation] = logr

def probability_function (logr, user_input_values):
  return logr.predict_proba([user_input_values])[0][1]
   

#User Inputs
print("Enter your location details: Latitude and Longitude \n")

latitude_input_value = float(input("Enter your Latitude: "))
longitude_input_value = float(input("Enter your Longitude: "))

user_input_values = [latitude_input_value, longitude_input_value]

result = []
for v in violations:
  p = probability_function(models[v], user_input_values)
  result.append(p)

top_viol = sorted(zip(violations, result), key=lambda p: p[1], reverse=True)[:4]

print("Top Most Likely Traffic Violations Based on Location")
print("\n")

for vio, prob in top_viol:
  print(f"{vio}: {prob:.2f}")


Enter your location details: Latitude and Longitude 

Top Most Likely Traffic Violations Based on Location


Belts: 1.00
Accident: 0.00
Personal Injury: 0.00
Property Damage: 0.00


Model 2: Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
features = ["Latitude", "Longitude"]
violations = ["Accident", "Belts", "Personal Injury", "Property Damage", "Fatal", "HAZMAT","Alcohol"]

X = data[features].values
y = data[violations].values

#Training the model
models={}
for loc, violation in enumerate(violations):
  rfc = RandomForestClassifier(n_estimators= 10, random_state=5)
  rfc.fit(X, y [:, loc])
  models[violation] = rfc

def probability_function (rfc, user_input_values):
  return rfc.predict_proba([user_input_values])[0][1]
   

#User Inputs
print("Enter your location details: Latitude and Longitude \n")

latitude_input_value = float(input("Enter your Latitude: "))
longitude_input_value = float(input("Enter your Longitude: "))

user_input_values = [latitude_input_value, longitude_input_value]

result = []
for v in violations:
  p = probability_function(models[v], user_input_values)
  result.append(p)

top_viol = sorted(zip(violations, result), key=lambda p: p[1], reverse=True)[:4]

print("Top Most Likely Traffic Violations Based on Location")
print("\n")

for vio, prob in top_viol:
  print(f"{vio}: {prob:.2f}")


Enter your location details: Latitude and Longitude 

Top Most Likely Traffic Violations Based on Location


Personal Injury: 0.50
Accident: 0.00
Belts: 0.00
Property Damage: 0.00


Model 3: Gradient Boosting

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
features = ["Latitude", "Longitude"]
violations = ["Accident", "Belts", "Personal Injury", "Property Damage", "Fatal", "HAZMAT","Alcohol"]

X = data[features].values
y = data[violations].values

#Training the model
models={}
for loc, violation in enumerate(violations):
  gbc = GradientBoostingClassifier(n_estimators= 10, learning_rate=0.2)
  gbc.fit(X, y [:, loc])
  models[violation] = gbc

def probability_function (gbc, user_input_values):
  return gbc.predict_proba([user_input_values])[0][1]
   

#User Inputs
print("Enter your location details: Latitude and Longitude \n")

latitude_input_value = float(input("Enter your Latitude: "))
longitude_input_value = float(input("Enter your Longitude: "))

user_input_values = [latitude_input_value, longitude_input_value]

result = []
for v in violations:
  p = probability_function(models[v], user_input_values)
  result.append(p)

top_viol = sorted(zip(violations, result), key=lambda p: p[1], reverse=True)[:4]

print("Top Most Likely Traffic Violations Based on Location")
print("\n")

for vio, prob in top_viol:
  print(f"{vio}: {prob:.2f}")


Enter your location details: Latitude and Longitude 

Top Most Likely Traffic Violations Based on Location


Accident: 0.03
Belts: 0.03
Property Damage: 0.02
Personal Injury: 0.01


Models Accuracy, Mean Absoute Error and Root Mean Squre Error

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

#training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=5)

#Setting the models

m1= RandomForestClassifier(n_estimators=10, random_state=5)
m2= GradientBoostingClassifier(n_estimators=10, learning_rate= 0.2)
m3= LogisticRegression(max_iter=10)

models_set = {
    "Random Forest Classifier":m1,
    "Gradient Boosting Classifier":m2,
    "Logistic Regression":m3
}

for v, violation in enumerate(violations):
    print(f"{violation}")
    
    for name, model in models_set.items():
        print(f"{name}")

        model.fit(X_train, y_train[:, v])

        m_pred = model.predict(X_test)

        model_accuracy = accuracy_score(y_test[:, v], m_pred)
        mae = mean_absolute_error(y_test[:, v], m_pred)
        rmse = np.sqrt(mean_squared_error(y_test[:, v], m_pred))

        print(f"Accuracy: {model_accuracy: .2f}")
        print(f"MAE:{mae: .2f}")
        print(f"RSME: {rmse: .2f}")
        print("\n")
    

Accident
Random Forest Classifier


ValueError: Classification metrics can't handle a mix of multilabel-indicator and binary targets