In [1]:
import pandas as pd

df = pd.read_csv("Traffic_Violations.csv")

# Standardizeing column name
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

In [2]:
# Converting date and time
df['date_of_stop'] = pd.to_datetime(df['date_of_stop'], errors='coerce')
df['time_of_stop'] = pd.to_datetime(df['time_of_stop'], format='%H:%M:%S', errors='coerce').dt.time

# Extracting the features for forecasting
df['hour'] = pd.to_datetime(df['time_of_stop'], errors='coerce').dt.hour
df['day_of_week'] = df['date_of_stop'].dt.day_name()
df['month'] = df['date_of_stop'].dt.month

# Converting coordinates to numeric for mapping
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# Convert the Yes/No or True/False columns to Boolean
bool_cols = ['accident', 'belts', 'personal_injury', 'property_damage', 'fatal',
             'commercial_license', 'hazmat', 'commercial_vehicle', 'alcohol', 'work_zone',
             'search_conducted', 'search_person', 'search_vehicle', 'contraband_found',
             'attributed_to_accident']
for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False})


# Filling in missing categorical values
fill_cols = ['gender', 'race', 'driver_city', 'driver_state', 'vehicle_type', 'make', 'model', 'arrest_type']
for col in fill_cols:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')

# Dropping invalid or missing essential values
df.dropna(subset=['date_of_stop', 'latitude', 'longitude'], inplace=True)
df.drop_duplicates(inplace=True)


In [3]:

df.to_csv("cleaned_traffic_violations.csv", index=False)
print("Cleaned dataset saved as 'cleaned_traffic_violations.csv'")

Cleaned dataset saved as 'cleaned_traffic_violations.csv'


In [4]:
df2 = pd.read_csv("cleaned_traffic_violations.csv")

In [5]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994381 entries, 0 to 1994380
Data columns (total 46 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   seqid                    object 
 1   date_of_stop             object 
 2   time_of_stop             object 
 3   agency                   object 
 4   subagency                object 
 5   description              object 
 6   location                 object 
 7   latitude                 float64
 8   longitude                float64
 9   accident                 bool   
 10  belts                    bool   
 11  personal_injury          bool   
 12  property_damage          bool   
 13  fatal                    bool   
 14  commercial_license       bool   
 15  hazmat                   bool   
 16  commercial_vehicle       bool   
 17  alcohol                  bool   
 18  work_zone                bool   
 19  search_conducted         object 
 20  search_disposition       object 
 21  search_o

In [6]:
df2.head()

Unnamed: 0,seqid,date_of_stop,time_of_stop,agency,subagency,description,location,latitude,longitude,accident,...,race,gender,driver_city,driver_state,dl_state,arrest_type,geolocation,hour,day_of_week,month
0,52282e8c-f2e1-4bb5-8509-2d5e4f8da8ca,2023-05-01,23:11:00,MCP,"3rd District, Silver Spring",OPERATING UNREGISTERED MOTOR VEHICLE ON HIGHWAY,BRIGGS CHANEY RD @ COLUMIBA PIKE,0.0,0.0,False,...,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)",,Monday,5
1,9be35886-e00c-49c2-8f27-2f6307696a17,2023-11-25,00:20:00,MCP,"6th District, Gaithersburg / Montgomery Village",FAILURE TO DISPLAY REGISTRATION CARD UPON DEMA...,GEORGIA AVE / WEISMAN RD,39.052962,-77.051304,False,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0529625, -77.0513041666667)",,Saturday,11
2,9be35886-e00c-49c2-8f27-2f6307696a17,2023-11-25,00:20:00,MCP,"6th District, Gaithersburg / Montgomery Village",DISPLAYING EXPIRED REGISTRATION PLATE ISSUED B...,GEORGIA AVE / WEISMAN RD,39.052962,-77.051304,False,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0529625, -77.0513041666667)",,Saturday,11
3,4d37fa99-0df3-4a56-9ba6-692bce894a34,2023-11-26,09:16:00,MCP,"4th District, Wheaton",DRIVING VEHICLE WHILE UNDER THE INFLUENCE OF A...,3803 WELLER RD,39.058378,-77.049652,False,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0583783333333, -77.0496516666667)",,Sunday,11
4,3a723e9a-5dc0-4bc3-9bd9-4555d6ce0e49,2023-11-25,05:45:00,MCP,"4th District, Wheaton",RECKLESS DRIVING VEHICLE IN WANTON AND WILLFUL...,OLNEY LAYTONSVILLE RD @ FIELDCREST RD,0.0,0.0,False,...,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)",,Saturday,11


In [7]:
#Extract day name from date
day_map = {'Monday':0,'Tuesday':1,'Wednesday':2,'Thursday':3,'Friday':4,'Saturday':5,'Sunday':6}

df2['day_of_week'] = df2['day_of_week'].map(day_map)


In [8]:
from sklearn.cluster import KMeans

# Cluster locations into 50 zones
coords = df2[['latitude', 'longitude']]
kmeans = KMeans(n_clusters=50, random_state=42).fit(coords)
df2['location_cluster'] = kmeans.labels_

#### This line extracts the hour from the time_of_stop column, which is assumed to be in the format 'HH:MM:SS'

In [10]:
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.max_colwidth", None)  # Show full cell content

In [11]:
df2['hour'] = pd.to_datetime(df2['time_of_stop'],format='%H:%M:%S', errors='coerce').dt.hour

In [12]:
risk_counts = df2.groupby(['day_of_week','hour']).size().reset_index(name='violation_count')

threshold = risk_counts['violation_count'].quantile(0.75)

risk_counts['high_risk'] = (risk_counts['violation_count'] >= threshold).astype(int)

In [13]:
df2 = df2.merge(risk_counts[['day_of_week','hour','high_risk']], on = ['day_of_week','hour'], how = 'left')

In [14]:
df2

Unnamed: 0,seqid,date_of_stop,time_of_stop,agency,subagency,description,location,latitude,longitude,accident,belts,personal_injury,property_damage,fatal,commercial_license,hazmat,commercial_vehicle,alcohol,work_zone,search_conducted,search_disposition,search_outcome,search_reason,search_reason_for_stop,search_type,search_arrest_reason,state,vehicletype,year,make,model,color,violation_type,charge,article,contributed_to_accident,race,gender,driver_city,driver_state,dl_state,arrest_type,geolocation,hour,day_of_week,month,location_cluster,high_risk
0,52282e8c-f2e1-4bb5-8509-2d5e4f8da8ca,2023-05-01,23:11:00,MCP,"3rd District, Silver Spring",OPERATING UNREGISTERED MOTOR VEHICLE ON HIGHWAY,BRIGGS CHANEY RD @ COLUMIBA PIKE,0.000000,0.000000,False,False,False,False,False,False,False,False,False,False,False,,Citation,,17-107(a1),,,MD,02 - Automobile,2007.0,CHEV,CRUZ,BLACK,Citation,13-401(b1),Transportation Article,False,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)",23,0,5,1,1
1,9be35886-e00c-49c2-8f27-2f6307696a17,2023-11-25,00:20:00,MCP,"6th District, Gaithersburg / Montgomery Village",FAILURE TO DISPLAY REGISTRATION CARD UPON DEMAND BY POLICE OFFICER,GEORGIA AVE / WEISMAN RD,39.052962,-77.051304,False,False,False,False,False,False,False,False,False,False,False,,Citation,,13-411(f),,,MD,02 - Automobile,2013.0,TOYOTA,COROLLA,RED,Citation,13-409(b),Transportation Article,False,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0529625, -77.0513041666667)",0,5,11,33,1
2,9be35886-e00c-49c2-8f27-2f6307696a17,2023-11-25,00:20:00,MCP,"6th District, Gaithersburg / Montgomery Village",DISPLAYING EXPIRED REGISTRATION PLATE ISSUED BY ANY STATE,GEORGIA AVE / WEISMAN RD,39.052962,-77.051304,False,False,False,False,False,False,False,False,False,False,False,,Citation,,13-411(f),,,MD,02 - Automobile,2013.0,TOYOTA,COROLLA,RED,Citation,13-411(f),Transportation Article,False,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0529625, -77.0513041666667)",0,5,11,33,1
3,4d37fa99-0df3-4a56-9ba6-692bce894a34,2023-11-26,09:16:00,MCP,"4th District, Wheaton",DRIVING VEHICLE WHILE UNDER THE INFLUENCE OF ALCOHOL,3803 WELLER RD,39.058378,-77.049652,False,False,False,True,False,False,False,False,False,False,True,Property Only,Arrest,Incident to Arrest,20-103(b),Both,Stop,MD,02 - Automobile,2004.0,TOYOTA,HB,SILVER,Citation,21-902(a1i),Transportation Article,False,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0583783333333, -77.0496516666667)",9,6,11,33,0
4,3a723e9a-5dc0-4bc3-9bd9-4555d6ce0e49,2023-11-25,05:45:00,MCP,"4th District, Wheaton",RECKLESS DRIVING VEHICLE IN WANTON AND WILLFUL DISREGARD FOR SAFETY OF PERSONS AND PROPERTY,OLNEY LAYTONSVILLE RD @ FIELDCREST RD,0.000000,0.000000,False,False,False,False,False,False,False,False,False,False,,,,,,,,MD,02 - Automobile,2022.0,MITSUBISHI,ECLIPSE CROSS,BLACK,Citation,21-901.1(a),Transportation Article,False,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)",5,5,11,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994376,7443fc19-bb22-4d8b-a73d-61325e6c6269,2016-03-04,20:11:00,MCP,"3rd District, Silver Spring",TAG LIGHTS (*),SLIGO AVE. @ FENTON ST.,38.990353,-77.024982,False,False,False,False,False,False,False,False,False,False,False,,SERO,,65*,,,MD,02 - Automobile,2005.0,CHEV,TRUCK,WHITE,ESERO,65*,,False,HISPANIC,F,SILVER SPRING,MD,MD,A - Marked Patrol,"(38.9903533333333, -77.0249816666667)",20,4,3,16,0
1994377,15046153-2c14-4c90-8fba-05cc3e178079,2018-11-14,23:21:00,MCP,"5th District, Germantown",HEADLIGHTS (*),DARNESTOWN RD @TRAVILLE GATEWAY DR,39.095365,-77.199667,False,False,False,False,False,False,False,False,False,False,,,,,,,,MD,02 - Automobile,2009.0,HONDA,FIT,SILVER,ESERO,55*,,False,WHITE,F,GAITHERSBURG,MD,MD,A - Marked Patrol,"(39.095365, -77.1996666666667)",23,2,11,24,1
1994378,9ab3f931-1506-400c-b23b-0f1fc567cc89,2019-05-21,22:06:00,MCP,"3rd District, Silver Spring",HEADLIGHTS (*),NEW HAMPSHIRE/VENICE DR,0.000000,0.000000,False,False,False,False,False,False,False,False,False,False,,,,,,,,MD,02 - Automobile,1991.0,TOYOTA,4DR,BLUE,ESERO,55*,,False,HISPANIC,M,HYATTSVILLE,MD,MD,A - Marked Patrol,"(0.0, 0.0)",22,1,5,1,1
1994379,ae19cf7b-3eef-4341-b370-04e5f1a7a558,2017-07-02,13:19:00,MCP,"6th District, Gaithersburg / Montgomery Village",STOP LIGHTS (*),MIDCOUNTY HWY @ WASHINGTON GROVE,39.143907,-77.162000,False,False,False,False,False,False,False,False,False,False,False,,SERO,,64*,,,MD,02 - Automobile,2004.0,TOYOTA,SIENNA,GOLD,ESERO,64*,,False,WHITE,F,GAITHERSBURG,MD,MD,A - Marked Patrol,"(39.1439066666667, -77.162)",13,6,7,32,0


In [15]:
features = ['location_cluster','longitude','latitude']
X = df2[features]
y = df2['high_risk']

In [16]:
df2.groupby('location_cluster')['high_risk'].mean().sort_values(ascending=False)

location_cluster
44    1.000000
2     0.818182
17    0.777778
34    0.538462
37    0.457120
41    0.455163
46    0.454545
26    0.447861
12    0.443820
35    0.443020
3     0.440590
40    0.434184
23    0.429413
47    0.429356
32    0.423567
30    0.420828
14    0.420769
0     0.418643
8     0.417677
16    0.416256
15    0.413999
39    0.409866
38    0.406987
36    0.406524
21    0.404479
27    0.401460
10    0.399664
43    0.394162
11    0.391240
13    0.389411
24    0.387758
1     0.387394
29    0.384954
45    0.382265
4     0.369889
22    0.367449
42    0.366032
28    0.365088
20    0.362975
49    0.355931
33    0.355043
18    0.353194
48    0.344758
19    0.337041
31    0.274924
25    0.133333
9     0.000000
7     0.000000
6     0.000000
5     0.000000
Name: high_risk, dtype: float64

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from datetime import datetime

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)

model = RandomForestClassifier(n_estimators=100,random_state=42)
model.fit(X_train,y_train)

end_time = datetime.now()
print(f"Model finished training at {end_time.strftime('%Y-%m-%d %H:%M:%S')}")

Model finished training at 2025-04-07 08:51:31


In [18]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[206952  29974]
 [ 47743 114208]]
              precision    recall  f1-score   support

           0       0.81      0.87      0.84    236926
           1       0.79      0.71      0.75    161951

    accuracy                           0.81    398877
   macro avg       0.80      0.79      0.79    398877
weighted avg       0.80      0.81      0.80    398877



In [19]:
print("Features used:", X.columns.tolist())

Features used: ['location_cluster', 'longitude', 'latitude']


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    # Predict labels and probabilities
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    # Print or return
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"AUC-ROC: {auc:.2f}")

In [21]:
evaluate_model(model, X_test, y_test)

Accuracy: 0.81
Precision: 0.79
Recall: 0.71
F1 Score: 0.75
AUC-ROC: 0.88


In [26]:
def get_high_risk_locations(model, day_name, hour_input, df, top_n=10):
    # Convert day name to number
    day_map = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
               'Friday': 4, 'Saturday': 5, 'Sunday': 6}
    day_num = day_map[day_name]

    # Prepare input data
    location_df = df[['location', 'location_cluster', 'latitude', 'longitude']].drop_duplicates()

    location_df['day_of_week'] = day_num
    location_df['hour'] = hour_input

    input_features = ['location_cluster', 'longitude', 'latitude']
    X_input = location_df[input_features]

    location_df['risk_probability'] = model.predict_proba(X_input)[:, 1]

    return location_df.sort_values(by='risk_probability', ascending=False).head(top_n)[
        ['location', 'latitude', 'longitude', 'risk_probability']
    ]

In [28]:
top_locations = get_high_risk_locations(model=model,day_name="Tuesday",hour_input= 17,df=df2, top_n=10)

top_locations

Unnamed: 0,location,latitude,longitude,risk_probability
1636294,UNIVERSITY BLVD @ KING GEORGE,39.03852,-77.041368,1.0
1560106,LEXINGTON ST AT PERRY AVE,39.032012,-77.072352,1.0
1776694,I-270 NB EXIT RAMP/TOWER OAKS BLVD,39.05678,-77.150224,1.0
686849,SHADY GROVE RD @ GAITHER RD,39.117655,-77.184462,1.0
435524,1ST ST / NORBECK RD,39.085727,-77.135848,1.0
915607,LN 1 NB 355 @ 124,39.152127,-77.211228,1.0
824580,DARNESTOWN RD @ WHITE GROUND RD,39.137292,-77.344168,1.0
762095,BROSCHART RD @ KEY WEST AVE,39.105928,-77.20183,1.0
1590598,SECOND AVE @ 16TH ST,39.001925,-77.038827,1.0
686856,EAST MELBOURNE AVE ON UNIVERSITY BLVD,39.010081,-77.000665,1.0
