In [1]:
import pandas as pd

df = pd.read_csv("Traffic_Violations.csv")

# Standardizeing column name
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

In [2]:
# Converting date and time
df['date_of_stop'] = pd.to_datetime(df['date_of_stop'], errors='coerce')
df['time_of_stop'] = pd.to_datetime(df['time_of_stop'], format='%H:%M:%S', errors='coerce').dt.time

# Extracting the features for forecasting
df['hour'] = pd.to_datetime(df['time_of_stop'], errors='coerce').dt.hour
df['day_of_week'] = df['date_of_stop'].dt.day_name()
df['month'] = df['date_of_stop'].dt.month

# Converting coordinates to numeric for mapping
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# Convert the Yes/No or True/False columns to Boolean
bool_cols = ['accident', 'belts', 'personal_injury', 'property_damage', 'fatal',
             'commercial_license', 'hazmat', 'commercial_vehicle', 'alcohol', 'work_zone',
             'search_conducted', 'search_person', 'search_vehicle', 'contraband_found',
             'attributed_to_accident']
for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.lower().map({'yes': True, 'no': False, 'true': True, 'false': False})


# Filling in missing categorical values
fill_cols = ['gender', 'race', 'driver_city', 'driver_state', 'vehicle_type', 'make', 'model', 'arrest_type']
for col in fill_cols:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')

# Dropping invalid or missing essential values
df.dropna(subset=['date_of_stop', 'latitude', 'longitude'], inplace=True)
df.drop_duplicates(inplace=True)


In [3]:

df.to_csv("cleaned_traffic_violations.csv", index=False)
print("Cleaned dataset saved as 'cleaned_traffic_violations.csv'")

Cleaned dataset saved as 'cleaned_traffic_violations.csv'


In [4]:
df2 = pd.read_csv("cleaned_traffic_violations.csv")

In [5]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994381 entries, 0 to 1994380
Data columns (total 46 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   seqid                    object 
 1   date_of_stop             object 
 2   time_of_stop             object 
 3   agency                   object 
 4   subagency                object 
 5   description              object 
 6   location                 object 
 7   latitude                 float64
 8   longitude                float64
 9   accident                 bool   
 10  belts                    bool   
 11  personal_injury          bool   
 12  property_damage          bool   
 13  fatal                    bool   
 14  commercial_license       bool   
 15  hazmat                   bool   
 16  commercial_vehicle       bool   
 17  alcohol                  bool   
 18  work_zone                bool   
 19  search_conducted         object 
 20  search_disposition       object 
 21  search_o

In [6]:
df2.head()

Unnamed: 0,seqid,date_of_stop,time_of_stop,agency,subagency,description,location,latitude,longitude,accident,...,race,gender,driver_city,driver_state,dl_state,arrest_type,geolocation,hour,day_of_week,month
0,52282e8c-f2e1-4bb5-8509-2d5e4f8da8ca,2023-05-01,23:11:00,MCP,"3rd District, Silver Spring",OPERATING UNREGISTERED MOTOR VEHICLE ON HIGHWAY,BRIGGS CHANEY RD @ COLUMIBA PIKE,0.0,0.0,False,...,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)",,Monday,5
1,9be35886-e00c-49c2-8f27-2f6307696a17,2023-11-25,00:20:00,MCP,"6th District, Gaithersburg / Montgomery Village",FAILURE TO DISPLAY REGISTRATION CARD UPON DEMA...,GEORGIA AVE / WEISMAN RD,39.052962,-77.051304,False,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0529625, -77.0513041666667)",,Saturday,11
2,9be35886-e00c-49c2-8f27-2f6307696a17,2023-11-25,00:20:00,MCP,"6th District, Gaithersburg / Montgomery Village",DISPLAYING EXPIRED REGISTRATION PLATE ISSUED B...,GEORGIA AVE / WEISMAN RD,39.052962,-77.051304,False,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0529625, -77.0513041666667)",,Saturday,11
3,4d37fa99-0df3-4a56-9ba6-692bce894a34,2023-11-26,09:16:00,MCP,"4th District, Wheaton",DRIVING VEHICLE WHILE UNDER THE INFLUENCE OF A...,3803 WELLER RD,39.058378,-77.049652,False,...,HISPANIC,M,SILVER SPRING,MD,MD,A - Marked Patrol,"(39.0583783333333, -77.0496516666667)",,Sunday,11
4,3a723e9a-5dc0-4bc3-9bd9-4555d6ce0e49,2023-11-25,05:45:00,MCP,"4th District, Wheaton",RECKLESS DRIVING VEHICLE IN WANTON AND WILLFUL...,OLNEY LAYTONSVILLE RD @ FIELDCREST RD,0.0,0.0,False,...,WHITE,M,GAITHERSBURG,MD,MD,A - Marked Patrol,"(0.0, 0.0)",,Saturday,11


In [7]:
#Extract day name from date
day_map = {'Monday':0,'Tuesday':1,'Wednesday':2,'Thursday':3,'Friday':4,'Saturday':5,'Sunday':6}

df2['day_of_week'] = df2['day_of_week'].map(day_map)

In [8]:
from sklearn.cluster import KMeans

# Cluster locations into 50 zones
coords = df2[['latitude', 'longitude']]
kmeans = KMeans(n_clusters=50, random_state=42).fit(coords)
df2['location_cluster'] = kmeans.labels_

#### This line extracts the hour from the time_of_stop column, which is assumed to be in the format 'HH:MM:SS'

In [9]:
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.max_colwidth", None)  # Show full cell content

In [10]:
df2['hour'] = pd.to_datetime(df2['time_of_stop'],format='%H:%M:%S', errors='coerce').dt.hour

In [11]:
# Create high-risk labels using day, hour, and month
risk_counts = df2.groupby(['day_of_week', 'hour', 'month']).size().reset_index(name='violation_count')

# Define a threshold for high-risk (e.g., top 25% of counts)
threshold = risk_counts['violation_count'].quantile(0.75)
risk_counts['high_risk'] = (risk_counts['violation_count'] >= threshold).astype(int)

# Merge back to original dataframe
df2 = df2.merge(risk_counts[['day_of_week', 'hour', 'month', 'high_risk']],
                on=['day_of_week', 'hour', 'month'], how='left')


In [12]:
features = ['location_cluster', 'longitude', 'latitude', 'month']
X = df2[features]
y = df2['high_risk']

In [13]:
df2.groupby('location_cluster')['high_risk'].mean().sort_values(ascending=False)


location_cluster
44    1.000000
2     0.893939
17    0.777778
34    0.538462
41    0.482726
37    0.466746
12    0.465659
3     0.458571
26    0.456981
46    0.454545
40    0.453505
47    0.445512
35    0.444749
23    0.442311
32    0.440725
8     0.433952
21    0.433618
0     0.432421
14    0.431979
30    0.430945
16    0.428830
15    0.424545
10    0.421265
39    0.419011
36    0.417910
13    0.412715
38    0.409602
43    0.409310
11    0.408421
45    0.405747
27    0.405042
24    0.403700
29    0.394087
28    0.392704
1     0.389512
48    0.389113
18    0.384386
20    0.383586
22    0.380618
42    0.377047
49    0.374569
33    0.363629
4     0.362395
19    0.353578
31    0.280967
25    0.133333
5     0.000000
6     0.000000
7     0.000000
9     0.000000
Name: high_risk, dtype: float64

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from datetime import datetime

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
model.fit(X_train, y_train)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix
# Make predictions with the best model
y_pred = model.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[246975  43423]
 [147452  60746]]
              precision    recall  f1-score   support

           0       0.63      0.85      0.72    290398
           1       0.58      0.29      0.39    208198

    accuracy                           0.62    498596
   macro avg       0.60      0.57      0.56    498596
weighted avg       0.61      0.62      0.58    498596



In [16]:
print("Features used:", X.columns.tolist())

Features used: ['location_cluster', 'longitude', 'latitude', 'month']


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    # Predict labels and probabilities
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    # Print or return
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"AUC-ROC: {auc:.2f}")

In [18]:
# In[20]:

# Evaluate metrics of the best model
evaluate_model(model, X_test, y_test)

Accuracy: 0.62
Precision: 0.58
Recall: 0.29
F1 Score: 0.39
AUC-ROC: 0.63


In [19]:

# def get_high_risk_locations(model, day_name, hour_input, month_input, df, top_n=10):
#     day_map = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
#                'Friday': 4, 'Saturday': 5, 'Sunday': 6}
#     day_num = day_map[day_name]

#     # Prepare input data
#     location_df = df[['location', 'location_cluster', 'latitude', 'longitude']].drop_duplicates()

#     location_df['day_of_week'] = day_num
#     location_df['hour'] = hour_input
#     location_df['month'] = month_input

#     input_features = ['location_cluster', 'longitude', 'latitude', 'month']
#     X_input = location_df[input_features]

#     location_df['risk_probability'] = model.predict_proba(X_input)[:, 1]

#     return location_df.sort_values(by='risk_probability', ascending=False).head(top_n)[
#         ['location', 'latitude', 'longitude', 'risk_probability']
#     ]


In [20]:
# top_locations = get_high_risk_locations(model=model,
#                                         day_name="Tuesday",
#                                         hour_input=17,
#                                         month_input=7,  # July
#                                         df=df2,
#                                         top_n=10)
# top_locations

In [21]:
import joblib
joblib.dump(model, 'model.pkl')


['model.pkl']

In [22]:
df2.to_csv("cleaned_traffic_violations1.csv", index=False)