In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Competition_Dataset.csv")

# Show the first few rows
df.head()


Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,Latitude (Y),Longitude (X)
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [3]:
# Check column types and non-null counts
df.info()

# Check for missing values
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680826 entries, 0 to 680825
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Dates          680826 non-null  object 
 1   Category       680826 non-null  object 
 2   Descript       680826 non-null  object 
 3   DayOfWeek      680826 non-null  object 
 4   PdDistrict     680826 non-null  object 
 5   Resolution     680826 non-null  object 
 6   Address        680826 non-null  object 
 7   Latitude (Y)   680826 non-null  float64
 8   Longitude (X)  680826 non-null  float64
dtypes: float64(2), object(7)
memory usage: 46.7+ MB


Dates            0
Category         0
Descript         0
DayOfWeek        0
PdDistrict       0
Resolution       0
Address          0
Latitude (Y)     0
Longitude (X)    0
dtype: int64

In [5]:
# How many unique crime categories?
df["Category"].value_counts()

# What are the most frequent police districts?
df["PdDistrict"].value_counts()

# What are the top crime descriptions?
df["Descript"].value_counts().head(10)


Descript
GRAND THEFT FROM LOCKED AUTO                 53984
ILLEGAL SUBSTANCES                           43090
LOST PROPERTY                                28547
DRIVERS LICENSE, SUSPENDED OR REVOKED        24173
STOLEN AUTOMOBILE                            24104
WARRANT ARREST                               21412
SUSPICIOUS OCCURRENCE                        19733
AIDED CASE, MENTAL DISTURBED                 19452
PETTY THEFT FROM LOCKED AUTO                 17768
MALICIOUS MISCHIEF, VANDALISM OF VEHICLES    16066
Name: count, dtype: int64

In [7]:
# Show the first few date values
df["Dates"].head()


0    2015-05-13 23:53:00
1    2015-05-13 23:53:00
2    2015-05-13 23:33:00
3    2015-05-13 23:30:00
4    2015-05-13 23:30:00
Name: Dates, dtype: object

In [9]:
# Convert Dates column to datetime format
df["Dates"] = pd.to_datetime(df["Dates"])

# Extract date-related features
df["Hour"] = df["Dates"].dt.hour
df["Month"] = df["Dates"].dt.month
df["Year"] = df["Dates"].dt.year

# Check the updated dataframe
df[["Dates", "Hour", "Month", "Year"]].head()


Unnamed: 0,Dates,Hour,Month,Year
0,2015-05-13 23:53:00,23,5,2015
1,2015-05-13 23:53:00,23,5,2015
2,2015-05-13 23:33:00,23,5,2015
3,2015-05-13 23:30:00,23,5,2015
4,2015-05-13 23:30:00,23,5,2015


In [11]:
# Drop columns we don’t need for now
df_cleaned = df.drop(columns=["Dates", "Descript", "Resolution", "Address"])

# Show the cleaned DataFrame
df_cleaned.head()


Unnamed: 0,Category,DayOfWeek,PdDistrict,Latitude (Y),Longitude (X),Hour,Month,Year
0,WARRANTS,Wednesday,NORTHERN,-122.425892,37.774599,23,5,2015
1,OTHER OFFENSES,Wednesday,NORTHERN,-122.425892,37.774599,23,5,2015
2,OTHER OFFENSES,Wednesday,NORTHERN,-122.424363,37.800414,23,5,2015
3,LARCENY/THEFT,Wednesday,NORTHERN,-122.426995,37.800873,23,5,2015
4,LARCENY/THEFT,Wednesday,PARK,-122.438738,37.771541,23,5,2015


In [13]:
# One-hot encode DayOfWeek and PdDistrict
df_encoded = pd.get_dummies(df_cleaned, columns=["DayOfWeek", "PdDistrict"])

# Show encoded dataset
df_encoded.head()


Unnamed: 0,Category,Latitude (Y),Longitude (X),Hour,Month,Year,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,WARRANTS,-122.425892,37.774599,23,5,2015,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,OTHER OFFENSES,-122.425892,37.774599,23,5,2015,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,OTHER OFFENSES,-122.424363,37.800414,23,5,2015,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,LARCENY/THEFT,-122.426995,37.800873,23,5,2015,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,LARCENY/THEFT,-122.438738,37.771541,23,5,2015,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [15]:
from sklearn.preprocessing import LabelEncoder

# Create encoder and apply to Category column
label_encoder = LabelEncoder()
df_encoded["CategoryEncoded"] = label_encoder.fit_transform(df_encoded["Category"])

# Optional: check encoded values
df_encoded[["Category", "CategoryEncoded"]].head()


Unnamed: 0,Category,CategoryEncoded
0,WARRANTS,23
1,OTHER OFFENSES,14
2,OTHER OFFENSES,14
3,LARCENY/THEFT,11
4,LARCENY/THEFT,11


In [19]:
# Drop Category and CategoryEncoded from X to avoid using the target
X = df_encoded.drop(columns=["Category", "CategoryEncoded"])

# Target column
y = df_encoded["CategoryEncoded"]

from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.3120235594788714

Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.03      0.05       278
           1       0.00      0.00      0.00        73
           2       0.00      0.00      0.00        51
           3       0.16      0.08      0.11      6617
           4       0.07      0.04      0.06       780
           5       0.35      0.40      0.38      8961
           6       0.04      0.01      0.02       207
           7       0.06      0.02      0.03        45
           8       0.16      0.08      0.11      1761
           9       0.09      0.05      0.07      3008
          10       0.07      0.03      0.04       418
          11       0.39      0.60      0.47     31448
          12       0.58      0.47      0.52      4668
          13       0.24      0.20      0.22     16641
          14       0.27      0.34      0.30     22171
          15       0.09      0.02      0.03       565
          16       0.13    

In [34]:
# Find the top 5 crime categories
top5 = df_encoded["Category"].value_counts().head(5).index.tolist()
print("Top 5 Categories:", top5)

# Filter to only top 5 rows
df_top5 = df_encoded[df_encoded["Category"].isin(top5)]

# Reset index (not required but keeps it clean)
df_top5 = df_top5.reset_index(drop=True)

from sklearn.preprocessing import LabelEncoder

# Encode Category column again
label_encoder_top5 = LabelEncoder()
df_top5["CategoryEncoded"] = label_encoder_top5.fit_transform(df_top5["Category"])

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Features and target
X = df_top5.drop(columns=["Category", "CategoryEncoded"])
y = df_top5["CategoryEncoded"]

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train
xgb_model = XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = xgb_model.predict(X_test)
print("Accuracy on Top 5 Classes:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))



Top 5 Categories: ['LARCENY/THEFT', 'OTHER OFFENSES', 'NON-CRIMINAL', 'VEHICLE THEFT', 'DRUG/NARCOTIC']
Accuracy on Top 5 Classes: 0.47633565458043176

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.42      0.46      8961
           1       0.51      0.75      0.61     31448
           2       0.45      0.18      0.25     16641
           3       0.43      0.43      0.43     22171
           4       0.43      0.25      0.32      9668

    accuracy                           0.48     88889
   macro avg       0.46      0.41      0.41     88889
weighted avg       0.47      0.48      0.45     88889



In [40]:
# Severity categories from the challenge instructions
severity_1 = [
    "NON-CRIMINAL", "SUSPICIOUS OCCURRENCE", "MISSING PERSON", "RUNAWAY", "RECOVERED VEHICLE"
]

severity_2 = [
    "WARRANTS", "OTHER OFFENSES", "VANDALISM", "TRESPASS", "DISORDERLY CONDUCT", "BAD CHECKS"
]

severity_3 = [
    "LARCENY/THEFT", "VEHICLE THEFT", "FORGERY/COUNTERFEITING", "DRUG/NARCOTIC",
    "STOLEN PROPERTY", "FRAUD", "BRIBERY", "EMBEZZLEMENT"
]

severity_4 = [
    "ROBBERY", "WEAPON LAWS", "BURGLARY", "EXTORTION"
]

severity_5 = [
    "KIDNAPPING", "ARSON"
]

def assign_severity(category):
    if category in severity_1:
        return 1
    elif category in severity_2:
        return 2
    elif category in severity_3:
        return 3
    elif category in severity_4:
        return 4
    elif category in severity_5:
        return 5
    else:
        return 0  # Unknown or unlisted category

df_encoded["Severity"] = df_encoded["Category"].apply(assign_severity)
df_encoded[["Category", "Severity"]].head()

df_top5["Severity"] = df_top5["Category"].apply(assign_severity)
df_top5[["Category", "Severity"]].head()



Unnamed: 0,Category,Severity
0,OTHER OFFENSES,2
1,OTHER OFFENSES,2
2,LARCENY/THEFT,3
3,LARCENY/THEFT,3
4,LARCENY/THEFT,3


In [44]:
# Drop the columns we don't want to use as input
X = df_top5.drop(columns=["Category", "CategoryEncoded", "Severity"])

# Target column is Severity
y = df_top5["Severity"] - 1

from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train the model
severity_model = XGBClassifier(eval_metric='mlogloss')
severity_model.fit(X_train, y_train)

# Predict
y_pred = severity_model.predict(X_test)

# Evaluate
print("Severity Prediction Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


Severity Prediction Accuracy: 0.5917942602571746

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.11      0.18     16641
           1       0.52      0.22      0.30     22172
           2       0.60      0.92      0.73     50076

    accuracy                           0.59     88889
   macro avg       0.55      0.41      0.40     88889
weighted avg       0.57      0.59      0.52     88889



In [48]:
!pip install folium

import folium

def get_severity_color(severity):
    if severity == 0:
        return 'blue'
    elif severity == 1:
        return 'green'
    elif severity == 2:
        return 'red'
    else:
        return 'gray'

# Sample 500
sample = df_top5.sample(500, random_state=42)

m = folium.Map(location=[37.77, -122.42], zoom_start=12)

for _, row in sample.iterrows():
    folium.CircleMarker(
        location=[row["Latitude (Y)"], row["Longitude (X)"]],
        radius=4,
        popup=f"{row['Category']} (Severity {row['Severity']})",
        color=get_severity_color(row["Severity"]),
        fill=True,
        fill_color=get_severity_color(row["Severity"]),
        fill_opacity=0.7
    ).add_to(m)

m


