In [None]:
#-----------------------------------------
# Title:  Support Vector Machine (SVM) San Francisco Crime Classification Dataset
# Subtitle: DDS-8555, Assignment 8
# Author: Madgene Moise
# Date: Sunday, July 6, 2025
#-----------------------------------------

In [2]:
import pandas as pd

# Read the zipped competition CSVs from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/sf-crime/train.csv.zip")
test_df = pd.read_csv("/kaggle/input/sf-crime/test.csv.zip")

# Keep a five-row preview of each frame for a quick look at the columns.
train_head, test_head = train_df.head(), test_df.head()

# Cell output: both shapes followed by both previews.
(train_df.shape, test_df.shape, train_head, test_head)

((878049, 9),
 (884262, 7),
                  Dates        Category                      Descript  \
 0  2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
 1  2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
 2  2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
 3  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
 4  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
 
    DayOfWeek PdDistrict      Resolution                    Address  \
 0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
 1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
 2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
 3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
 4  Wednesday       PARK            NONE  100 Block of BRODERICK ST   
 
             X          Y  
 0 -122.425892  37.774599  
 1 -122.425892  37.774599  
 2 -122.424363  37.8

The training and test datasets from the San Francisco Crime Classification competition are structured as follows:

Training Set Summary
* Observations: 878,049 rows
* Columns: 9 variables
  * Dates, Category, Descript, DayOfWeek, PdDistrict, Resolution, Address, X, Y
 
Test Set Summary
* Observations: 884,262 rows
* Columns: 7 variables (no Category, Descript, or Resolution)
  * Id, Dates, DayOfWeek, PdDistrict, Address, X, Y

The target variable is Category, which I will classify using a Support Vector Machine (SVM).

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Keep a 10% subsample of the training data (the other 90% is discarded).
# Stratifying on Category preserves each class's share of the rows; it does
# not rebalance the classes.
category_labels = train_df['Category']
train_sample, _ = train_test_split(
    train_df,
    test_size=0.90,
    random_state=42,
    stratify=category_labels,
)

In [4]:
# Extract basic datetime features
def preprocess_dates(df):
    """Parse the ``Dates`` column and add calendar feature columns.

    The frame is modified in place: ``Dates`` is converted with
    ``pd.to_datetime`` and the integer columns ``Hour``, ``Month``, ``Year``,
    ``Day`` and ``Weekday`` are added from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Dates`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    df['Dates'] = pd.to_datetime(df['Dates'])
    timestamp_parts = df['Dates'].dt
    # (new column name, datetime accessor attribute), in insertion order.
    derived_columns = (
        ('Hour', 'hour'),
        ('Month', 'month'),
        ('Year', 'year'),
        ('Day', 'day'),
        ('Weekday', 'weekday'),
    )
    for column_name, accessor_attr in derived_columns:
        df[column_name] = getattr(timestamp_parts, accessor_attr)
    return df

# Add the Hour/Month/Year/Day/Weekday columns to the sampled training frame.
train_sample = preprocess_dates(train_sample)


In [5]:
# Replace the string-valued DayOfWeek and PdDistrict columns with integer
# codes, keeping each fitted encoder so the mapping can be looked up later.
label_encoders = {}
for col in ('DayOfWeek', 'PdDistrict'):
    label_encoders[col] = LabelEncoder()
    train_sample[col] = label_encoders[col].fit_transform(train_sample[col])


In [6]:
# Turn the Category strings into integer class ids; target_encoder keeps the
# id -> name mapping for decoding later.
target_encoder = LabelEncoder()
encoded_category = target_encoder.fit_transform(train_sample['Category'])
train_sample['Category'] = encoded_category

# Feature matrix (temporal, encoded categorical, spatial columns) and target.
X = train_sample.loc[
    :,
    ['Hour', 'Month', 'Year', 'Day', 'Weekday', 'DayOfWeek', 'PdDistrict', 'X', 'Y'],
]
y = train_sample['Category']

(X.shape, y.shape, y.nunique())  # Number of classes in target variable

((87804, 9), (87804,), 39)

I have a stratified 10% sample of the training data with the following characteristics:

* Observations: 87,804

* Features Used: 9
  * Temporal: Hour, Month, Year, Day, Weekday
  * Categorical (encoded): DayOfWeek, PdDistrict
  * Spatial: X, Y

* Target Variable: Category (encoded into 39 unique crime categories) 

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Train and evaluate a linear SVM on the sampled data.
#
# Order matters: the original cell scaled and fitted on X_train / X_val before
# the train/validation split defined them, so it raised NameError on a fresh
# kernel (or silently trained twice with leftover state). Everything is now
# filter -> split -> scale -> fit (once) -> report.

# Filter out rare classes with fewer than 5 occurrences so the stratified
# split below has enough rows per class.
category_counts = train_sample['Category'].value_counts()
valid_categories = category_counts[category_counts >= 5].index
filtered_sample = train_sample[train_sample['Category'].isin(valid_categories)]

# Features and target from the filtered data
X_filtered = filtered_sample[['Hour', 'Month', 'Year', 'Day', 'Weekday', 'DayOfWeek', 'PdDistrict', 'X', 'Y']]
y_filtered = filtered_sample['Category']

# 80/20 stratified train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_filtered, y_filtered, test_size=0.2, stratify=y_filtered, random_state=42
)

# Scale the features for SVM; the scaler is fitted on the training part only
# and then applied to the validation part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train a linear SVM classifier (using linear kernel for interpretability)
svm_clf = SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=42)
svm_clf.fit(X_train_scaled, y_train)
y_pred = svm_clf.predict(X_val_scaled)

# Classification report. Passing `labels` explicitly keeps `target_names`
# aligned even if a class is predicted that is absent from y_val;
# zero_division=0 reports 0 (without warnings) for never-predicted classes.
report_labels = svm_clf.classes_
report = classification_report(
    y_val,
    y_pred,
    labels=report_labels,
    target_names=target_encoder.inverse_transform(report_labels),
    zero_division=0,
)

# Printed (not shown as a repr) so newlines render; trimmed for readability.
print(report[:1500])

**Support Vector Machine (SVM) Classification**
* Model: Linear kernel SVM (C=1.0, one-vs-rest decision function; scikit-learn's `SVC` trains one-vs-one internally)
* Classes Used: A subset of 39 crime categories (excluding those with <5 samples)
* Evaluation: 20% holdout validation set.
* Features Used: Date/time, location (X, Y), district, and encoded day-of-week

**Results**

Precision and recall vary considerably across categories: common crimes such as LARCENY/THEFT, DRUG/NARCOTIC, and VEHICLE THEFT show moderate precision (~0.4 - 0.6) and recall, while rare or ambiguous categories (e.g., TREA, SEX OFFENSES FORCIBLE) typically have low recall due to underrepresentation.

The F1-score is skewed toward dominant categories because SVM struggles with an imbalanced class distribution.

**Assumption Checks**

* Linearity: SVM with a linear kernel assumes the classes are linearly separable in feature space. Crime data is complex, so this assumption is unlikely to hold well.
* Feature Scaling: Addressed using StandardScaler, which is important due to spatial features.
* Class Balance: Still skewed - SVMs are sensitive to imbalanced classes. I need to consider class weights or sampling methods to address this.

In [10]:
# Generate Submission Locally 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC

# Where the submission CSV is written, and how many test rows are scored per
# prediction batch.
BATCH_SIZE = 100_000  # Controls memory usage
OUTPUT_PATH = "/kaggle/working/svm_submission_moisem.csv"

# Preprocess datetime features
def preprocess_dates(df):
    """Convert ``Dates`` to datetimes and derive calendar columns in place.

    Adds ``Hour``, ``Month``, ``Year``, ``Day`` and ``Weekday`` to ``df``
    (all taken from the parsed ``Dates`` values) and returns the same frame.
    """
    parsed = pd.to_datetime(df['Dates'])
    df['Dates'] = parsed

    # Pull each component from the parsed timestamps via the .dt accessor.
    parts = parsed.dt
    df['Hour'] = parts.hour
    df['Month'] = parts.month
    df['Year'] = parts.year
    df['Day'] = parts.day
    df['Weekday'] = parts.weekday
    return df

# Derive the calendar features on the full training and test frames.
train_df, test_df = preprocess_dates(train_df), preprocess_dates(test_df)

# Stratified 1% sample of the training data (the remaining 99% is discarded).
train_sample, _ = train_test_split(
    train_df,
    stratify=train_df['Category'],
    test_size=0.99,
    random_state=42,
)

# Integer-encode the categorical columns. Each encoder is fitted on the train
# sample and the test set together so both share one mapping.
# NOTE: test_df's columns are overwritten in place here, so this cell should
# be run once against freshly loaded data.
label_encoders = {}
for col in ('DayOfWeek', 'PdDistrict'):
    train_values = train_sample[col].astype(str)
    test_values = test_df[col].astype(str)
    le = LabelEncoder().fit(pd.concat([train_values, test_values], axis=0))
    train_sample[col] = le.transform(train_values)
    test_df[col] = le.transform(test_values)
    label_encoders[col] = le

# Integer-encode the target.
target_encoder = LabelEncoder()
train_sample['Category'] = target_encoder.fit_transform(train_sample['Category'])

# Keep only classes that have at least 5 rows in the sample.
category_counts = train_sample['Category'].value_counts()
valid_classes = category_counts[category_counts.ge(5)].index
keep_rows = train_sample['Category'].isin(valid_classes)
train_sample = train_sample[keep_rows]

# Model inputs and target.
features = ['Hour', 'Month', 'Year', 'Day', 'Weekday', 'DayOfWeek', 'PdDistrict', 'X', 'Y']
X = train_sample[features]
y = train_sample['Category']

# Only the 80% training side of this stratified split is used below.
X_train, _, y_train, _ = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise the features, then fit the linear-kernel SVM.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

svm_clf = SVC(kernel='linear', C=1.0, decision_function_shape='ovr')
svm_clf.fit(X_train_scaled, y_train)

# Predict test set in batches
def softmax(x):
    """Row-wise softmax of a 2-D score array.

    Each row's maximum is subtracted before exponentiating so large scores do
    not overflow; every output row is non-negative and sums to 1.
    """
    row_peak = np.max(x, axis=1, keepdims=True)
    exp_shifted = np.exp(x - row_peak)
    row_total = exp_shifted.sum(axis=1, keepdims=True)
    return exp_shifted / row_total

# Score the test set BATCH_SIZE rows at a time to bound memory use. The SVM's
# per-class decision scores are normalised row-wise with softmax to obtain
# values that sum to 1 per row (a normalisation of the scores, not calibrated
# probabilities).
batch_probs = []

for start in range(0, len(test_df), BATCH_SIZE):
    stop = start + BATCH_SIZE
    scaled_batch = scaler.transform(test_df.iloc[start:stop][features])
    batch_probs.append(softmax(svm_clf.decision_function(scaled_batch)))

# Stack the per-batch arrays back into one (n_test_rows, n_model_classes) array.
probs_combined = np.vstack(batch_probs)

# Build the full submission matrix: one probability column per crime category.
#
# The column list is taken from the FULL training data, not from
# target_encoder.classes_: the encoder was fitted on the 1% sample, which may
# not contain the rarest categories, and any category missing there would be
# missing from the submission as well. (The former "add missing columns" loop
# iterated over that same encoder class list, so it could never add anything.)
all_labels = sorted(train_df['Category'].dropna().unique())
label_index_map = {label: i for i, label in enumerate(all_labels)}

# decision_function columns follow svm_clf.classes_ (encoded class ids);
# decode them to category names to find each one's submission column.
predicted_labels = target_encoder.inverse_transform(svm_clf.classes_)
target_columns = [label_index_map[label] for label in predicted_labels]

# Categories the model was not trained on keep probability 0.
full_probs = np.zeros((probs_combined.shape[0], len(all_labels)))
full_probs[:, target_columns] = probs_combined

submission_df = pd.DataFrame(full_probs, columns=all_labels)
# Insert Id by position so the result does not depend on test_df's index.
submission_df.insert(0, 'Id', test_df['Id'].to_numpy())

# Sanity-check the final layout: Id first, then every category in sorted order.
required_columns = ['Id'] + all_labels
assert list(submission_df.columns) == required_columns
assert len(submission_df) == len(test_df)

# Save to CSV
submission_df.to_csv(OUTPUT_PATH, index=False)
print(f"Submission file saved to: {OUTPUT_PATH}")

Submission file saved to: /kaggle/working/svm_submission_moisem.csv
