In [None]:
#-----------------------------------------
# Title:  Decision Tree San Francisco Crime Classification Dataset
# Subtitle: DDS-8555, Assignment 8
# Author: Madgene Moise
# Date: Sunday, July 6, 2025
#-----------------------------------------

In [2]:
import pandas as pd

# Load the training and testing datasets

train_df = pd.read_csv("/kaggle/input/sf-crime/train.csv.zip") 
test_df = pd.read_csv("/kaggle/input/sf-crime/test.csv.zip")

# Display the first few rows of each dataset to understand the structure
train_head = train_df.head()
test_head = test_df.head()

train_df.shape, test_df.shape, train_df.columns, test_df.columns

((878049, 9),
 (884262, 7),
 Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
        'Resolution', 'Address', 'X', 'Y'],
       dtype='object'),
 Index(['Id', 'Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y'], dtype='object'))

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import numpy as np

# Extract datetime features
train_df['Hour'] = pd.to_datetime(train_df['Dates']).dt.hour
train_df['Month'] = pd.to_datetime(train_df['Dates']).dt.month
train_df['Year'] = pd.to_datetime(train_df['Dates']).dt.year

# Select features
features = ['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Hour', 'Month', 'Year']
X = train_df[features]
y = train_df['Category']

# Encode categorical variables
X = X.copy()
le_day = LabelEncoder()
le_district = LabelEncoder()
X['DayOfWeek'] = le_day.fit_transform(X['DayOfWeek'])
X['PdDistrict'] = le_district.fit_transform(X['PdDistrict'])

# Encode the target
le_category = LabelEncoder()
y_encoded = le_category.fit_transform(y)

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Fit Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42, max_depth=20, min_samples_split=10)
dt_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = dt_model.predict(X_val)
report = classification_report(y_val, y_pred, output_dict=True)

# Prepare a readable report
report_df = pd.DataFrame(report).transpose()

report_df.head()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.028986,0.013201,0.018141,303.0
1,0.167275,0.20852,0.185634,15375.0
2,0.076923,0.012346,0.021277,81.0
3,0.0,0.0,0.0,58.0
4,0.122153,0.088287,0.102495,7351.0


The Decision Tree Model was trained using:

* Features: DayOfWeek, PdDistrict, X, Y, Hour, Month, and Year
* Target: Category (crime type)
* Training/Validation Split: 80%/20% stratified split
* Tree Parameters = max_depth=20, min_samples_split=10

Classification Report Highlights:
* The overall accuracy varies substantially by class.
* Some classes, such as LARCENY/THEFT and ASSAULT (frequent crimes), achieve moderate precision and recall.
* Several rare classes (e.g., PORNOGRAPHY/OBSCENE MAT, TREA, etc.) show 0.00 for precision, recall, and F1-score -- indicating no correct predictions.

This is expected for an imbalanced classification problem with many (39 in total) and skewed distributions.

Assumption Checks and Model Behavior:

* No assumptions of normality or linearity are required for Decision Trees.
* However, performance is affected by:
    * Class imbalance (most crimes fall into a few categories)
    * High cardinality in categorical variables (like Address, which I excluded)
    * Noisy geolocation data: some values in Y are known to be erroneous (beyond San Francisco boundaries).

In [6]:
# Feature engineering
def add_time_features(df):
    df['Hour'] = pd.to_datetime(df['Dates']).dt.hour
    df['Month'] = pd.to_datetime(df['Dates']).dt.month
    df['Year'] = pd.to_datetime(df['Dates']).dt.year
    return df

train_df = add_time_features(train_df)
test_df = add_time_features(test_df)

# Sample 1% of the data stratified by Category
_, train_sampled_df = train_test_split(
    train_df,
    test_size=0.01,
    stratify=train_df['Category'],
    random_state=42
)

# Define features and target
features = ['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Hour', 'Month', 'Year']
target = 'Category'

# Label encoders
le_day = LabelEncoder()
le_district = LabelEncoder()
le_category = LabelEncoder()

# Prepare training data
X_train = train_sampled_df[features].copy()
X_train['DayOfWeek'] = le_day.fit_transform(X_train['DayOfWeek'])
X_train['PdDistrict'] = le_district.fit_transform(X_train['PdDistrict'])
y_train = le_category.fit_transform(train_sampled_df[target])

# Prepare test data
X_test = test_df[features].copy()
X_test['DayOfWeek'] = le_day.transform(X_test['DayOfWeek'])
X_test['PdDistrict'] = le_district.transform(X_test['PdDistrict'])

# Train Decision Tree model
clf = DecisionTreeClassifier(random_state=42, max_depth=20, min_samples_split=10)
clf.fit(X_train, y_train)

# Predict in batches
def batch_predict_proba(model, X, batch_size=10000):
    probs = []
    for start in range(0, len(X), batch_size):
        end = min(start + batch_size, len(X))
        batch = X.iloc[start:end]
        batch_probs = model.predict_proba(batch)
        probs.append(batch_probs)
    return np.vstack(probs)

# Run prediction
probs = batch_predict_proba(clf, X_test, batch_size=10000)

# Get all possible categories from the full training data
full_categories = sorted(train_df['Category'].unique())

# Initialize submission with zeros
submission_df = pd.DataFrame(0, index=np.arange(len(test_df)), columns=full_categories)
submission_df.insert(0, 'Id', test_df['Id'])

# Fill predicted probabilities into corresponding columns
seen_classes = le_category.classes_
probs_df = pd.DataFrame(probs, columns=seen_classes)
submission_df.loc[:, seen_classes] = probs_df.values

# Save submission
submission_df.to_csv('/kaggle/working/DecisionTree_submission_moisem.csv', index=False)
print("Submission file 'DecisionTree_submission_moisem.csv' created successfully.")

  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submission_df.loc[:, seen_classes] = probs_df.values
  submissi

Submission file 'DecisionTree_submission_moisem.csv' created successfully.
