# Import Dependencies
- Import common libraries for data analysis, visualization, and modeling.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

# Data Loading and Preview
- Load `sf-crime/train.csv`
- Preview the first rows and summarize missing values

In [9]:

# A. Data Understanding and Preprocessing
# Read the raw incident records from the training CSV.
df = pd.read_csv('sf-crime/train.csv')

# Steps 1-2: print the leading rows, then the per-column count of missing values.
for heading, summary in (
    ('First few rows:', df.head()),
    ('Missing values:', df.isnull().sum()),
):
    print(heading)
    print(summary)

First few rows:
                 Dates        Category                      Descript  \
0  2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1  2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2  2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4  2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
4  Wednesday       PARK            NONE  100 Block of BRODERICK ST   

            X          Y  
0 -122.425892  37.774599  
1 -122.425892  37.774599  
2 -122.424363  37.800414  
3 -122.426995  37.8008

# Data Cleaning
- Parse and clean the date column (drop unparseable records)
- Remove duplicate rows
- Filter out invalid coordinates (keep within San Francisco bounds)
- Print dataset shape after cleaning

In [10]:
# 3. Report the frame's dimensions and per-column dtypes before cleaning
print('Shape:', df.shape)
print(df.dtypes)

# 4. Convert the timestamp column to datetime. Values that cannot be parsed
#    are coerced to NaT, and the rows holding them are discarded.
date_col = 'Dates' if 'Dates' in df.columns else None
if date_col is None:
    print('No date column found')
else:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    invalid_dates = df[date_col].isna().sum()
    if invalid_dates > 0:
        print(f'Invalid dates: {invalid_dates} — dropping them')
        df = df.dropna(subset=[date_col]).copy()

# 5. Remove rows that are exact copies of an earlier row
dup_count = df.duplicated().sum()
print('Duplicate rows:', dup_count)
if dup_count > 0:
    df = df.drop_duplicates().copy()

# 6. Keep only rows whose Y and X values fall inside a generous bounding box
#    around San Francisco (both ends of each range are inclusive)
if {'X', 'Y'}.issubset(df.columns):
    before = len(df)
    lat_ok = df['Y'].between(37.0, 38.0)
    lon_ok = df['X'].between(-123.0, -121.0)
    df = df.loc[lat_ok & lon_ok].copy()
    print(f'Removed {before - len(df)} rows outside SF bounds')

print('Cleaned shape:', df.shape)


Shape: (878049, 9)
Dates          object
Category       object
Descript       object
DayOfWeek      object
PdDistrict     object
Resolution     object
Address        object
X             float64
Y             float64
dtype: object
Duplicate rows: 2323
Removed 67 rows outside SF bounds
Cleaned shape: (875659, 9)


# Feature Engineering and Encoding
- Construct Arrest_Indicator, Year/Month/Day/Hour, Incident_Quarter
- One-hot encode DayOfWeek, PdDistrict, Resolution; label-encode Category if present
- Example filters: create subsets by year (e.g., 2015) and district (e.g., Mission)

In [11]:
# 7. Derived variables: Arrest Indicator, Incident Quarter, calendar parts
if 'Resolution' in df.columns:
    # 1 when the resolution text contains "ARREST" (case-insensitive), else 0;
    # a missing resolution counts as 0.
    mentions_arrest = df['Resolution'].str.contains('ARREST', case=False, na=False)
    df['Arrest_Indicator'] = mentions_arrest.astype(int)

# Calendar parts are only derived once the Dates column holds real datetimes.
if 'Dates' in df.columns and pd.api.types.is_datetime64_any_dtype(df['Dates']):
    stamps = df['Dates'].dt
    # Creates the Year, Month, Day and Hour columns, in that order.
    for part in ('year', 'month', 'day', 'hour'):
        df[part.capitalize()] = getattr(stamps, part)
    df['Incident_Quarter'] = stamps.to_period('Q').astype(str)

# 8. Encode categorical variables (one-hot)
categorical_cols = [col for col in ['DayOfWeek', 'PdDistrict', 'Resolution'] if col in df.columns]
for c in categorical_cols:
    df[c] = df[c].astype('category')

# drop_first=True omits the first level of every encoded column.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Integer-code the target label so it can be used for modeling later.
if 'Category' in df_encoded.columns:
    le = LabelEncoder()
    df_encoded['Category_encoded'] = le.fit_transform(df_encoded['Category'])

print('Encoded shape:', df_encoded.shape)
display(df_encoded.head())

# 9. Optional filter examples (by year and district)
has_year = 'Year' in df_encoded.columns
df_2015 = df_encoded[df_encoded['Year'] == 2015].copy() if has_year else df_encoded.copy()

if 'PdDistrict' in df.columns:
    # The mask is built from df, which still carries the PdDistrict column;
    # df_encoded was derived from df, so the boolean mask is applied to its rows.
    mission_idx = df['PdDistrict'] == 'MISSION'
    df_mission = df_encoded.loc[mission_idx].copy()
else:
    df_mission = df_encoded.copy()

print('Rows in 2015:', len(df_2015))
print('Rows in Mission district:', len(df_mission))



Encoded shape: (875659, 44)


Unnamed: 0,Dates,Category,Descript,Address,X,Y,Arrest_Indicator,Year,Month,Day,...,Resolution_JUVENILE CITED,Resolution_JUVENILE DIVERTED,Resolution_LOCATED,Resolution_NONE,Resolution_NOT PROSECUTED,Resolution_PROSECUTED BY OUTSIDE AGENCY,Resolution_PROSECUTED FOR LESSER OFFENSE,Resolution_PSYCHOPATHIC CASE,Resolution_UNFOUNDED,Category_encoded
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,OAK ST / LAGUNA ST,-122.425892,37.774599,1,2015,5,13,...,False,False,False,False,False,False,False,False,False,37
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,OAK ST / LAGUNA ST,-122.425892,37.774599,1,2015,5,13,...,False,False,False,False,False,False,False,False,False,21
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,VANNESS AV / GREENWICH ST,-122.424363,37.800414,1,2015,5,13,...,False,False,False,False,False,False,False,False,False,21
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,1500 Block of LOMBARD ST,-122.426995,37.800873,0,2015,5,13,...,False,False,False,True,False,False,False,False,False,16
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,100 Block of BRODERICK ST,-122.438738,37.771541,0,2015,5,13,...,False,False,False,True,False,False,False,False,False,16


Rows in 2015: 27522
Rows in Mission district: 119722


# Modeling and Evaluation
- Select features and build train/test sets
- Compare multiple algorithms (Logistic Regression, Decision Tree, Random Forest; include XGBoost if available)
- Use cross-validation with Accuracy and Macro-F1
- Report test Accuracy and Macro-F1, and plot the confusion matrix
- Perform hyperparameter tuning (GridSearchCV)
- Visualize important features and model structure/relationships

In [None]:

# 10. Feature selection for modeling the Category
# Fail early when the integer-coded target from the encoding cell is absent.
if 'Category_encoded' not in df_encoded.columns:
    raise ValueError('Target `Category_encoded` not found; ensure earlier cell ran successfully.')

y = df_encoded['Category_encoded']

# Features are limited to the calendar parts, the coordinates, and the
# DayOfWeek_* / PdDistrict_* indicator columns. Resolution_* indicators and
# Arrest_Indicator are left out on purpose to avoid target leakage; every other
# column (e.g. the Descript and Address text) is simply not selected.
numeric_keep = [c for c in ['Year','Month','Day','Hour','X','Y'] if c in df_encoded.columns]
keep_prefixes = ('DayOfWeek_', 'PdDistrict_')

# Preserve df_encoded's column order in the selected feature list.
X_cols = [
    c for c in df_encoded.columns
    if c in numeric_keep or c.startswith(keep_prefixes)
]

# Sanity: an empty selection means the encoding cell did not produce the expected columns.
if len(X_cols) == 0:
    raise ValueError('No features selected. Check earlier encoding steps.')

X = df_encoded.loc[:, X_cols].copy()
print('Selected features:', len(X_cols))
print('Sample features:', X_cols[:10])

# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print('Train/Test shapes:', X_train.shape, X_test.shape)


Selected features: 21
Sample features: ['X', 'Y', 'Year', 'Month', 'Day', 'Hour', 'DayOfWeek_Monday', 'DayOfWeek_Saturday', 'DayOfWeek_Sunday', 'DayOfWeek_Thursday']
Train/Test shapes: (700527, 21) (175132, 21)


In [None]:

# 11. Cross-validation comparison of baseline models

# Logistic regression is wrapped in a scaling pipeline: the selected features
# mix very different magnitudes (Year, coordinates, 0/1 indicators), which
# slows the solver's convergence on unscaled inputs.
models = {
    'Logistic': make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, n_jobs=None)),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
}

# Optionally include XGBoost if available. A NameError here (the import cell
# failed on xgboost) is caught as well, so the comparison still runs without it.
try:
    models['XGBoost'] = XGBClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softprob', eval_metric='mlogloss', n_jobs=-1, random_state=42
    )
except Exception:
    print('XGBoost not available, skipping.')

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {'acc':'accuracy','f1_macro':'f1_macro'}

cv_results = {}
print('CV (5-fold) results:')
for name, clf in models.items():
    # Avoid nested parallelism: when the estimator already fans out across
    # cores (n_jobs set on the model), run the folds sequentially; otherwise
    # parallelise across folds.
    inner_jobs = getattr(clf, 'n_jobs', None)
    fold_jobs = 1 if inner_jobs not in (None, 1) else -1
    res = cross_validate(clf, X, y, cv=cv, scoring=scoring, n_jobs=fold_jobs, return_train_score=False)
    # Store (mean, std) per metric, keeping only the test_* entries.
    cv_results[name] = {k: (v.mean(), v.std()) for k, v in res.items() if k.startswith('test_')}
    # Report each model as soon as it finishes so that interrupting a long
    # run does not discard the results already computed.
    summary = cv_results[name]
    print(f"{name:12s} | Acc: {summary['test_acc'][0]:.3f} ± {summary['test_acc'][1]:.3f} | F1_macro: {summary['test_f1_macro'][0]:.3f} ± {summary['test_f1_macro'][1]:.3f}")


KeyboardInterrupt: 

In [None]:

# 12. Train/test evaluation and confusion matrix

# Fit every candidate on the training split and score it on the held-out split.
fit_results = {}
for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 scores a class that is never predicted as 0 without
    # emitting an undefined-metric warning for each such class.
    f1m = f1_score(y_test, y_pred, average='macro', zero_division=0)
    fit_results[name] = {'acc': acc, 'f1_macro': f1m, 'y_pred': y_pred}
    print(f"{name} -> Test Accuracy: {acc:.3f}, Macro-F1: {f1m:.3f}")

# Pick best by Macro-F1
best_name = max(fit_results, key=lambda k: fit_results[k]['f1_macro'])
print(f"Best (by Macro-F1): {best_name}")

# Confusion matrix of the best model (rows: true class, columns: predicted class).
y_pred_best = fit_results[best_name]['y_pred']
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(8,6))
sns.heatmap(cm, cmap='Blues', cbar=False)
plt.title(f'Confusion Matrix — {best_name}')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()

print('Classification report for best model:')
print(classification_report(y_test, y_pred_best, zero_division=0))


In [None]:

# 13. Hyperparameter tuning (GridSearchCV)

# Each entry maps a model name to (estimator, parameter grid).
# Logistic regression is tuned inside a scaling pipeline, so its grid keys
# carry the `logisticregression__` step prefix that make_pipeline assigns.
search_spaces = {
    'Logistic': (make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
                 {'logisticregression__C':[0.1,1,3],
                  'logisticregression__penalty':['l2'],
                  'logisticregression__solver':['lbfgs'] }),
    'DecisionTree': (DecisionTreeClassifier(random_state=42),
                     {'max_depth':[None,10,20], 'min_samples_split':[2,10,50]}),
    'RandomForest': (RandomForestClassifier(random_state=42, n_jobs=-1),
                     {'n_estimators':[200,400], 'max_depth':[None,20], 'min_samples_split':[2,10]})
}

best_estimators = {}
for name, (est, grid) in search_spaces.items():
    print(f"Tuning {name} ...")
    # Avoid nested parallelism: if the estimator already uses several cores,
    # evaluate the grid candidates sequentially; otherwise parallelise the search.
    inner_jobs = getattr(est, 'n_jobs', None)
    search_jobs = 1 if inner_jobs not in (None, 1) else -1
    gs = GridSearchCV(est, grid, cv=3, scoring='f1_macro', n_jobs=search_jobs, verbose=0)
    gs.fit(X_train, y_train)
    print('Best params:', gs.best_params_)
    print('Best CV f1_macro:', gs.best_score_)
    # best_estimator_ is the winning configuration refit on the full training split.
    best_estimators[name] = gs.best_estimator_

# Evaluate tuned models
for name, est in best_estimators.items():
    y_pred = est.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps never-predicted classes at a score of 0 without warnings.
    f1m = f1_score(y_test, y_pred, average='macro', zero_division=0)
    print(f"{name} (tuned) -> Test Accuracy: {acc:.3f}, Macro-F1: {f1m:.3f}")


In [None]:

# 14. Visualizations: feature importance and tree structure (if applicable)

# The tuning cell is optional and slow; when it has not been run,
# `best_estimators` does not exist, so fall back to an empty mapping instead
# of failing with a NameError.
try:
    tuned_estimators = best_estimators
except NameError:
    tuned_estimators = {}

# Feature importances for RandomForest (tuned if available)
rf = tuned_estimators.get('RandomForest', models.get('RandomForest'))
# The plot is skipped when the forest exposes no feature_importances_.
if hasattr(rf, 'feature_importances_'):
    importances = rf.feature_importances_
    # Indices of the 15 largest importances, most important first.
    idx = np.argsort(importances)[::-1][:15]
    plt.figure(figsize=(8,5))
    # Reverse once more so the most important feature is drawn at the top of the bar chart.
    plt.barh(range(len(idx)), importances[idx][::-1])
    plt.yticks(range(len(idx)), [X.columns[i] for i in idx][::-1])
    plt.title('Top 15 Feature Importances — RandomForest')
    plt.tight_layout()
    plt.show()

# Plot a shallow Decision Tree for interpretability
clf_tree = tuned_estimators.get('DecisionTree', DecisionTreeClassifier(max_depth=3, random_state=42))
# A freshly constructed fallback tree has no tree_ attribute yet and must be fit first.
if not hasattr(clf_tree, 'tree_'):
    clf_tree.fit(X_train, y_train)
plt.figure(figsize=(12,6))
# max_depth=3 limits what is drawn, even if the tree itself is deeper.
plot_tree(clf_tree, feature_names=X.columns.tolist(), max_depth=3, filled=True, fontsize=6)
plt.title('Decision Tree (depth<=3)')
plt.tight_layout()
plt.show()
