In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [26]:
# =============================
# 1. Load the Data from ZIP Files
# =============================

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


# Read the zipped train/test CSVs. Both are read with the same options:
# explicit ZIP decompression and the 'Dates' column parsed as datetimes.
train, test = (
    pd.read_csv(path, compression='zip', parse_dates=['Dates'])
    for path in (
        '/kaggle/input/sf-crime/train.csv.zip',
        '/kaggle/input/sf-crime/test.csv.zip',
    )
)

In [27]:
# =============================
# 2. Data Preprocessing & Feature Engineering
# =============================

def process_data(df, is_train=True):
    """Derive model features from a raw crime DataFrame.

    The input frame is never modified; all work happens on a copy.

    Steps:
      * add 'Hour', 'Month', 'Year' and 'Day' taken from the datetime
        column 'Dates';
      * one-hot encode 'PdDistrict' and 'DayOfWeek' when both are present;
      * add 'Intersection' (1 if 'Address' contains '/', else 0) when an
        'Address' column is present; missing addresses are flagged 0;
      * when ``is_train`` is true and a 'Category' column is present, add
        'CategoryEncoded' produced by a freshly fitted LabelEncoder.

    Parameters:
      df: DataFrame with a datetime-typed 'Dates' column.
      is_train: whether to encode the 'Category' target if it is present.

    Returns:
      ``(processed_df, label_encoder)`` when the target was encoded,
      otherwise just ``processed_df``.
    """
    # Copy first: the column assignments below would otherwise write straight
    # into the caller's DataFrame.
    df = df.copy()

    # Extract time-related features from the Dates column.
    dates = df['Dates'].dt
    df['Hour'] = dates.hour
    df['Month'] = dates.month
    df['Year'] = dates.year
    df['Day'] = dates.day

    # The source columns disappear after encoding, so their presence doubles
    # as the "not yet encoded" check.
    if 'PdDistrict' in df.columns and 'DayOfWeek' in df.columns:
        df = pd.get_dummies(df, columns=['PdDistrict', 'DayOfWeek'])

    # Flag addresses that are intersections (contain '/'). A literal,
    # vectorised match; na=False maps missing addresses to 0 instead of raising.
    if 'Address' in df.columns:
        df['Intersection'] = (
            df['Address'].str.contains('/', regex=False, na=False).astype('int64')
        )

    if is_train and 'Category' in df.columns:
        # Encode the target variable 'Category'.
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        df['CategoryEncoded'] = le.fit_transform(df['Category'])
        return df, le

    return df

# Run feature engineering on the training frame; keep the fitted label encoder
# so encoded predictions can be mapped back to category names later.
train, le = process_data(train, is_train=True)

# Model inputs: coordinates, time-based features and the intersection flag,
# followed by every one-hot column produced for 'PdDistrict' and 'DayOfWeek'.
base_features = ['X', 'Y', 'Hour', 'Month', 'Year', 'Day', 'Intersection']
dummy_features = [
    name for name in train.columns
    if name.startswith(('PdDistrict_', 'DayOfWeek_'))
]
feature_cols = base_features + dummy_features

X, y = train[feature_cols], train['CategoryEncoded']

# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=1234,
)

In [28]:
# =============================
# 3. Modeling
# =============================
#3.1
# ----- Decision Tree Model -----
# fit() returns the estimator itself, so construction and training are chained.
tree_model = DecisionTreeClassifier(random_state=1234, max_depth=10).fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_val)

# Per-class precision/recall/F1 on the validation split, labelled with the
# original category names.
print("\nDecision Tree Classification Report:")
tree_report = classification_report(y_val, y_pred_tree, target_names=le.classes_)
print(tree_report)


Decision Tree Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00       303
                    ASSAULT       0.16      0.17      0.17     15375
                 BAD CHECKS       0.00      0.00      0.00        81
                    BRIBERY       0.00      0.00      0.00        58
                   BURGLARY       0.18      0.02      0.04      7351
         DISORDERLY CONDUCT       0.23      0.04      0.06       864
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       454
              DRUG/NARCOTIC       0.31      0.40      0.35     10794
                DRUNKENNESS       0.00      0.00      0.00       856
               EMBEZZLEMENT       0.00      0.00      0.00       233
                  EXTORTION       0.00      0.00      0.00        51
            FAMILY OFFENSES       0.00      0.00      0.00        98
     FORGERY/COUNTERFEITING       0.18      0.00      0.00      2122
                      FRAUD      

  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# ----- Random Forest Model (Second Tree Model) -----
#3.2
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=1234,
).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)

# Per-class precision/recall/F1 on the validation split, labelled with the
# original category names.
print("\nRandom Forest Classification Report:")
rf_report = classification_report(y_val, y_pred_rf, target_names=le.classes_)
print(rf_report)


Random Forest Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00       303
                    ASSAULT       0.20      0.14      0.16     15375
                 BAD CHECKS       0.00      0.00      0.00        81
                    BRIBERY       0.00      0.00      0.00        58
                   BURGLARY       0.31      0.00      0.01      7351
         DISORDERLY CONDUCT       0.05      0.00      0.01       864
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       454
              DRUG/NARCOTIC       0.32      0.41      0.36     10794
                DRUNKENNESS       0.00      0.00      0.00       856
               EMBEZZLEMENT       0.00      0.00      0.00       233
                  EXTORTION       0.00      0.00      0.00        51
            FAMILY OFFENSES       0.00      0.00      0.00        98
     FORGERY/COUNTERFEITING       0.09      0.00      0.00      2122
                      FRAUD      

  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# ----- Support Vector Machine (SVM) -----
#3.3
# Due to scalability concerns with SVMs, we take a sample from the data.
svm_sample = train.sample(n=10000, random_state=1234)
X_svm = svm_sample[feature_cols]
y_svm = svm_sample['CategoryEncoded']

import numpy as np
from sklearn.metrics import classification_report

# Fit the SVM on the sample. Without these lines `svm_model` and `y_pred_svm`
# are undefined on a fresh kernel, and both this cell and the submission cell
# fail with NameError.
svm_model = SVC(kernel='rbf')
svm_model.fit(X_svm, y_svm)

# NOTE(review): predictions are made on the same rows the model was fitted on,
# so the report below is an in-sample score, not a validation score.
y_pred_svm = svm_model.predict(X_svm)

# Get the unique classes present in the SVM sample; rare categories may be
# absent from a 10,000-row draw.
unique_classes = np.unique(y_svm)
# Get the corresponding target names using the label encoder.
target_names = le.inverse_transform(unique_classes)

# Generate the classification report using only the classes in the SVM sample.
print("\nSVM Classification Report (trained on sample):")
print(classification_report(y_svm, y_pred_svm, labels=unique_classes, target_names=target_names))




SVM Classification Report (trained on sample):
                             precision    recall  f1-score   support

                      ARSON       0.00      0.00      0.00        17
                    ASSAULT       0.07      0.02      0.03       911
                 BAD CHECKS       0.00      0.00      0.00         6
                    BRIBERY       0.00      0.00      0.00         2
                   BURGLARY       0.00      0.00      0.00       418
         DISORDERLY CONDUCT       0.00      0.00      0.00        41
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00        21
              DRUG/NARCOTIC       0.05      0.03      0.04       607
                DRUNKENNESS       0.00      0.00      0.00        40
               EMBEZZLEMENT       0.00      0.00      0.00        14
                  EXTORTION       0.00      0.00      0.00         1
            FAMILY OFFENSES       0.00      0.00      0.00         7
     FORGERY/COUNTERFEITING       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:

def create_submission_file(model, X_test, test_index, label_encoder, filename):
    """
    Given a trained model, the test set and the label encoder,
    create a one-hot encoded submission file with 1 indicating the predicted class.

    The CSV has an 'Id' column followed by one column per entry of
    ``label_encoder.classes_`` (in that order). Each row holds a single 1 in
    the column of its predicted class and 0 elsewhere.

    Parameters:
      model: trained classifier (must have a predict() method)
      X_test: test set features DataFrame
      test_index: index values (or an ID column) for the test set; written
        positionally into the 'Id' column
      label_encoder: fitted LabelEncoder used for the target categories
      filename: file name to save the submission CSV

    Raises:
      ValueError: if a prediction maps to a name that is not in
        ``label_encoder.classes_``, or if the number of predictions does not
        match the length of ``test_index``.
    """
    # Get predictions (class indices) and map them back to class names.
    preds = model.predict(X_test)
    pred_class_names = label_encoder.inverse_transform(preds)

    # Column position of every predicted class name, in classes_ order.
    classes = pd.Index(label_encoder.classes_)
    col_positions = classes.get_indexer(pred_class_names)
    if (col_positions < 0).any():
        raise ValueError("Model predicted a class that is not in label_encoder.classes_.")

    # Build the one-hot matrix in a single vectorised assignment. A per-row
    # label lookup is very slow on large test sets and touches several rows
    # at once when index labels repeat.
    n_rows = len(col_positions)
    one_hot = np.zeros((n_rows, len(classes)), dtype=np.int64)
    one_hot[np.arange(n_rows), col_positions] = 1
    submission = pd.DataFrame(one_hot, columns=classes)

    # Insert the 'Id' column as required. Converting to a plain array avoids
    # index alignment if a Series is passed as test_index.
    submission.insert(0, 'Id', np.asarray(test_index))

    # Save the submission file
    submission.to_csv(filename, index=False)
    print(f"Submission file '{filename}' created successfully.")

In [32]:
# =============================
# 4. Prepare Kaggle Submission for All 3 Models
# =============================

# Reload the zipped test CSV and run it through the same feature pipeline as
# the training data (is_train=False: no target column to encode).
test = process_data(
    pd.read_csv('/kaggle/input/sf-crime/test.csv.zip', compression='zip', parse_dates=['Dates']),
    is_train=False,
)

# Use exactly the columns the models were trained on; the frame's own index
# supplies the submission ids.
X_test, test_index = test[feature_cols], test.index

# Write one submission file per model (Decision Tree, Random Forest, SVM).
shared_args = (X_test, test_index, le)
create_submission_file(tree_model, *shared_args, 'submission_tree.csv')
create_submission_file(rf_model, *shared_args, 'submission_rf.csv')
create_submission_file(svm_model, *shared_args, 'submission_svm.csv')

Submission file 'submission_tree.csv' created successfully.
Submission file 'submission_rf.csv' created successfully.
Submission file 'submission_svm.csv' created successfully.
