<h1><center><strong>San Francisco Crime Classification</strong></center></h1>
<h3><center>Data Mining | Fall 2023</center></h3>

Contributors:
- Kevin Reynolds
- Shivani Merchant
- Kyrsti Fitts
- Ryan Espejo

# Data Preprocessing

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import warnings

# Silence warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Column names applied to each CSV; the first row of each file is skipped
# and these names are used instead.
training_attribute_names = [
    "Dates", "Category", "Descript", "DayOfWeek", "PdDistrict",
    "Resolution", "Address", "X", "Y",
]
test_attribute_names = [
    "Id", "Dates", "DayOfWeek", "PdDistrict", "Address", "X", "Y",
]

# Read each file and immediately down-sample it to prevent crashing;
# the fixed random_state values make the sampled subsets reproducible.
training_data = pd.read_csv(
    'data/train.csv', skiprows=1, names=training_attribute_names
).sample(n=25000, random_state=24)
test_data = pd.read_csv(
    'data/test.csv', skiprows=1, names=test_attribute_names
).sample(n=10000, random_state=50)

# Set the test IDs aside for the final output and remove them from the
# feature frame; they are not needed for fitting or predicting.
test_ids = test_data['Id'].copy()
test_data = test_data.drop(columns='Id')

# Keep only the columns used for modelling. The explicit .copy() makes the
# result an independent frame, so the column assignments performed on it
# afterwards do not raise SettingWithCopyWarning.
relevant_data = ["Dates", "DayOfWeek", "PdDistrict", "Address", "Category"]
training_data = training_data[relevant_data].copy()

# Derive Year / Month / Hour features from the Dates column of both frames,
# then discard the raw Dates column (all changes are made in place).
for frame in (training_data, test_data):
    timestamps = pd.to_datetime(frame['Dates'])
    frame['Year'] = timestamps.dt.year
    frame['Month'] = timestamps.dt.month
    frame['Hour'] = timestamps.dt.hour
    frame.drop(columns='Dates', inplace=True)

# The X and Y columns are not used as features; drop them from the test frame
test_data.drop(columns=['X', 'Y'], inplace=True)

# One-hot encode categorical data. The training frame additionally encodes
# the target column ('Category'); the test frame has no such column.
feature_columns_to_encode = ['Address', 'DayOfWeek', 'PdDistrict', 'Year']
encoded_training_data = pd.get_dummies(
    training_data,
    columns=['Category'] + feature_columns_to_encode,
    prefix=['Category'] + feature_columns_to_encode,
    sparse=True,
)
encoded_test_data = pd.get_dummies(
    test_data,
    columns=feature_columns_to_encode,
    prefix=feature_columns_to_encode,
    sparse=True,
)

# Match up training data with test data.
# Reindex the *encoded* test frame (not the raw one) so its dummy columns are
# kept and ordered like the training features: columns absent from the test
# sample are filled with 0 and test-only columns are dropped. Target columns
# are excluded with the same startswith('Category') rule used to build X_train.
training_feature_columns = [
    col for col in encoded_training_data.columns if not col.startswith('Category')
]
encoded_test_data = encoded_test_data.reindex(columns=training_feature_columns, fill_value=0)

# Fit and Predict with Random Forest

In [None]:
# Split the encoded frame: the 'Category*' dummy columns describe the target,
# every other column is a model input.
category_columns = [name for name in encoded_training_data.columns if name.startswith('Category')]
X_train = encoded_training_data.drop(columns=category_columns)
# Dummy-column name of each row's category (not passed to the model below)
y_train = encoded_training_data[category_columns].idxmax(axis=1)

# Integer-encode the original category strings to use as training labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(training_data['Category'])

# Fit a random forest with a fixed seed on the encoded labels
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train_encoded)

# Output

In [None]:
# Predict an encoded category label for every row of the test frame
predicted_categories_encoded = model.predict(encoded_test_data)

# Build a one-hot matrix with one column per class known to the label encoder:
# each row gets a single 1 in the column of its predicted label.
num_categories = len(label_encoder.classes_)
num_predictions = len(predicted_categories_encoded)
one_hot_predictions = np.zeros((num_predictions, num_categories), dtype=int)
one_hot_predictions[np.arange(num_predictions), predicted_categories_encoded] = 1

# Assemble the final table: Id first, then one column per category name
final_output = pd.DataFrame(one_hot_predictions, columns=label_encoder.classes_)
final_output.insert(0, 'Id', test_ids.values)

# Cast everything to integers and show the first 200 rows without index or header
final_output = final_output.astype(int)
print(final_output.head(200).to_string(index=False, header=False))