<h1><center><strong>San Francisco Crime Classification</strong></center></h1>
<h3><center>Data Mining | Fall 2023</center></h3>

Contributors:
- Kevin Reynolds
- Shivani Merchant
- Kyrsti Fitts
- Ryan Espejo

# Data Preprocessing

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import warnings

# Silence warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Column names for data/train.csv, in file order. The file's own header row is
# skipped (skiprows=1) and replaced by these names.
training_attribute_names = ["Dates", "Category", "Descript", "DayOfWeek", "PdDistrict", "Resolution", "Address", "X", "Y"]

# Columns kept for modelling; "Category" is the prediction target.
relevant_data = ["Dates", "DayOfWeek", "PdDistrict", "Address", "Category"]

# Parse only the columns that are actually used instead of loading all nine and
# discarding four afterwards. `usecols` returns columns in file order, so the
# frame is re-indexed to the order given in `relevant_data`.
training_data = pd.read_csv(
    'data/train.csv',
    skiprows=1,
    names=training_attribute_names,
    usecols=relevant_data,
)[relevant_data]

# Optional safety valve for low-memory machines: set to a row count
# (e.g. 100000) to train on a reproducible random subsample. None = use all rows.
TRAIN_SAMPLE_SIZE = None
if TRAIN_SAMPLE_SIZE is not None:
    training_data = training_data.sample(n=TRAIN_SAMPLE_SIZE, random_state=24)

training_data.head()


In [None]:
# Load the test data and assign names to its attributes
# (the file's own header row is skipped and replaced by these names)
test_attribute_names = ["Id", "Dates", "DayOfWeek", "PdDistrict", "Address", "X", "Y"]
test_data = pd.read_csv('data/test.csv', skiprows=1, names=test_attribute_names)

# Remove duplicate rows from the training data only; the test rows are left
# untouched here.
training_data = training_data.drop_duplicates()

# Pull the Ids out of the test frame and keep them aside for the final output.
# They are not needed for fitting or predicting.
test_ids = test_data.pop('Id')

# Drop the coordinate columns as well
test_data = test_data.drop(columns=['X', 'Y'])

test_data.head()

In [None]:
# Convert the Dates column from strings to datetime so its parts can be extracted
training_data['Dates'] = pd.to_datetime(training_data['Dates'])

# Split the timestamp into Year, Month and Hour features, then drop the raw column
training_data['Year'] = training_data['Dates'].dt.year
training_data['Month'] = training_data['Dates'].dt.month
training_data['Hour'] = training_data['Dates'].dt.hour
training_data = training_data.drop(columns='Dates')

# Map DayOfWeek names to 1 (Monday) .. 7 (Sunday).
# The result is assigned back to the column: calling
# `training_data['DayOfWeek'].replace(..., inplace=True)` operates on a selected
# column and, with pandas Copy-on-Write, leaves the DataFrame unchanged.
# Any value that is not one of the seven names below becomes NaN.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_name_to_number = {name: number for number, name in enumerate(day_names, start=1)}
training_data['DayOfWeek'] = training_data['DayOfWeek'].map(day_name_to_number)

training_data.head()

In [None]:
# Convert the Dates column from strings to datetime so its parts can be extracted
test_data['Dates'] = pd.to_datetime(test_data['Dates'])

# Split the timestamp into Year, Month and Hour features, then drop the raw column
test_data['Year'] = test_data['Dates'].dt.year
test_data['Month'] = test_data['Dates'].dt.month
test_data['Hour'] = test_data['Dates'].dt.hour
test_data = test_data.drop(columns='Dates')

# Map DayOfWeek names to 1 (Monday) .. 7 (Sunday).
# The result is assigned back to the column: calling
# `test_data['DayOfWeek'].replace(..., inplace=True)` operates on a selected
# column and, with pandas Copy-on-Write, leaves the DataFrame unchanged.
# Any value that is not one of the seven names below becomes NaN.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_name_to_number = {name: number for number, name in enumerate(day_names, start=1)}
test_data['DayOfWeek'] = test_data['DayOfWeek'].map(day_name_to_number)

test_data.head()

In [None]:
# Integer-encode the remaining string columns.
# Work on copies so the un-encoded frames stay available to later cells.
encoded_training_data = training_data.copy()
encoded_test_data = test_data.copy()

columns_to_encode = ['Address', 'PdDistrict']

for col in columns_to_encode:
    train_values = encoded_training_data[col].astype(str)
    test_values = encoded_test_data[col].astype(str)

    # Fit on the union of train and test values so that a value appearing only
    # in the test set still has a code at transform time.
    # The encoder gets its own name (`feature_encoder`) so that re-running this
    # cell cannot overwrite the `label_encoder` that holds the target classes.
    feature_encoder = LabelEncoder().fit(pd.concat([train_values, test_values], axis=0))

    encoded_training_data[col] = feature_encoder.transform(train_values)
    encoded_test_data[col] = feature_encoder.transform(test_values)

encoded_training_data.head()


In [None]:
# Preview the first rows of the encoded test frame
encoded_test_data.head()

### Fit Decision Tree

In [None]:
# Target: the crime category as text, taken from the un-encoded training frame
y_train = training_data['Category']

# Features: every encoded column except the target
X_train = encoded_training_data.drop('Category', axis=1)

# Turn the category names into integer class ids; `label_encoder` is kept so the
# ids can be mapped back to names when the output is built
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

In [None]:
# Fit the decision tree on the encoded training features.
# A fixed random_state makes the fitted tree, and therefore the submission,
# reproducible from one run to the next.
model = DecisionTreeClassifier(random_state=24)
model.fit(X_train, y_train_encoded)

### Predict with Decision Tree

In [None]:
# Predict an encoded category id for every test row.
# The train and test frames are preprocessed in separate cells, so the test
# columns are explicitly put in the order the model was fitted on (X_train).
predicted_categories_encoded = model.predict(encoded_test_data[X_train.columns])

## Output

In [None]:
# Build the submission table: one row per test Id and one 0/1 column per crime
# category, with a 1 in the column of the predicted category.
num_categories = len(label_encoder.classes_)
num_predictions = len(predicted_categories_encoded)
one_hot_predictions = np.zeros((num_predictions, num_categories), dtype=int)

# Set element (row i, column predicted_categories_encoded[i]) to 1 for all rows
# in one vectorized assignment instead of a Python loop over every test row.
one_hot_predictions[np.arange(num_predictions), predicted_categories_encoded] = 1

# Column names are the original category names, in encoder order
final_output = pd.DataFrame(one_hot_predictions, columns=label_encoder.classes_)
final_output.insert(0, 'Id', test_ids.values)

# Make sure every column (including Id) is written as an integer
final_output = final_output.astype(int)

final_output.to_csv('data/submission.csv', index=False, header=True)

# Show the first 20 rows without index or header
print(final_output.head(20).to_string(index=False, header=False))

In [None]:
# Percentage of test rows assigned to each category, largest first.
# The first column of final_output is the Id, so it is excluded.
prediction_columns = final_output.iloc[:, 1:]
predicted_percentages = (prediction_columns.sum() / len(final_output)) * 100
sorted_category_percentages_desc = predicted_percentages.sort_values(ascending=False)
print(sorted_category_percentages_desc)

## Training data crime percentages
#### We would expect our predictions to be distributed similarly to these numbers

In [None]:
# Share of each crime category in the training data, as a percentage
category_shares = training_data['Category'].value_counts(normalize=True)
category_counts = category_shares * 100
print(category_counts)

## Crime prediction percentages

In [None]:
# Share of predictions per category (in percent), sorted from most to least
# frequent. Column 0 of final_output holds the Id and is skipped.
per_category_totals = final_output.iloc[:, 1:].sum()
total_rows = len(final_output)
sorted_category_percentages_desc = ((per_category_totals / total_rows) * 100).sort_values(ascending=False)
print(sorted_category_percentages_desc)