<h1><center><strong>San Francisco Crime Classification</strong></center></h1>
<h3><center>Data Mining | Fall 2023</center></h3>

Contributors:
- Kevin Reynolds
- Shivani Merchant
- Kyrsti Fitts
- Ryan Espejo

# Data Preprocessing

In [213]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Suppress the two noisy warning categories so cell output stays readable
for ignored_category in (FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=ignored_category)

# Column names applied to the training CSV (the file's own first line is skipped)
training_attribute_names = [
    "Dates", "Category", "Descript", "DayOfWeek", "PdDistrict",
    "Resolution", "Address", "X", "Y",
]

# Columns kept for modelling; "Category" is the target predicted further down
relevant_data = ["Dates", "PdDistrict", "Address", "Category"]

# Read the training file and keep only the relevant columns, in the order listed above
training_data = pd.read_csv(
    'data/train.csv', skiprows=1, names=training_attribute_names
)[relevant_data]
training_data.head()


Unnamed: 0,Dates,PdDistrict,Address,Category
0,2015-05-13 23:53:00,NORTHERN,OAK ST / LAGUNA ST,WARRANTS
1,2015-05-13 23:53:00,NORTHERN,OAK ST / LAGUNA ST,OTHER OFFENSES
2,2015-05-13 23:33:00,NORTHERN,VANNESS AV / GREENWICH ST,OTHER OFFENSES
3,2015-05-13 23:30:00,NORTHERN,1500 Block of LOMBARD ST,LARCENY/THEFT
4,2015-05-13 23:30:00,PARK,100 Block of BRODERICK ST,LARCENY/THEFT


In [214]:
# Load the test data and assign names to attributes
test_attribute_names = ["Id", "Dates", "DayOfWeek", "PdDistrict", "Address", "X", "Y"]
test_data = pd.read_csv('data/test.csv', names=test_attribute_names, skiprows=1)

# Remove duplicate rows from the training data only; every test row is kept
training_data = training_data.drop_duplicates()

# Set the IDs aside before dropping them; they are needed again for the final output
test_ids = test_data['Id'].copy()

# Drop the columns that are not used as features
test_data = test_data.drop(columns=['Id', 'DayOfWeek', 'X', 'Y'])

test_data.head()

Unnamed: 0,Dates,PdDistrict,Address
0,2015-05-10 23:59:00,BAYVIEW,2000 Block of THOMAS AV
1,2015-05-10 23:51:00,BAYVIEW,3RD ST / REVERE AV
2,2015-05-10 23:50:00,NORTHERN,2000 Block of GOUGH ST
3,2015-05-10 23:45:00,INGLESIDE,4700 Block of MISSION ST
4,2015-05-10 23:45:00,INGLESIDE,4700 Block of MISSION ST


In [215]:
# Parse the timestamps, keep only the hour of day as a feature, then discard the raw column
training_timestamps = pd.to_datetime(training_data['Dates'])
training_data = (
    training_data
    .assign(Hour=training_timestamps.dt.hour)
    .drop(columns='Dates')
)

training_data.head()

Unnamed: 0,PdDistrict,Address,Category,Hour
0,NORTHERN,OAK ST / LAGUNA ST,WARRANTS,23
1,NORTHERN,OAK ST / LAGUNA ST,OTHER OFFENSES,23
2,NORTHERN,VANNESS AV / GREENWICH ST,OTHER OFFENSES,23
3,NORTHERN,1500 Block of LOMBARD ST,LARCENY/THEFT,23
4,PARK,100 Block of BRODERICK ST,LARCENY/THEFT,23


In [216]:
# Parse the timestamps, keep only the hour of day as a feature, then discard the raw column
test_timestamps = pd.to_datetime(test_data['Dates'])
test_data = (
    test_data
    .assign(Hour=test_timestamps.dt.hour)
    .drop(columns='Dates')
)

test_data.head()

Unnamed: 0,PdDistrict,Address,Hour
0,BAYVIEW,2000 Block of THOMAS AV,23
1,BAYVIEW,3RD ST / REVERE AV,23
2,NORTHERN,2000 Block of GOUGH ST,23
3,INGLESIDE,4700 Block of MISSION ST,23
4,INGLESIDE,4700 Block of MISSION ST,23


In [217]:
# Encode the remaining string columns as integer codes.
# Work on copies so the un-encoded frames stay available to later cells.
encoded_training_data = training_data.copy()
encoded_test_data = test_data.copy()

columns_to_encode = ['Address', 'PdDistrict']

for col in columns_to_encode:
    # Cast once and reuse: the same string view is needed for fitting and transforming
    train_values = encoded_training_data[col].astype(str)
    test_values = encoded_test_data[col].astype(str)

    # Fit on train and test together so that a value appearing only in the test set
    # still receives a code; transform() raises on values it has not seen.
    # Named feature_encoder (not label_encoder) so re-running this cell cannot
    # overwrite the target encoder whose classes_ name the output columns.
    feature_encoder = LabelEncoder().fit(pd.concat([train_values, test_values], axis=0))

    encoded_training_data[col] = feature_encoder.transform(train_values)
    encoded_test_data[col] = feature_encoder.transform(test_values)

encoded_training_data.head()


Unnamed: 0,PdDistrict,Address,Category,Hour
0,4,20895,WARRANTS,23
1,4,20895,OTHER OFFENSES,23
2,4,24169,OTHER OFFENSES,23
3,4,4418,LARCENY/THEFT,23
4,5,1923,LARCENY/THEFT,23


In [218]:
# Preview the first five encoded test rows
encoded_test_data.head(n=5)

Unnamed: 0,PdDistrict,Address,Hour
0,0,6626,23
1,0,10069,23
2,4,6553,23
3,2,10985,23
4,2,10985,23


### Fit Decision Tree

In [219]:
# Features are every encoded column except the target; the target is the raw category name
X_train = encoded_training_data.drop('Category', axis=1)
y_train = training_data['Category']

# Map category names to integer class labels; label_encoder is kept so the
# output cell can recover the names from label_encoder.classes_
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

In [220]:
# Fit a depth-limited decision tree on the encoded features.
# random_state is pinned because the tree permutes the features at every split,
# so equally good splits could otherwise be picked differently from run to run
# and the saved predictions would not be reproducible.
model = DecisionTreeClassifier(max_depth=8, random_state=42)
model.fit(X_train, y_train_encoded)

### Predict with Decision Tree

In [221]:
# Predict per-category probabilities for the test data (despite the variable's name,
# this holds probabilities from predict_proba, not encoded category labels).
# Columns are selected in the training order so the features line up with what the
# model was fitted on even if the test frame's column order ever changes.
predicted_categories_encoded = model.predict_proba(encoded_test_data[X_train.columns])

## Output

In [222]:
# Build the submission table: one probability column per category, named after
# the classes the target encoder learned
num_categories = len(label_encoder.classes_)
predicted_categories = label_encoder.classes_
predicted_probabilities_df = pd.DataFrame(predicted_categories_encoded, columns=predicted_categories)

# Put the test IDs set aside earlier back in as the first column
predicted_probabilities_df.insert(0, 'Id', test_ids.values)

# Round the probabilities to two decimals
# NOTE(review): rounding turns any probability below 0.005 into exactly 0 — confirm
# that is acceptable for however these predictions are scored.
predicted_probabilities_df = predicted_probabilities_df.round(2)

# Save to CSV. to_csv does not create missing directories, so make sure the
# output folder exists first instead of failing after all the work above.
output_path = Path('predictions/general.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
predicted_probabilities_df.to_csv(output_path, index=False, header=True)