<h1><center><strong>San Francisco Crime Classification</strong></center></h1>
<h3><center>Data Mining | Fall 2023</center></h3>

Contributors:
- Kevin Reynolds
- Shivani Merchant
- Kyrsti Fitts
- Ryan Espejo

# Data Preprocessing

In [43]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import warnings

# Silence warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Load training data and assign names to attributes
training_attribute_names = ["Dates", "Category", "Descript", "DayOfWeek", "PdDistrict", "Resolution", "Address", "X", "Y"]
training_data = pd.read_csv('data/train.csv', skiprows=1, names=training_attribute_names)

# Limit data to prevent crashing
# training_data = training_data.sample(n=100000, random_state=24)

# Remove irrelevant columns
relevant_data = ["Dates", "DayOfWeek", "PdDistrict", "Address", "Category"]
training_data = training_data[relevant_data]
training_data.head()


Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,Category
0,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,WARRANTS
1,2015-05-13 23:53:00,Wednesday,NORTHERN,OAK ST / LAGUNA ST,OTHER OFFENSES
2,2015-05-13 23:33:00,Wednesday,NORTHERN,VANNESS AV / GREENWICH ST,OTHER OFFENSES
3,2015-05-13 23:30:00,Wednesday,NORTHERN,1500 Block of LOMBARD ST,LARCENY/THEFT
4,2015-05-13 23:30:00,Wednesday,PARK,100 Block of BRODERICK ST,LARCENY/THEFT


In [44]:
# Load training data and assign names to attributes
test_attribute_names = ["Id", "Dates", "DayOfWeek", "PdDistrict", "Address", "X", "Y"]
test_data = pd.read_csv('data/test.csv', skiprows=1, names=test_attribute_names)

# remove duplicates from train and test
training_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)

# Use a random sample of n rows
# test_data = test_data.sample(n=100000, random_state=50)

# Remove IDs from test data. We will copy them and use them later for final output
# We won't need them for fitting or predicting, so it's best to remove them for now
test_ids = test_data['Id'].copy()
test_data.drop('Id', axis=1, inplace=True)

test_data.drop('X', axis=1, inplace=True)
test_data.drop('Y', axis=1, inplace=True)

test_data.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address
0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV
1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV
2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST
3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST
4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST


In [45]:
# Convert Dates column to datetime
training_data['Dates'] = pd.to_datetime(training_data['Dates'])

# Parse date into Year, Month, Hour
training_data['Year'] = training_data['Dates'].dt.year
training_data['Month'] = training_data['Dates'].dt.month
training_data['Hour'] = training_data['Dates'].dt.hour
training_data.drop('Dates', axis=1, inplace=True)

# Map DayOfWeek to a value between 1-7
training_data['DayOfWeek'].replace(to_replace=['Monday', "Tuesday", 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], value=[i for i in range(1,8)], inplace=True)

# Function to map hours to day parts
def map_hour_to_daypart(hour):
    if 5 <= hour <= 11:  # Morning (5-11)
        return 1
    elif 12 <= hour <= 21:  # Afternoon (12-20)
        return 2
    else:  # Evening and Night (21-4)
        return 3

# Replace Hour with a DayPart (morning=1, afternoon=2, night=3)
training_data['DayPart'] = training_data['Hour'].apply(map_hour_to_daypart)
training_data.drop('Hour', axis=1, inplace=True)

training_data.head()

Unnamed: 0,DayOfWeek,PdDistrict,Address,Category,Year,Month,DayPart
0,3,NORTHERN,OAK ST / LAGUNA ST,WARRANTS,2015,5,3
1,3,NORTHERN,OAK ST / LAGUNA ST,OTHER OFFENSES,2015,5,3
2,3,NORTHERN,VANNESS AV / GREENWICH ST,OTHER OFFENSES,2015,5,3
3,3,NORTHERN,1500 Block of LOMBARD ST,LARCENY/THEFT,2015,5,3
4,3,PARK,100 Block of BRODERICK ST,LARCENY/THEFT,2015,5,3


In [46]:
# Convert Dates column to datetime
test_data['Dates'] = pd.to_datetime(test_data['Dates'])

# Parse date into Year, Month, Hour
test_data['Year'] = test_data['Dates'].dt.year
test_data['Month'] = test_data['Dates'].dt.month
test_data['Hour'] = test_data['Dates'].dt.hour
test_data.drop('Dates', axis=1, inplace=True)

# Map DayOfWeek to a value between 1-7
test_data['DayOfWeek'].replace(to_replace=['Monday', "Tuesday", 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], value=[i for i in range(1,8)], inplace=True)

# Replace Hour with a DayPart (morning=1, afternoon=2, night=3)
test_data['DayPart'] = test_data['Hour'].apply(map_hour_to_daypart)
test_data.drop('Hour', axis=1, inplace=True)

test_data.head()

Unnamed: 0,DayOfWeek,PdDistrict,Address,Year,Month,DayPart
0,7,BAYVIEW,2000 Block of THOMAS AV,2015,5,3
1,7,BAYVIEW,3RD ST / REVERE AV,2015,5,3
2,7,NORTHERN,2000 Block of GOUGH ST,2015,5,3
3,7,INGLESIDE,4700 Block of MISSION ST,2015,5,3
4,7,INGLESIDE,4700 Block of MISSION ST,2015,5,3


In [47]:
# Encode the rest of the columns
encoded_training_data = training_data.copy()
encoded_test_data = test_data.copy()

columns_to_encode = ['Address', 'Year', 'PdDistrict']

for col in columns_to_encode:
    combined = pd.concat([encoded_training_data[col], encoded_test_data[col]], axis=0).astype(str)
    label_encoder = LabelEncoder().fit(combined)
    
    encoded_training_data[col] = label_encoder.transform(encoded_training_data[col].astype(str))
    encoded_test_data[col] = label_encoder.transform(encoded_test_data[col].astype(str))

encoded_training_data.head()


Unnamed: 0,DayOfWeek,PdDistrict,Address,Category,Year,Month,DayPart
0,3,4,20895,WARRANTS,12,5,3
1,3,4,20895,OTHER OFFENSES,12,5,3
2,3,4,24169,OTHER OFFENSES,12,5,3
3,3,4,4418,LARCENY/THEFT,12,5,3
4,3,5,1923,LARCENY/THEFT,12,5,3


In [48]:
encoded_test_data.head()

Unnamed: 0,DayOfWeek,PdDistrict,Address,Year,Month,DayPart
0,7,0,6626,12,5,3
1,7,0,10069,12,5,3
2,7,4,6553,12,5,3
3,7,2,10985,12,5,3
4,7,2,10985,12,5,3


### Fit Random Forest

In [49]:
# Separate targets from training data
X_train = encoded_training_data.drop(columns='Category')
y_train = training_data['Category']

# Encode y_train
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

In [50]:
# Let it cook
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train_encoded)

### Predict with Random Forest

In [51]:
# Predict categories for the test data
predicted_categories_encoded = model.predict(encoded_test_data)

## Output

In [52]:
# Create a DataFrame for the one-hot encoded predictions
num_categories = len(label_encoder.classes_)
one_hot_predictions = np.zeros((len(predicted_categories_encoded), num_categories), dtype=int)

# Populate the appropriate category column with 1s
for index, category in enumerate(predicted_categories_encoded):one_hot_predictions[index, category] = 1

# Create DataFrame for final output
final_output = pd.DataFrame(one_hot_predictions, columns=label_encoder.classes_)
final_output.insert(0, 'Id', test_ids.values)

# Convert to integer type and print first few rows
final_output = final_output.astype(int)
print(final_output.head(20).to_string(index=False, header=False))

 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 4 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 0 0 0

In [53]:
sorted_category_percentages_desc = ((final_output.iloc[:, 1:].sum() / len(final_output)) * 100).sort_values(ascending=False)
print(sorted_category_percentages_desc)

LARCENY/THEFT                  29.921222
OTHER OFFENSES                 15.855142
NON-CRIMINAL                   12.389088
ASSAULT                         7.742841
VEHICLE THEFT                   4.818368
VANDALISM                       4.482156
DRUG/NARCOTIC                   4.249419
BURGLARY                        4.115635
SUSPICIOUS OCC                  2.978981
WARRANTS                        2.978642
MISSING PERSON                  2.442715
ROBBERY                         2.023269
FRAUD                           1.111888
FORGERY/COUNTERFEITING          0.874515
PROSTITUTION                    0.721732
TRESPASS                        0.503810
WEAPON LAWS                     0.409155
SECONDARY CODES                 0.311220
RECOVERED VEHICLE               0.310315
SEX OFFENSES FORCIBLE           0.273788
DISORDERLY CONDUCT              0.231040
DRUNKENNESS                     0.175627
RUNAWAY                         0.168728
STOLEN PROPERTY                 0.141361
ARSON           

## Training data crime percentages
#### We would expect that our predictions would be distributed similar to these numbers

In [54]:
category_counts = training_data['Category'].value_counts(normalize=True) * 100
print(category_counts);

LARCENY/THEFT                  21.238957
OTHER OFFENSES                 14.363537
NON-CRIMINAL                   11.122941
ASSAULT                         8.429685
VANDALISM                       5.384418
VEHICLE THEFT                   5.104409
DRUG/NARCOTIC                   4.894557
WARRANTS                        4.828708
BURGLARY                        4.458235
SUSPICIOUS OCC                  3.807999
ROBBERY                         2.777320
MISSING PERSON                  2.388261
FRAUD                           1.980986
FORGERY/COUNTERFEITING          1.233393
SECONDARY CODES                 1.222316
WEAPON LAWS                     0.932830
TRESPASS                        0.894552
PROSTITUTION                    0.712761
STOLEN PROPERTY                 0.551156
DRUNKENNESS                     0.526294
DISORDERLY CONDUCT              0.525186
SEX OFFENSES FORCIBLE           0.476077
RECOVERED VEHICLE               0.384997
KIDNAPPING                      0.275824
DRIVING UNDER TH

## Crime prediction percentages

In [55]:
sorted_category_percentages_desc = ((final_output.iloc[:, 1:].sum() / len(final_output)) * 100).sort_values(ascending=False)
print(sorted_category_percentages_desc)

LARCENY/THEFT                  29.921222
OTHER OFFENSES                 15.855142
NON-CRIMINAL                   12.389088
ASSAULT                         7.742841
VEHICLE THEFT                   4.818368
VANDALISM                       4.482156
DRUG/NARCOTIC                   4.249419
BURGLARY                        4.115635
SUSPICIOUS OCC                  2.978981
WARRANTS                        2.978642
MISSING PERSON                  2.442715
ROBBERY                         2.023269
FRAUD                           1.111888
FORGERY/COUNTERFEITING          0.874515
PROSTITUTION                    0.721732
TRESPASS                        0.503810
WEAPON LAWS                     0.409155
SECONDARY CODES                 0.311220
RECOVERED VEHICLE               0.310315
SEX OFFENSES FORCIBLE           0.273788
DISORDERLY CONDUCT              0.231040
DRUNKENNESS                     0.175627
RUNAWAY                         0.168728
STOLEN PROPERTY                 0.141361
ARSON           