<h1><center><strong>San Francisco Crime Classification</strong></center></h1>
<h3><center>Predict by Address</center></h3>
<h3>Data Mining | Fall 2023</h3>


Contributors:
- Kevin Reynolds
- Shivani Merchant
- Kyrsti Fitts
- Ryan Espejo

# Data Preprocessing

In [None]:
import warnings
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Hide FutureWarning / UserWarning noise so the cell outputs stay readable
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)

# Column names applied to the training CSV. The file's first row is skipped
# below, so these names take the place of whatever header it carries.
training_attribute_names = [
    "Dates",
    "Category",
    "Descript",
    "DayOfWeek",
    "PdDistrict",
    "Resolution",
    "Address",
    "X",
    "Y",
]
training_data = pd.read_csv(
    'data/train.csv',
    names=training_attribute_names,
    skiprows=1,
)

# Keep only the columns this notebook uses: Address (the single feature)
# and Category (the prediction target)
relevant_data = ["Address", "Category"]
training_data = training_data[relevant_data]
training_data.head()


In [None]:
# Load test data and assign names to attributes (the file's first row is
# skipped so these names replace it)
test_attribute_names = ["Id", "Dates", "DayOfWeek", "PdDistrict", "Address", "X", "Y"]
test_data = pd.read_csv('data/test.csv', skiprows=1, names=test_attribute_names)

# Remove duplicate (Address, Category) rows from the training data only.
# The test data is deliberately left untouched: every test Id needs a
# prediction row in the final output.
# Reassigning (rather than inplace=True) avoids mutating a frame that was
# created by column selection, which can raise SettingWithCopyWarning.
training_data = training_data.drop_duplicates()

# Remove IDs from test data. We will copy them and use them later for final output
test_ids = test_data['Id'].copy()
test_data = test_data[['Address']]

test_data.head()

In [None]:
# Integer-encode the string feature columns of the training and test data.
# Work on copies so the un-encoded frames stay available to later cells.
encoded_training_data = training_data.copy()
encoded_test_data = test_data.copy()

columns_to_encode = ['Address']

for col in columns_to_encode:
    # Fit on train + test together: LabelEncoder.transform raises on labels it
    # has not seen, so values that occur only in the test set must be included.
    combined = pd.concat([encoded_training_data[col], encoded_test_data[col]], axis=0).astype(str)

    # Deliberately NOT called `label_encoder`: that name belongs to the encoder
    # of the Category target, whose classes_ become the output column names.
    # Sharing the name meant re-running this cell silently swapped the encoders.
    feature_encoder = LabelEncoder().fit(combined)

    encoded_training_data[col] = feature_encoder.transform(encoded_training_data[col].astype(str))
    encoded_test_data[col] = feature_encoder.transform(encoded_test_data[col].astype(str))

encoded_training_data.head()


In [None]:
# Preview the first rows of the encoded test features
encoded_test_data.head(n=5)

### Fit Random Forest

In [None]:
# Split the training data into the feature matrix and the target column
X_train = encoded_training_data.drop('Category', axis=1)
y_train = training_data['Category']

# Turn the category names into integer class ids. The fitted encoder is kept
# in `label_encoder` because its classes_ are needed again for the output.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)

In [None]:
# Fixed seed: a random forest bootstraps rows and samples features at random,
# so without it every Restart & Run All produces different predictions.
RANDOM_SEED = 42

# Let it cook
model = RandomForestClassifier(
    max_depth=6,
    min_samples_split=4,
    min_samples_leaf=1,
    random_state=RANDOM_SEED,
)
model.fit(X_train, y_train_encoded)

### Predict with Random Forest

In [None]:
# Per-class probabilities for every test row (despite the variable name these
# are probabilities, not predicted category labels)
predicted_categories_encoded = model.predict_proba(X=encoded_test_data)

## Output

In [None]:
# Create a DataFrame for the predicted probabilities, one column per category.
# The category names come from the encoder that was fitted on the target.
num_categories = len(label_encoder.classes_)
predicted_categories = label_encoder.classes_

# Fail early with a clear message if the probability matrix and the category
# names disagree (e.g. `label_encoder` was rebound by an out-of-order re-run).
assert predicted_categories_encoded.shape[1] == num_categories, (
    "number of probability columns does not match the number of encoded categories"
)
predicted_probabilities_df = pd.DataFrame(predicted_categories_encoded, columns=predicted_categories)

# Insert 'Id' column
predicted_probabilities_df.insert(0, 'Id', test_ids.values)

# Round the probabilities to a reasonable number of decimal places
# NOTE(review): 2 decimals turns every probability below 0.005 into 0.0 and
# rows may no longer sum to 1 - confirm this is acceptable for the scoring
# metric, or keep more decimals.
predicted_probabilities_df = predicted_probabilities_df.round(2)

# Save to CSV. Create the output directory first: to_csv raises if it is
# missing, which would throw away the whole training run on a fresh checkout.
output_path = Path('predictions/address.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
predicted_probabilities_df.to_csv(output_path, index=False, header=True)
