In [1]:
# !conda install lightgbm

In [25]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

from lightgbm import LGBMClassifier

# Seed value, presumably meant to be passed as `random_state=` to randomized calls for reproducibility.
random_state = 1

In [2]:
# Read the cleaned crime records CSV into the working DataFrame.
crime_csv_path = '../Data/crime_data_cleaned.csv'
df = pd.read_csv(crime_csv_path)

In [3]:
# Parse the occurrence timestamps with a single vectorized call rather than an
# element-wise `.apply`, which would invoke the parser once per row and is much
# slower on a large frame.
df['datetime_occ'] = pd.to_datetime(df['datetime_occ'])

In [4]:
# Load the lookup that `replace` consults when relabelling crime descriptions.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default text encoding.
with open('../Data/crime_types.json', encoding='utf-8') as f:
  crime_dict = json.load(f)

def replace(value):
  """Return the first key of `crime_dict` whose entry contains `value`.

  Keys are checked in the dictionary's iteration order, and membership is
  tested with `value in entry`. When no entry contains `value`, NaN is
  returned.
  """
  matching_keys = (group for group, members in crime_dict.items() if value in members)
  return next(matching_keys, np.nan)

In [5]:
# Relabel every crime description with the `crime_dict` key returned by
# `replace` (NaN when no entry contains the description).
df['crm_cd_desc'] = df['crm_cd_desc'].apply(replace)

In [6]:
# Label column to predict and the input columns used to predict it.
target = 'crm_cd_desc'
features = [
  'area_name',
  'vict_age', 'vict_sex', 'vict_descent',
  'lat', 'lon',
  'datetime_occ',
]

# Keep only the records whose occurrence year is not 2023.
occurred_in_2023 = df['datetime_occ'].dt.year == 2023
df = df[~occurred_in_2023]

## If you were in Los Angeles, which crime would you encounter under a specific condition?

We have trained an `LGBMClassifier` model to predict the type of crime that might occur in Los Angeles under specific conditions. The model considers various factors, including location, victim demographics, and time of day.
The accuracy is about `0.79`, measured with a custom top-n accuracy metric.

### Features and Target

The model utilizes the following features to make predictions:

- `area_name`: The name of the area where the crime occurred.
- `vict_age`: The age of the victim.
- `vict_sex`: The sex of the victim.
- `vict_descent`: The descent of the victim.
- `lat`: The latitude of the crime location.
- `lon`: The longitude of the crime location.
- `datetime_occ`: The date and time of the crime.

The `datetime_occ` column will be split into `year`, `month`, `day`, and `hour` columns, and then it will be dropped.

The model aims to predict the `crm_cd_desc`, which is a description of the crime code.

### Model Training

A brief overview of the training process:

**Pipeline**

Creates a pipeline for data preprocessing and model fitting:
- Imputes missing categorical values with the most frequent values and encodes them using one-hot encoding.
- Imputes missing numerical values using KNNImputer.
- Uses LGBMClassifier as the model.

**Grid Search**

- Performs hyperparameter tuning using GridSearchCV to find the best model configuration.

**Final Model Training**

- Trains the model with the best hyperparameters on the entire training set.

**Evaluation**

- Defines a custom top-n accuracy metric.
- Evaluates model performance on the test set using this metric.

**Model Saving**

- Saves the trained model to a file named 'model.joblib'.

### Making a Prediction

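We will select a random sample from the dataset, then load the trained model and predict the probability of each crime category for that sample.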

In [11]:
# Rename the columns positionally: the list must name every column of `df`
# in its current order.
# NOTE(review): presumably these are the column names the saved model expects — confirm.
df.columns = [
  'Date Rptd', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc',
  'Mocodes', 'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Desc', 'Weapon Desc',
  'Status Desc', 'Crm Cd 1', 'LOCATION', 'LAT', 'LON', 'Datetime OCC',
]

# Label and feature names under the renamed columns.
target = 'Crm Cd Desc'
features = ['AREA NAME', 'Vict Age', 'Vict Sex', 'Vict Descent', 'LAT', 'LON', 'Datetime OCC']

In [18]:
# Label vector, copied so later edits do not touch `df`.
y = df[target].copy()

# Replace the raw timestamp with its calendar components. The new columns are
# appended after the remaining features in Year, Month, Day, Hour order.
occurred = df['Datetime OCC'].dt
X = df[features].drop(columns=['Datetime OCC']).assign(
  Year=occurred.year,
  Month=occurred.month,
  Day=occurred.day,
  Hour=occurred.hour,
)

In [19]:
# Draw one record to demonstrate a prediction. Seeding the draw with the
# notebook-level `random_state` returns the same row on every run, so the
# displayed label and the prediction below are reproducible.
X_sample = X.sample(1, random_state=random_state)
# Look up the true label by index label so it stays paired with the sampled row.
y_sample = y.loc[X_sample.index]
y_sample

In [None]:
# Load the previously saved model from disk and display it.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
model_path = '../Model/model.joblib'
model = joblib.load(model_path)
model

In [None]:
# Ask the loaded model for per-class probabilities of the sampled record.
prediction = model.predict_proba(X_sample)

In [None]:
# Pair the first row of predicted probabilities with the model's class labels
# and show them from most to least likely.
class_probabilities = pd.DataFrame(prediction[0], index=model.classes_)
class_probabilities.sort_values(by=0, ascending=False)