# PFA case prediction of crime category

## Import libraries

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.ticker as ticker
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

## Load data

In [36]:
# Load the datasets.
# Both CSV files live in the same directory; keeping that location in one
# constant means the notebook can be pointed at another copy of the data by
# editing a single line instead of every read_csv call.
from pathlib import Path

DATA_DIR = Path('C:/Users/harun/Desktop/PFAcase')

# The files are semicolon-separated, hence sep=';'.
sf_data = pd.read_csv(DATA_DIR / 'sf_data.csv', sep=';')
sf_districts = pd.read_csv(DATA_DIR / 'sf_districts.csv', sep=';')

In [37]:
# Join the crime records with their districts on the shared 'id' column
# (pandas' default inner join), then discard the 'id' join key.
df = sf_data.merge(sf_districts, on='id').drop(columns=['id'])
df.head(8)

Unnamed: 0,category,description,weekday,date,time,resolution,longitude,latitude,label,district
0,assault,false imprisonment,saturday,03/24/2018,07:00,none,-122.419053,37.758632,violent,mission
1,non-criminal,lost property,wednesday,07/19/2017,12:00,none,-122.419672,37.76505,other,mission
2,non-criminal,"aided case, mental disturbed",friday,10/13/2017,06:45,none,-122.416894,37.784286,other,tenderloin
3,larceny/theft,petty theft from locked auto,sunday,04/22/2018,18:00,none,-122.420691,37.781483,other,tenderloin
4,larceny/theft,petty theft of property,sunday,08/21/2016,10:00,none,-122.417885,37.785438,other,tenderloin
5,larceny/theft,petty theft shoplifting,wednesday,02/24/2016,12:36,"arrest, booked",-122.406521,37.785063,other,tenderloin
6,vandalism,"malicious mischief, vandalism",wednesday,09/13/2017,10:00,none,-122.417145,37.71215,other,sunnydale
7,suspicious occ,suspicious occurrence,tuesday,08/15/2017,12:23,none,-122.409011,37.781134,other,tenderloin


In [38]:
# Summarise every column of the merged frame: non-null count and dtype,
# followed by the overall memory usage.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592854 entries, 0 to 592853
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   category     592854 non-null  object 
 1   description  592854 non-null  object 
 2   weekday      592854 non-null  object 
 3   date         592854 non-null  object 
 4   time         592854 non-null  object 
 5   resolution   592854 non-null  object 
 6   longitude    592854 non-null  float64
 7   latitude     592854 non-null  float64
 8   label        592854 non-null  object 
 9   district     592854 non-null  object 
dtypes: float64(2), object(8)
memory usage: 45.2+ MB


## Information about dataset

- 592,854 instances of recorded crimes
- 10 columns (8-9 features, 1 target which is either label or category)
- I want to predict the `category`, so it is the target and `label` is used as a feature. The reverse setup (predicting `label` with `category` as a feature) would also have been possible and simpler.
- no null values

## Data Preprocessing

- Data cleaning
- Feature Engineering
- Feature encoding (done after the train-test-split to avoid data leakage)
    - Label encode the target variable (category)
    - One hot encode nominal categorical features (district, resolution, weekday, label)

### Transform to python datetimes and make columns for year, month, day, hour and minute

In [39]:
# Parse the 'date' (MM/DD/YYYY) and 'time' (HH:MM) strings once each and read
# the components straight from the parsed values. There is no need to
# round-trip 'time' through datetime.time objects and strings and then parse
# it again for the hour and once more for the minute.
parsed_date = pd.to_datetime(df['date'], format='%m/%d/%Y')
parsed_time = pd.to_datetime(df['time'], format='%H:%M')

# Add year, month, day, hour and minute as numeric feature columns, then drop
# the raw 'date' and 'time' columns they were extracted from. A new frame is
# built (no inplace drops), so the merged frame is not mutated in place.
df = df.assign(
    year=parsed_date.dt.year,
    month=parsed_date.dt.month,
    day=parsed_date.dt.day,
    hour=parsed_time.dt.hour,
    minute=parsed_time.dt.minute,
).drop(columns=['date', 'time'])

df.head(8)

Unnamed: 0,category,description,weekday,resolution,longitude,latitude,label,district,year,month,day,hour,minute
0,assault,false imprisonment,saturday,none,-122.419053,37.758632,violent,mission,2018,3,24,7,0
1,non-criminal,lost property,wednesday,none,-122.419672,37.76505,other,mission,2017,7,19,12,0
2,non-criminal,"aided case, mental disturbed",friday,none,-122.416894,37.784286,other,tenderloin,2017,10,13,6,45
3,larceny/theft,petty theft from locked auto,sunday,none,-122.420691,37.781483,other,tenderloin,2018,4,22,18,0
4,larceny/theft,petty theft of property,sunday,none,-122.417885,37.785438,other,tenderloin,2016,8,21,10,0
5,larceny/theft,petty theft shoplifting,wednesday,"arrest, booked",-122.406521,37.785063,other,tenderloin,2016,2,24,12,36
6,vandalism,"malicious mischief, vandalism",wednesday,none,-122.417145,37.71215,other,sunnydale,2017,9,13,10,0
7,suspicious occ,suspicious occurrence,tuesday,none,-122.409011,37.781134,other,tenderloin,2017,8,15,12,23


## Train Test Split

In [40]:
# Separate the target ('category') from the remaining feature columns.
y = df['category']
X = df.drop(columns=['category'])

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is not stratified, so a very rare category could end
# up only in the test split -- confirm this is acceptable for the label
# encoding that is fitted on y_train later on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# print(X_train.isnull().sum())
# print(X_test.isnull().sum())

### One-hot encoding of nominal features and label encoding of target variable

In [42]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Nominal features to one-hot encode (the target variable 'category' is label
# encoded separately at the end of this cell).
nominal_features = ['district', 'resolution', 'weekday', 'label']

# handle_unknown='ignore' makes transform() emit an all-zero group for a value
# that was not seen during fit instead of raising. The encoder is fitted on
# the training split only, so a rare value could occur exclusively in the
# test split.
ohe = OneHotEncoder(handle_unknown='ignore')

# Fit on the training split only (no information from the test split is
# used), then apply the same fitted encoding to the test split.
X_train_ohe = ohe.fit_transform(X_train[nominal_features])
X_test_ohe = ohe.transform(X_test[nominal_features])
ohe_feature_names = ohe.get_feature_names_out(nominal_features)

# Reset the row labels to 0..n-1. The DataFrames built below come from plain
# arrays and therefore get a default RangeIndex; the axis=1 concat aligns on
# the index, so both sides must share the same labels.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Convert the sparse matrices to dense arrays and then to DataFrames
X_train_ohe_df = pd.DataFrame(X_train_ohe.toarray(), columns=ohe_feature_names)
X_test_ohe_df = pd.DataFrame(X_test_ohe.toarray(), columns=ohe_feature_names)

# Replace the original nominal columns with their one-hot encoded versions
X_train = pd.concat([X_train.drop(columns=nominal_features), X_train_ohe_df], axis=1)
X_test = pd.concat([X_test.drop(columns=nominal_features), X_test_ohe_df], axis=1)

# Label encode the target: fit the encoder on y_train only and reuse the same
# fitted mapping for y_test.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# NOTE: LabelEncoder.transform raises ValueError for a category that never
# occurs in y_train.
y_test_encoded = le.transform(y_test)

In [43]:
# Map each category name to the integer code the fitted LabelEncoder assigns it.
category_codes = le.transform(le.classes_)
category_mapping = {name: code for name, code in zip(le.classes_, category_codes)}
category_mapping

{'arson': 0,
 'assault': 1,
 'bad checks': 2,
 'bribery': 3,
 'burglary': 4,
 'disorderly conduct': 5,
 'driving under the influence': 6,
 'drug/narcotic': 7,
 'drunkenness': 8,
 'embezzlement': 9,
 'extortion': 10,
 'forgery/counterfeiting': 11,
 'fraud': 12,
 'gambling': 13,
 'kidnapping': 14,
 'larceny/theft': 15,
 'liquor laws': 16,
 'loitering': 17,
 'missing person': 18,
 'non-criminal': 19,
 'other offenses': 20,
 'pornography/obscene mat': 21,
 'prostitution': 22,
 'recovered vehicle': 23,
 'robbery': 24,
 'secondary codes': 25,
 'sex offenses, forcible': 26,
 'sex offenses, non forcible': 27,
 'stolen property': 28,
 'suicide': 29,
 'suspicious occ': 30,
 'trespass': 31,
 'vandalism': 32,
 'vehicle theft': 33,
 'warrants': 34,
 'weapon laws': 35}

In [44]:
# check if there is any null value in the training and testing sets
# X_train.isnull().sum()
# X_test.isnull().sum()
# y_train.isnull().sum()
# y_test.isnull().sum()

## TF-IDF Vectorization

In this section, I apply TF-IDF vectorization to the 'description' column of the dataset.

1. **Initialization of TfidfVectorizer**: I set up the TF-IDF vectorizer with specific parameters to control the complexity of the resulting feature space. These parameters include:
    - `stop_words='english'`: Removes common English words that are unlikely to have predictive power.
    - `max_features=100`: Limits the number of features to the top 100 words by term frequency across the corpus, helping to reduce model complexity and potential overfitting.
    - `min_df=5`: Excludes words that appear in fewer than 5 crimes, thus focusing on more relevant terms.
    - `max_df=0.5`: Ignores terms that appear in more than 50% of the crimes, as these terms are too common and unlikely to be useful for differentiation.

2. **Fitting and Transforming Data**: We fit the vectorizer to the 'description' column of the training data and then transform both the training and test datasets. This conversion results in a matrix of TF-IDF features for each dataset.

3. **Creating DataFrames from Matrices**: The sparse matrices obtained from the transformation are converted to dense arrays and then into DataFrames. These DataFrames are subsequently used in the model training and evaluation process.

By applying TF-IDF, I aim to capture the most significant linguistic patterns in the 'description' data, which should enhance the predictive quality of our model while avoiding overfitting through controlled feature selection.


In [46]:
# TF-IDF with a capped vocabulary is used for 'description' to keep the number
# of text features small; a large feature set might contribute to overfitting.
from sklearn.feature_extraction.text import TfidfVectorizer

# At most 100 terms, English stop words removed, terms must occur in at least
# 5 descriptions and in no more than 50% of them.
tfidf = TfidfVectorizer(stop_words='english', max_features=100, min_df=5, max_df=0.5)

# Learn the vocabulary and IDF weights from the training descriptions only,
# then apply the fitted vectorizer to the test descriptions.
train_tfidf_matrix = tfidf.fit_transform(X_train['description'])
test_tfidf_matrix = tfidf.transform(X_test['description'])

# Densify the sparse TF-IDF matrices
X_train_description_tfidf = train_tfidf_matrix.toarray()
X_test_description_tfidf = test_tfidf_matrix.toarray()

# The learned terms become the column names of the TF-IDF features
tfidf_feature_names = tfidf.get_feature_names_out()

# Wrap the TF-IDF features in DataFrames for the training and testing sets
X_train_description_tfidf_df = pd.DataFrame(X_train_description_tfidf, columns=tfidf_feature_names)
X_test_description_tfidf_df = pd.DataFrame(X_test_description_tfidf, columns=tfidf_feature_names)

# Swap the raw 'description' text column for its TF-IDF feature columns
X_train = pd.concat(
    [X_train.drop(columns=['description']), X_train_description_tfidf_df],
    axis=1,
)
X_test = pd.concat(
    [X_test.drop(columns=['description']), X_test_description_tfidf_df],
    axis=1,
)

In [47]:
# Preview the first rows of the training features after the one-hot and
# TF-IDF columns have been added.
X_train.head(8)

Unnamed: 0,longitude,latitude,year,month,day,hour,minute,district_mission,district_sunnydale,district_tenderloin,...,unlawful,use,vandalism,vehicle,vehicles,violation,violence,visit,warrant,weapon
0,-122.408711,37.780971,2010,8,19,18,5,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-122.406521,37.785063,2015,8,13,16,0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-122.4118,37.784089,2011,5,1,21,0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-122.41765,37.754683,2004,5,22,17,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-122.407933,37.781506,2003,1,15,19,42,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0
5,-122.424378,37.768871,2016,7,22,6,30,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,-122.408954,37.783288,2007,1,30,20,52,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-122.416864,37.757946,2009,12,9,23,15,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.748251,0.0


### Model training:
   - Train a classification model.
   - I will explore different algorithms such as RFC, KNN, AdaBoost, DecisionTree and XGBoost to find the best-performing model.

In [48]:
# Metric and candidate model classes for the baseline comparison.
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

# Candidate baseline classifiers. Every entry is currently commented out, so
# this list is empty and a loop over it does nothing until at least one entry
# is re-enabled.
classifiers = [
        # RandomForestClassifier(),
        # KNeighborsClassifier(),
        # AdaBoostClassifier(algorithm='SAMME'),
        # DecisionTreeClassifier()
    ]

In [49]:
# Fit every candidate on the training split and print its test accuracy.
for classifier in classifiers:
    model = classifier
    y_pred = model.fit(X_train, y_train_encoded).predict(X_test)
    print(classifier)
    # For a per-class breakdown: classification_report(y_test_encoded, y_pred)
    print(accuracy_score(y_true=y_test_encoded, y_pred=y_pred))

In [50]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Hyperparameters of the multi-class XGBoost model, collected in one place so
# they are easy to read and adjust.
xgb_params = dict(
    n_estimators=50,
    objective="multi:softprob",
    learning_rate=0.05,
    gamma=1,
    max_depth=5,
    min_child_weight=6,
    reg_alpha=1.0,
    reg_lambda=1.0,
    subsample=0.33,
    colsample_bytree=0.8,
    random_state=1,
    n_jobs=4,
)

# Train on the encoded training labels.
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train_encoded)

# Accuracy on the held-out test split.
# For a per-class breakdown: classification_report(y_test_encoded, xgb_test_predictions)
xgb_test_predictions = xgb.predict(X_test)
print(accuracy_score(y_test_encoded, xgb_test_predictions))

0.9692251899705662


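This accuracy is suspiciously high. It may indicate overfitting, which I will keep investigating. It may also reflect target leakage: the `description` text often appears to name the category directly (e.g. "malicious mischief, vandalism" for `vandalism`), so this should be checked by retraining without `description` and `label`.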

### Hyperparameter tuning

In [51]:
# For finding best hyperparameters. Might take a long time to run, therefore, it is commented out

# from sklearn.model_selection import GridSearchCV

# # Define your XGBClassifier
# xgb = XGBClassifier(random_state=1, n_jobs=4)

# # Define the grid of hyperparameters to search
# param_grid = {
#     'learning_rate': (0.1, 0.2),
#     # 'min_child_weight': (0, 10),  # shadowed: this key is repeated below with (0, 5), and the last value wins in a dict literal
#     'max_depth': (1, 100),
#     'max_delta_step': (0, 20),
#     'subsample': (0.01, 1.0),
#     'min_child_weight': (0, 5),
#     'n_estimators': (50, 100),
# }

# # Setup the grid search
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=2, verbose=2)

# # Perform the grid search on your data
# grid_search.fit(X_train, y_train_encoded)  # use the integer-encoded labels, as in the xgb.fit call above

# # Get the best parameters and score
# best_parameters = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Parameters:", best_parameters)
# print("Best Score:", best_score)

## Model evaluation

In [54]:
# Model evaluation: 5-fold cross-validated accuracy of the XGBoost model on
# the preprocessed training data.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    estimator=xgb,
    X=X_train,
    y=y_train_encoded,
    scoring='accuracy',
    cv=5,
)
print("Cross-validation scores:", scores)

Cross-validation scores: [0.96834182 0.96838399 0.96845778 0.96939572 0.96884752]


In [57]:
# Summarise the cross-validation fold scores.
mean_accuracy = scores.mean()
print('Mean Accuracy: %.3f' % (mean_accuracy))
# The fold-to-fold spread is smaller than 0.001, so three decimals would round
# it to a misleading 0.000; print one more decimal so the spread stays visible.
print('Standard Deviation: %.4f' % (scores.std()))

Mean Accuracy: 0.969
Standard Deviation: 0.000
