In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Imported data
file_path = "LMPD_Data.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,INCIDENT_NUMBER,DATE_OCCURED,UOR_DESC,PREMISE_TYPE,OFFICER_ATTACKED
0,80-18-101412,12/29/2018,ENDANGERING THE WELFARE OF A MINOR,HIGHWAY / ROAD / ALLEY,0
1,80-18-101737,12/30/2018,WANTON ENDANGERMENT-2ND DEGREE,HIGHWAY / ROAD / ALLEY,0
2,80-18-990001,10/19/2018,WANTON ENDANGERMENT-1ST DEGREE,RESIDENCE / HOME,0
3,80-19-000003,1/1/2019,ASSAULT - 1ST DEGREE,RESIDENCE / HOME,0
4,80-19-000004,1/1/2019,WANTON ENDANGERMENT-1ST DEGREE,RESIDENCE / HOME,0


In [3]:
# Dropped irrelevant columns for machine learning model
ml_df = df.drop(columns=["INCIDENT_NUMBER", "DATE_OCCURED"])
ml_df.head()

Unnamed: 0,UOR_DESC,PREMISE_TYPE,OFFICER_ATTACKED
0,ENDANGERING THE WELFARE OF A MINOR,HIGHWAY / ROAD / ALLEY,0
1,WANTON ENDANGERMENT-2ND DEGREE,HIGHWAY / ROAD / ALLEY,0
2,WANTON ENDANGERMENT-1ST DEGREE,RESIDENCE / HOME,0
3,ASSAULT - 1ST DEGREE,RESIDENCE / HOME,0
4,WANTON ENDANGERMENT-1ST DEGREE,RESIDENCE / HOME,0


In [4]:
# Converted string values to binary values
assaults_df = pd.get_dummies(ml_df, columns=['UOR_DESC', 'PREMISE_TYPE'])
assaults_df.head()

Unnamed: 0,OFFICER_ATTACKED,UOR_DESC_ABANDONMENT OF MINOR,UOR_DESC_ASSAULT - 1ST DEGREE,UOR_DESC_ASSAULT - 1ST DEGREE - DOMESTIC VIOLENCE,UOR_DESC_ASSAULT - 1ST DEGREE - POLICE OFFICER,UOR_DESC_ASSAULT - 2ND DEGREE,UOR_DESC_ASSAULT - 2ND DEGREE - DOMESTIC VIOLENCE,UOR_DESC_ASSAULT - 2ND DEGREE - POLICE OFFICER,UOR_DESC_ASSAULT - 3RD DEGREE (OFFICER TRANSPORTING INMATES),UOR_DESC_ASSAULT - 3RD DEGREE (SCHOOL EMPLOYEE OR SCHOOL VOLUNTEER),...,PREMISE_TYPE_PARK / PLAYGROUND,PREMISE_TYPE_PARKING LOT / GARAGE,PREMISE_TYPE_RACE TRACK/GAMBLING FACILITY,PREMISE_TYPE_RENTAL / STORAGE FACILITY,PREMISE_TYPE_RESIDENCE / HOME,PREMISE_TYPE_RESTAURANT,PREMISE_TYPE_SCHOOL - COLLEGE / UNIVERSITY,PREMISE_TYPE_SCHOOL - ELEMENTARY / SECONDARY,PREMISE_TYPE_SERVICE / GAS STATION,"PREMISE_TYPE_SPECIALTY STORE (TV, FUR, ETC)"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [5]:
# Define the features set.
X = assaults_df.copy()
X = X.drop(columns = ["OFFICER_ATTACKED"])
X.head()

Unnamed: 0,UOR_DESC_ABANDONMENT OF MINOR,UOR_DESC_ASSAULT - 1ST DEGREE,UOR_DESC_ASSAULT - 1ST DEGREE - DOMESTIC VIOLENCE,UOR_DESC_ASSAULT - 1ST DEGREE - POLICE OFFICER,UOR_DESC_ASSAULT - 2ND DEGREE,UOR_DESC_ASSAULT - 2ND DEGREE - DOMESTIC VIOLENCE,UOR_DESC_ASSAULT - 2ND DEGREE - POLICE OFFICER,UOR_DESC_ASSAULT - 3RD DEGREE (OFFICER TRANSPORTING INMATES),UOR_DESC_ASSAULT - 3RD DEGREE (SCHOOL EMPLOYEE OR SCHOOL VOLUNTEER),UOR_DESC_ASSAULT - 4TH DEGREE (CHILD ABUSE),...,PREMISE_TYPE_PARK / PLAYGROUND,PREMISE_TYPE_PARKING LOT / GARAGE,PREMISE_TYPE_RACE TRACK/GAMBLING FACILITY,PREMISE_TYPE_RENTAL / STORAGE FACILITY,PREMISE_TYPE_RESIDENCE / HOME,PREMISE_TYPE_RESTAURANT,PREMISE_TYPE_SCHOOL - COLLEGE / UNIVERSITY,PREMISE_TYPE_SCHOOL - ELEMENTARY / SECONDARY,PREMISE_TYPE_SERVICE / GAS STATION,"PREMISE_TYPE_SPECIALTY STORE (TV, FUR, ETC)"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [6]:
# Define the target set.
y = assaults_df["OFFICER_ATTACKED"]
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: OFFICER_ATTACKED, dtype: int64

In [7]:
# Check the balance of our target values
y.value_counts()

0    31815
1      954
Name: OFFICER_ATTACKED, dtype: int64

In [8]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [9]:
# Resample the training data with SMOTE
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 23844, 1: 23844})

In [10]:
# Train the Logistic Regression model using the resampled data
ml_model = LogisticRegression(solver='lbfgs', random_state=1)
ml_model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [11]:
# Calculated the balanced accuracy score
y_pred = ml_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9003787377893513

In [12]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Officer_Not_Assaulted", "Officer_Assaulted"], columns=["Predicted_Not_Assaulted", "Predicted_Assaulted"])
cm_df

Unnamed: 0,Predicted_Not_Assaulted,Predicted_Assaulted
Officer_Not_Assaulted,7460,511
Officer_Assaulted,30,192


In [13]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.94      0.86      0.97      0.90      0.82      7971
          1       0.27      0.86      0.94      0.42      0.90      0.80       222

avg / total       0.98      0.93      0.87      0.95      0.90      0.81      8193



In [14]:
sorted(zip(ml_model.coef_[0], X.columns), reverse=True)

[(9.937690629224331,
  'UOR_DESC_ASSAULT-3RD DEGREE-POLICE/PROBATION OFFICER-IDENTIFY WEAPON'),
 (9.091383665324548, 'UOR_DESC_WANTON ENDANGERMENT-1ST DEGREE-POLICE OFFICER'),
 (7.278257775265721, 'UOR_DESC_MENACING'),
 (7.115890944528359,
  'UOR_DESC_ASSAULT 3RD - PEACE OFFICER -  COMMUNICABLE BODILY FLUID'),
 (6.672442668229705,
  'UOR_DESC_ASSAULT 3RD - PEACE OFFICER - NON COMMUNICABLE BODILY FLUID'),
 (6.529842723515735, 'UOR_DESC_OBSTRUCTING GOVERNMENTAL OPERATIONS'),
 (6.235035842047279, 'UOR_DESC_WANTON ENDANGERMENT-2ND DEGREE-POLICE OFFICER'),
 (6.179319298075826, 'UOR_DESC_MURDER - POLICE OFFICER ATTEMPTED'),
 (5.500582095751935, 'UOR_DESC_DISARMING A PEACE OFFICER'),
 (4.681078728949549, 'PREMISE_TYPE_JAIL / PENITENTARY'),
 (4.544767994257792, 'UOR_DESC_TERRORISTIC THREATENING 3RD DEGREE'),
 (4.201029084495879, 'PREMISE_TYPE_HIGHWAY / ROAD / ALLEY'),
 (4.068975304700449, 'UOR_DESC_ASSAULT - 2ND DEGREE - POLICE OFFICER'),
 (3.8630274904236326, 'UOR_DESC_ASSAULT - 1ST DEGREE - 