In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
data = pd.read_csv("/kaggle/input/dsci-560/LA_Crime_Data_2020_2023.csv")
data.head()

Unnamed: 0,RecNo,ReportDate,DateOCC,TimeOCC,Area,AreaName,DistrictNo,CrimeCode,CrmDesc,Status,StatusDesc,Location,LAT,LON,TimeStamp
0,10304468,01/08/2020 12:00:00 AM,01/08/2020 12:00:00 AM,2230,3,Southwest,377,624,BATTERY - SIMPLE ASSAULT,AO,Adult Other,1100 W 39TH PL,34.0141,-118.2978,2020010822
1,190101086,01/02/2020 12:00:00 AM,01/01/2020 12:00:00 AM,330,1,Central,163,624,BATTERY - SIMPLE ASSAULT,IC,Invest Cont,700 S HILL ST,34.0459,-118.2545,2020010103
2,191501505,01/01/2020 12:00:00 AM,01/01/2020 12:00:00 AM,1730,15,N Hollywood,1543,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),IC,Invest Cont,5400 CORTEEN PL,34.1685,-118.4019,2020010117
3,191921269,01/01/2020 12:00:00 AM,01/01/2020 12:00:00 AM,415,19,Mission,1998,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",IC,Invest Cont,14400 TITUS ST,34.2198,-118.4468,2020010104
4,200100502,01/02/2020 12:00:00 AM,01/02/2020 12:00:00 AM,1315,1,Central,161,442,SHOPLIFTING - PETTY THEFT ($950 & UNDER),IC,Invest Cont,700 S FIGUEROA ST,34.0483,-118.2631,2020010213


In [3]:
categorical = [col for col in data.columns if data[col].dtypes == 'O']

categorical

['ReportDate',
 'DateOCC',
 'AreaName',
 'CrmDesc',
 'Status',
 'StatusDesc',
 'Location']

In [4]:
numerical = [col for col in data.columns if data[col].dtypes != 'O']

numerical

['RecNo',
 'TimeOCC',
 'Area',
 'DistrictNo',
 'CrimeCode',
 'LAT',
 'LON',
 'TimeStamp']

In [5]:
data.isnull().sum()

RecNo         0
ReportDate    0
DateOCC       0
TimeOCC       0
Area          0
AreaName      0
DistrictNo    0
CrimeCode     0
CrmDesc       0
Status        0
StatusDesc    0
Location      0
LAT           0
LON           0
TimeStamp     0
dtype: int64

In [6]:
data = data.dropna(axis = 0)

In [7]:
data.isnull().sum()

RecNo         0
ReportDate    0
DateOCC       0
TimeOCC       0
Area          0
AreaName      0
DistrictNo    0
CrimeCode     0
CrmDesc       0
Status        0
StatusDesc    0
Location      0
LAT           0
LON           0
TimeStamp     0
dtype: int64

## select X and Y

In [8]:
data['CrmDesc'].unique()

array(['BATTERY - SIMPLE ASSAULT',
       'VANDALISM - MISDEAMEANOR ($399 OR UNDER)',
       'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)',
       'SHOPLIFTING - PETTY THEFT ($950 & UNDER)',
       'THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD',
       'BURGLARY FROM VEHICLE', 'CRIMINAL THREATS - NO WEAPON DISPLAYED',
       'INTIMATE PARTNER - SIMPLE ASSAULT',
       'THEFT PLAIN - PETTY ($950 & UNDER)', 'THEFT OF IDENTITY',
       'ROBBERY', 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
       'BURGLARY', 'VEHICLE - STOLEN',
       'THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)',
       'BRANDISH WEAPON', 'INTIMATE PARTNER - AGGRAVATED ASSAULT',
       'TRESPASSING',
       'THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)',
       'VIOLATION OF RESTRAINING ORDER'], dtype=object)

In [9]:
dangerous_1=[
    'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
    'ROBBERY',
    'BRANDISH WEAPON', 
    'VIOLATION OF RESTRAINING ORDER',
    'CRIMINAL THREATS - NO WEAPON DISPLAYED',
    'BURGLARY',
    'BURGLARY FROM VEHICLE'
    ]

In [10]:
dangerous_2=['INTIMATE PARTNER - AGGRAVATED ASSAULT',
             'TRESPASSING',
             'VEHICLE - STOLEN',
             'BATTERY - SIMPLE ASSAULT', 
             'INTIMATE PARTNER - SIMPLE ASSAULT',
             'THEFT OF IDENTITY']

In [11]:
dangerous_3=['SHOPLIFTING - PETTY THEFT ($950 & UNDER)',
             'THEFT PLAIN - PETTY ($950 & UNDER)',
             'THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)',
             'VANDALISM - MISDEAMEANOR ($399 OR UNDER)', 
             'THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)',
             'THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD',
             'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)']

In [12]:
def map_degree(x):
    if x in dangerous_1:
        return 'public danger'
    elif x in dangerous_2:
        return 'privacy danger'
    elif x in dangerous_3:
        return 'economic danger'
    else:
        return None

data['danger_crime']=data['CrmDesc'].apply(map_degree)
data['danger_crime']

0          privacy danger
1          privacy danger
2         economic danger
3         economic danger
4         economic danger
               ...       
566144     privacy danger
566145    economic danger
566146    economic danger
566147    economic danger
566148    economic danger
Name: danger_crime, Length: 566149, dtype: object

In [13]:
data['danger_crime'].unique()

array(['privacy danger', 'economic danger', 'public danger'], dtype=object)

In [14]:
X = data[['LAT','LON', 'TimeStamp']]

y = data['danger_crime']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.2, random_state=42)

## Feature Engineering 

In [16]:
from sklearn.preprocessing import RobustScaler

cols = X_train.columns

scaler = RobustScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

## Model Traning

In [17]:
# train a Gaussian Naive Bayes classifier on the training set
# instantiate the model
# initializaing the NB
classifer = GaussianNB()
# training the model
classifer.fit(X_train, y_train)
# testing the model
y_pred = classifer.predict(X_test)

In [18]:

y_pred

array(['privacy danger', 'privacy danger', 'privacy danger', ...,
       'privacy danger', 'privacy danger', 'privacy danger'], dtype='<U15')

In [19]:
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score: 0.3778


In [20]:
print('Training set score: {:.4f}'.format(classifer.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(classifer.score(X_test, y_test)))

Training set score: 0.3758
Test set score: 0.3778


In [21]:
# Print the Confusion Matrix and slice it into four pieces

cm = confusion_matrix(y_test, y_pred)

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', cm[0,0])

print('\nTrue Negatives(TN) = ', cm[1,1])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[ 6778 35960     0]
 [ 7264 44558     0]
 [ 5808 35508     0]]

True Positives(TP) =  6778

True Negatives(TN) =  44558

False Positives(FP) =  35960

False Negatives(FN) =  7264


In [22]:
print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

economic danger       0.34      0.16      0.22     42738
 privacy danger       0.38      0.86      0.53     51822
  public danger       0.00      0.00      0.00     41316

       accuracy                           0.38    135876
      macro avg       0.24      0.34      0.25    135876
   weighted avg       0.25      0.38      0.27    135876



  _warn_prf(average, modifier, msg_start, len(result))
