In [19]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans

In [20]:
# Read in the CSV file and create the Pandas Dataframe
joco_crimes = pd.read_csv(Path("./Resources/merged_data.csv"))
 
# Review the DataFrame
joco_crimes.head()


Unnamed: 0,ID,EventType,Class,UCRDescription,IncidentDescription,TheftDescription,Location,PremiseDescription,APID,UCRCode,...,ReptTime,InactivityDescription,HitRun,Y,X,Name,Compliant,Race,Sex,DOB
0,D23003080,crimes,Property,Counterfeiting/Forgery,FORGERY; POSSESS WITH INTENT TO DISTRIBUTE WRI...,,100 BLOCK N KANSAS AVE,Jail / Prison,2.003041e+17,250,...,832,,,38.883172,-94.821215,,,,,
1,D23003074,crimes,Society,Weapon Law Violations,UNLAWFUL DISCHARGE OF FIREARMS (10.10.5),,24800 BLOCK W 129TH TER,Single Residence (includes attached garage),2.003023e+17,520,...,1948,,,38.893987,-94.873529,,,,,
2,D23003087,crimes,Society,All Other Offenses,PROBATION VIOLATION (22-3716),,100 BLOCK N KANSAS AVE,Jail / Prison,2.003041e+17,90Z,...,1440,,,38.883172,-94.821215,,,,,
3,D23003064,crimes,Society,Drug/Narcotic Offenses-Drug/Narcotic Violations,POSSESSION OF MARIJUANA (21-5706.b3.c3.A),,W K10 HWY,Highway,,35A,...,1712,,,38.960213,-94.955056,,,,,
4,D23003086,crimes,Society,All Other Offenses,PROBATION VIOLATION (22-3716),,100 BLOCK N KANSAS AVE,Jail / Prison,2.003041e+17,90Z,...,1729,,,38.883172,-94.821215,,,,,


In [21]:
joco_crimes.shape

(2997, 22)

In [22]:
# Checking the null values and the Datatype
joco_crimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2997 entries, 0 to 2996
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     2997 non-null   object 
 1   EventType              2997 non-null   object 
 2   Class                  2039 non-null   object 
 3   UCRDescription         2039 non-null   object 
 4   IncidentDescription    2996 non-null   object 
 5   TheftDescription       95 non-null     object 
 6   Location               2997 non-null   object 
 7   PremiseDescription     2036 non-null   object 
 8   APID                   1945 non-null   float64
 9   UCRCode                2996 non-null   object 
 10  Agency                 2997 non-null   object 
 11  ReptDate               2997 non-null   object 
 12  ReptTime               2997 non-null   int64  
 13  InactivityDescription  0 non-null      float64
 14  HitRun                 0 non-null      float64
 15  Y   

In [70]:
# Droping all the columns that are empty (0 non-null)
joco_crimes_df = joco_crimes.drop(columns=["Name", "Compliant","Race","Sex","DOB",\
                                           "InactivityDescription", "HitRun","APID",\
                                           "TheftDescription","Agency","EventType"])


In [72]:
# Droping the rows with null values 
joco_crimes_df = joco_crimes_df.dropna()
joco_crimes_df.tail()

Unnamed: 0,ID,Class,UCRDescription,IncidentDescription,Location,PremiseDescription,UCRCode,ReptDate,ReptTime,Y,X
2987,D22002293,Society,All Other Offenses,FAILURE TO APPEAR (21-5915.a),700 BLOCK N 7TH ST,Jail / Prison,90Z,2022-06-28,1120,39.024753,-94.640914
2988,D22002294,Society,All Other Offenses,FAILURE TO APPEAR (21-5915.a),700 BLOCK N 7TH ST,Jail / Prison,90Z,2022-06-28,1120,39.024753,-94.640914
2989,D22002282,Society,All Other Offenses,FAILURE TO APPEAR (21-5915.a),700 BLOCK N 7TH ST,Jail / Prison,90Z,2022-06-28,1017,38.615399,-94.684003
2990,D22002250,Society,All Other Offenses,Flee or attempt to elude LEO by engaging in re...,W 197TH ST,City Street,90Z,2022-06-28,441,38.771475,-94.670614
2991,D22002280,Society,All Other Offenses,RACING ON HIGHWAYS (08-1565),W 175TH ST,Highway,90Z,2022-06-28,28,38.810943,-94.821198


In [73]:
joco_crimes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2032 entries, 0 to 2991
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2032 non-null   object 
 1   Class                2032 non-null   object 
 2   UCRDescription       2032 non-null   object 
 3   IncidentDescription  2032 non-null   object 
 4   Location             2032 non-null   object 
 5   PremiseDescription   2032 non-null   object 
 6   UCRCode              2032 non-null   object 
 7   ReptDate             2032 non-null   object 
 8   ReptTime             2032 non-null   int64  
 9   Y                    2032 non-null   float64
 10  X                    2032 non-null   float64
dtypes: float64(2), int64(1), object(8)
memory usage: 190.5+ KB


In [74]:
# Counting the values for the UCRDescription column
value_cunts = joco_crimes_df.UCRDescription.value_counts()
value_cunts

All Other Offenses                                         906
Drug/Narcotic Offenses-Drug/Narcotic Violations            352
Driving Under the Influence                                195
Assault-Simple Assault                                      91
Destruction/Damage/Vandalism of Property                    90
Larceny/Theft Offenses                                      83
Fraud Offenses-False Pretenses/Swindle/Conf. Game           52
Assault-Intimidation                                        45
Liquor Law Violations                                       45
Drug/Narcotic Offenses-Drug Equipment Violations            43
Trespass of Real Property                                   30
Assault-Aggravated Assault                                  16
Burglary/Breaking & Entering                                15
Counterfeiting/Forgery                                       9
Sex Offenses Forcible-Forcible Fondling                      8
Disorderly Conduct                                     

In [81]:
# We're filtering by rows with value_counts >= 30
undroped = joco_crimes_df.UCRDescription.isin(value_cunts[value_cunts >= 30].index)


In [82]:
# Aply a boolean indexing 
joco_crimes_df_filtered = joco_crimes_df[undroped]

In [83]:
joco_crimes_df_filtered.head()

Unnamed: 0,ID,Class,UCRDescription,IncidentDescription,Location,PremiseDescription,UCRCode,ReptDate,ReptTime,Y,X
2,D23003087,Society,All Other Offenses,PROBATION VIOLATION (22-3716),100 BLOCK N KANSAS AVE,Jail / Prison,90Z,2023-09-03,1440,38.883172,-94.821215
3,D23003064,Society,Drug/Narcotic Offenses-Drug/Narcotic Violations,POSSESSION OF MARIJUANA (21-5706.b3.c3.A),W K10 HWY,Highway,35A,2023-09-02,1712,38.960213,-94.955056
4,D23003086,Society,All Other Offenses,PROBATION VIOLATION (22-3716),100 BLOCK N KANSAS AVE,Jail / Prison,90Z,2023-09-01,1729,38.883172,-94.821215
5,D23003085,Society,All Other Offenses,FAILURE TO APPEAR (21-5915.a),6600 BLOCK CRAIG RD,Single Residence (includes attached garage),90Z,2023-09-01,1459,39.009349,-94.678366
8,D23003053,Society,Drug/Narcotic Offenses-Drug/Narcotic Violations,POSSESSION OF MARIJUANA (21-5706.b3.c3.A),E K10 HWY,Highway,35A,2023-09-01,1319,38.947513,-94.885671


In [88]:
joco_crimes_df_filtered.UCRDescription.value_counts()

All Other Offenses                                   906
Drug/Narcotic Offenses-Drug/Narcotic Violations      352
Driving Under the Influence                          195
Assault-Simple Assault                                91
Destruction/Damage/Vandalism of Property              90
Larceny/Theft Offenses                                83
Fraud Offenses-False Pretenses/Swindle/Conf. Game     52
Assault-Intimidation                                  45
Liquor Law Violations                                 45
Drug/Narcotic Offenses-Drug Equipment Violations      43
Trespass of Real Property                             30
Name: UCRDescription, dtype: int64

In [87]:
joco_crimes_df_filtered.UCRDescription.unique()

array(['All Other Offenses',
       'Drug/Narcotic Offenses-Drug/Narcotic Violations',
       'Larceny/Theft Offenses', 'Trespass of Real Property',
       'Fraud Offenses-False Pretenses/Swindle/Conf. Game',
       'Drug/Narcotic Offenses-Drug Equipment Violations',
       'Assault-Intimidation', 'Driving Under the Influence',
       'Assault-Simple Assault', 'Liquor Law Violations',
       'Destruction/Damage/Vandalism of Property'], dtype=object)

In [89]:
# Create a mapping for categories to be combined and their new names

category_mapping = {
    'All Other Offenses': 'Other offences',
    'Drug/Narcotic Offenses-Drug/Narcotic Violations' : 'Drug',
    'Larceny/Theft Offenses' : 'Theft',
    'Trespass of Real Property' : 'Property',
    'Fraud Offenses-False Pretenses/Swindle/Conf. Game' : 'Fraud',
    'Drug/Narcotic Offenses-Drug Equipment Violations' : 'Drug',
    'Assault-Intimidation' : 'Assault',
    'Driving Under the Influence' : 'DUI',
    'Assault-Simple Assault' : 'Assault', 
    'Liquor Law Violations' : 'Liquor',
    'Destruction/Damage/Vandalism of Property': 'Property'
}

In [90]:
# Apply the mapping to create a new 'combined_category' column
joco_crimes_df_filtered['combined_category'] = joco_crimes_df_filtered.UCRDescription.map(category_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joco_crimes_df_filtered['combined_category'] = joco_crimes_df_filtered.UCRDescription.map(category_mapping)


In [91]:
joco_crimes_df_filtered.head()

Unnamed: 0,ID,Class,UCRDescription,IncidentDescription,Location,PremiseDescription,UCRCode,ReptDate,ReptTime,Y,X,combined_category
2,D23003087,Society,All Other Offenses,PROBATION VIOLATION (22-3716),100 BLOCK N KANSAS AVE,Jail / Prison,90Z,2023-09-03,1440,38.883172,-94.821215,Other offences
3,D23003064,Society,Drug/Narcotic Offenses-Drug/Narcotic Violations,POSSESSION OF MARIJUANA (21-5706.b3.c3.A),W K10 HWY,Highway,35A,2023-09-02,1712,38.960213,-94.955056,Drug
4,D23003086,Society,All Other Offenses,PROBATION VIOLATION (22-3716),100 BLOCK N KANSAS AVE,Jail / Prison,90Z,2023-09-01,1729,38.883172,-94.821215,Other offences
5,D23003085,Society,All Other Offenses,FAILURE TO APPEAR (21-5915.a),6600 BLOCK CRAIG RD,Single Residence (includes attached garage),90Z,2023-09-01,1459,39.009349,-94.678366,Other offences
8,D23003053,Society,Drug/Narcotic Offenses-Drug/Narcotic Violations,POSSESSION OF MARIJUANA (21-5706.b3.c3.A),E K10 HWY,Highway,35A,2023-09-01,1319,38.947513,-94.885671,Drug
