In [1]:
import pandas as pd
import numpy as np
import os

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import datetime

In [21]:
# Directory holding the raw export. Set the CRIMES_DATA_DIR environment variable to
# run the notebook on another machine; when it is unset (or empty) the original
# hard-coded location is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get('CRIMES_DATA_DIR') or '/Users/krish/Downloads/'
CRIMES_CSV = 'Crimes_-_2001_to_Present_20231211.csv'

# The working directory is switched (instead of joining paths) because relative
# paths used later in the notebook, such as the final CSV export, resolve against it.
os.chdir(DATA_DIR)
crimes = pd.read_csv(CRIMES_CSV)
crimes.shape

(4696658, 30)

In [22]:
# Show the dtype pandas inferred for each column of the freshly loaded frame.
crimes.dtypes

ID                              int64
Case Number                    object
Date                           object
Block                          object
IUCR                           object
Primary Type                   object
Description                    object
Location Description           object
Arrest                           bool
Domestic                         bool
Beat                            int64
District                      float64
Ward                          float64
Community Area                float64
FBI Code                       object
X Coordinate                  float64
Y Coordinate                  float64
Year                            int64
Updated On                     object
Latitude                      float64
Longitude                     float64
Location                       object
Historical Wards 2003-2015    float64
Zip Codes                     float64
Community Areas               float64
Census Tracts                 float64
Wards       

In [23]:
# Count the missing values in each column.
crimes.isna().sum()

ID                                0
Case Number                       0
Date                              0
Block                             0
IUCR                              0
Primary Type                      0
Description                       0
Location Description          12053
Arrest                            0
Domestic                          0
Beat                              0
District                         41
Ward                            106
Community Area                  860
FBI Code                          0
X Coordinate                  55279
Y Coordinate                  55279
Year                              0
Updated On                        0
Latitude                      55279
Longitude                     55279
Location                      55279
Historical Wards 2003-2015    70101
Zip Codes                     55279
Community Areas               68089
Census Tracts                 67021
Wards                         68032
Boundaries - ZIP Codes      

In [24]:
# Materialise the column labels as a plain Python list and print them in one line.
column_names = crimes.columns.tolist()
print(column_names)

['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location', 'Historical Wards 2003-2015', 'Zip Codes', 'Community Areas', 'Census Tracts', 'Wards', 'Boundaries - ZIP Codes', 'Police Districts', 'Police Beats']


### Dropping Unnecessary Columns

In [25]:
# Columns that are not carried forward into feature engineering.
columns_to_drop = [
    'IUCR', 'FBI Code', 'Updated On',
    'Latitude', 'Longitude', 'Location',
    'Historical Wards 2003-2015', 'Zip Codes', 'Community Areas', 'Census Tracts',
    'Wards', 'Boundaries - ZIP Codes', 'Police Districts', 'Police Beats',
]
crimes = crimes.drop(columns=columns_to_drop)

In [26]:
# Use the record ID as the row index. Rebinding the result (rather than
# inplace=True) matches how the surrounding cells transform `crimes`.
# Like the original, this cell can only run once per load: afterwards "ID" is
# the index and no longer a column.
crimes = crimes.set_index("ID")


## Feature Engineering

#### Extracting Relevant attributes from Date-Time

In [27]:
# Parse the raw timestamp strings once (month/day/year, 12-hour clock with AM/PM).
incident_timestamps = pd.to_datetime(crimes['Date'], format='%m/%d/%Y %I:%M:%S %p')

# Derive the calendar features and drop the raw 'Date' column.
# The new columns are appended in the order date, Hour, Month, Weekday.
# Weekday follows the pandas convention (Monday=0 ... Sunday=6).
# No intermediate 'Time' column is built: the original cell created one and
# dropped it again in the same cell, so the resulting frame is unchanged.
crimes = (
    crimes
    .assign(
        date=incident_timestamps.dt.date,
        Hour=incident_timestamps.dt.hour,
        Month=incident_timestamps.dt.month,
        Weekday=incident_timestamps.dt.weekday,
    )
    .drop(columns=['Date'])
)

In [28]:
# Discard every row that has a missing value in any column, then report the
# remaining (rows, columns).
crimes = crimes.dropna(axis=0, how='any')
print(crimes.shape)

(4632808, 18)


In [29]:
# Encode the boolean Arrest flag as 0/1 integers (False -> 0, True -> 1).
# A direct cast replaces the bool -> str -> replace -> int round trip and yields
# the same values.
crimes['Arrest'] = crimes['Arrest'].astype("int")

In [30]:
# Encode the boolean Domestic flag as 0/1 integers (False -> 0, True -> 1).
# A direct cast replaces the bool -> str -> replace -> int round trip and yields
# the same values.
crimes['Domestic'] = crimes['Domestic'].astype("int")

In [31]:
# Preview the first rows after the date features and the 0/1 flag encoding.
crimes.head()

Unnamed: 0_level_0,Case Number,Block,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,X Coordinate,Y Coordinate,Year,date,Hour,Month,Weekday
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6020368,HP117521,076XX S EAST END AVE,OFFENSE INVOLVING CHILDREN,CHILD ABUSE,APARTMENT,0,1,414,4.0,8.0,43.0,1188862.0,1854636.0,2008,2008-01-01,0,1,1
6000402,HP107451,114XX S THROOP ST,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,RESIDENCE,0,0,2234,22.0,34.0,75.0,1169719.0,1828722.0,2008,2008-01-01,0,1,1
6064706,HP162069,001XX E 124TH PL,THEFT,FROM BUILDING,RESIDENCE,0,0,532,5.0,9.0,53.0,1179358.0,1822314.0,2008,2008-01-01,0,1,1
6025760,HP129015,003XX N STATE ST,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,0,1,1831,18.0,42.0,32.0,1176300.0,1902537.0,2008,2008-01-01,0,1,1
6073869,HP173211,010XX E 101ST ST,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,0,0,511,5.0,8.0,50.0,1184785.0,1838023.0,2008,2008-01-01,0,1,1


### Grouping Crimes based on US Crime Typology

In [32]:
# List the distinct raw 'Primary Type' labels that the category mapping has to cover.
crimes['Primary Type'].unique()

array(['OFFENSE INVOLVING CHILDREN', 'DECEPTIVE PRACTICE', 'THEFT',
       'OTHER OFFENSE', 'CRIMINAL DAMAGE', 'BATTERY', 'BURGLARY',
       'MOTOR VEHICLE THEFT', 'ROBBERY', 'CRIM SEXUAL ASSAULT',
       'SEX OFFENSE', 'CRIMINAL TRESPASS', 'ASSAULT',
       'INTERFERENCE WITH PUBLIC OFFICER', 'WEAPONS VIOLATION',
       'NARCOTICS', 'PUBLIC PEACE VIOLATION', 'LIQUOR LAW VIOLATION',
       'KIDNAPPING', 'PROSTITUTION', 'HOMICIDE', 'STALKING', 'ARSON',
       'GAMBLING', 'INTIMIDATION', 'OBSCENITY', 'CRIMINAL SEXUAL ASSAULT',
       'PUBLIC INDECENCY', 'OTHER NARCOTIC VIOLATION', 'NON-CRIMINAL',
       'NON-CRIMINAL (SUBJECT SPECIFIED)', 'HUMAN TRAFFICKING',
       'NON - CRIMINAL', 'CONCEALED CARRY LICENSE VIOLATION', 'RITUALISM'],
      dtype=object)

In [33]:
# Coarse crime category -> raw 'Primary Type' labels that belong to it.
# NOTE(review): the key 'LIQOUR AND GAMBLING' is misspelled ("LIQUOR"). It is kept
# byte-for-byte because map_category returns it as a data value; rename it only
# together with everything that matches on that value.
crime_mapping = {
    'THEFT': ['THEFT', 'MOTOR VEHICLE THEFT', 'BURGLARY', 'ROBBERY'],
    'VIOLENCE': ['BATTERY', 'ASSAULT', 'HOMICIDE', 'CRIMINAL SEXUAL ASSAULT', 'KIDNAPPING', 'CRIM SEXUAL ASSAULT'],
    'PROPERTY DAMAGE': ['CRIMINAL DAMAGE', 'ARSON'],
    'DECEPTIVE PRACTICE': ['DECEPTIVE PRACTICE'],
    'DRUGS RELATED': ['NARCOTICS', 'OTHER NARCOTIC VIOLATION'],
    'WEAPONS VIOLATION': ['WEAPONS VIOLATION'],
    'TRESPASSING': ['CRIMINAL TRESPASS'],
    'OFFENSE INVOLVING CHILDREN': ['OFFENSE INVOLVING CHILDREN'],
    'SEX OFFENSE': ['SEX OFFENSE', 'PROSTITUTION'],
    'PUBLIC ORDER': ['PUBLIC PEACE VIOLATION', 'INTERFERENCE WITH PUBLIC OFFICER', 'PUBLIC INDECENCY', 'INTIMIDATION', 'STALKING'],
    'LIQOUR AND GAMBLING': ['LIQUOR LAW VIOLATION', 'GAMBLING', 'OBSCENITY', 'CONCEALED CARRY LICENSE VIOLATION'],
    # 'NON - CRIMINAL' (with spaces) is a distinct label in the data; without it
    # those rows fell through to 'Other' while their 'NON-CRIMINAL' siblings
    # were filed under MISCELLANEOUS.
    'MISCELLANEOUS': ['OTHER OFFENSE', 'NON-CRIMINAL', 'NON - CRIMINAL',
                      'NON-CRIMINAL (SUBJECT SPECIFIED)', 'RITUALISM', 'HUMAN TRAFFICKING']
}


def _build_primary_type_lookup(mapping):
    """Invert {category: [primary types]} into {primary type: category}."""
    lookup = {}
    for category, primary_types in mapping.items():
        for primary_type in primary_types:
            # setdefault keeps the first category if a label is ever listed twice,
            # matching the first-match-wins order of a scan over the mapping.
            lookup.setdefault(primary_type, category)
    return lookup


# Built once from crime_mapping; re-run this cell after editing the mapping.
_PRIMARY_TYPE_TO_CATEGORY = _build_primary_type_lookup(crime_mapping)


def map_category(primary_type):
    """Return the coarse crime category for a raw primary type label.

    Parameters
    ----------
    primary_type : str
        A value from the 'Primary Type' column.

    Returns
    -------
    str
        The matching key of ``crime_mapping``, or 'Other' when the label is not
        listed under any category.
    """
    # One dict lookup per value instead of scanning every category list.
    return _PRIMARY_TYPE_TO_CATEGORY.get(primary_type, 'Other')

# Derive the coarse 'Crime Type' column by passing each 'Primary Type' value
# through map_category.
crimes['Crime Type'] = crimes['Primary Type'].map(map_category)



In [34]:
# Preview the first rows, now including the derived 'Crime Type' column.
crimes.head()

Unnamed: 0_level_0,Case Number,Block,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,X Coordinate,Y Coordinate,Year,date,Hour,Month,Weekday,Crime Type
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
6020368,HP117521,076XX S EAST END AVE,OFFENSE INVOLVING CHILDREN,CHILD ABUSE,APARTMENT,0,1,414,4.0,8.0,43.0,1188862.0,1854636.0,2008,2008-01-01,0,1,1,OFFENSE INVOLVING CHILDREN
6000402,HP107451,114XX S THROOP ST,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,RESIDENCE,0,0,2234,22.0,34.0,75.0,1169719.0,1828722.0,2008,2008-01-01,0,1,1,DECEPTIVE PRACTICE
6064706,HP162069,001XX E 124TH PL,THEFT,FROM BUILDING,RESIDENCE,0,0,532,5.0,9.0,53.0,1179358.0,1822314.0,2008,2008-01-01,0,1,1,THEFT
6025760,HP129015,003XX N STATE ST,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,0,1,1831,18.0,42.0,32.0,1176300.0,1902537.0,2008,2008-01-01,0,1,1,MISCELLANEOUS
6073869,HP173211,010XX E 101ST ST,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,0,0,511,5.0,8.0,50.0,1184785.0,1838023.0,2008,2008-01-01,0,1,1,THEFT


## Grouping Crimes per year

In [35]:
# Number of recorded crimes per calendar year (a count, not a per-capita rate).
# The frame is named crime_per_date, but it is grouped by 'Year'.
yearly_sizes = crimes.groupby('Year').size()
crime_per_date = (
    yearly_sizes
    .reset_index(name='count')
    .sort_values('Year')
)
crime_per_date


Unnamed: 0,Year,count
0,2008,419655
1,2009,385764
2,2010,369777
3,2011,350891
4,2012,335117
5,2013,306360
6,2014,273638
7,2015,257788
8,2016,266486
9,2017,264153


In [None]:
# Write the cleaned frame to 'crimes-filtered' (a relative path, so it is created
# in the current working directory).
# NOTE(review): "ID" was moved into the index earlier in the notebook, so
# index=False leaves the ID out of the exported file. Confirm this is intended.
# NOTE(review): the file name has no ".csv" extension. Confirm downstream readers
# expect exactly this name before changing it.
crimes.to_csv('crimes-filtered', index=False)