<a href="https://www.kaggle.com/code/khoatran311/cambridge-crime-data-feature-engineering?scriptVersionId=190206380" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Introduction

This notebook is an edited version of this [notebook](https://www.kaggle.com/code/naniruddhan/crimeanalysis-featureengineering). My main goal is to preserve both the hour AND the minute in the time features, which the original notebook did not do. Times are encoded as `hour.minute` decimals, so 4:35 PM becomes `16.35`.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): with no category given this also hides pandas warnings such as
# SettingWithCopyWarning / FutureWarning — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Path of the raw crime-report CSV; kept in one named constant so it is easy to repoint.
RAW_CRIME_CSV = '/kaggle/input/cambridge-crime-data-2009-2024/Crime_Reports_20240701.csv'

# Load the raw reports into the working frame used by every later cell.
data = pd.read_csv(RAW_CRIME_CSV)

In [3]:
# Summarize the columns of the raw frame: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95923 entries, 0 to 95922
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   File Number      95923 non-null  object 
 1   Date of Report   95923 non-null  object 
 2   Crime Date Time  95912 non-null  object 
 3   Crime            95923 non-null  object 
 4   Reporting Area   95915 non-null  float64
 5   Neighborhood     95915 non-null  object 
 6   Location         95628 non-null  object 
dtypes: float64(1), object(6)
memory usage: 5.1+ MB


In [4]:
# Normalize the headers to snake_case: spaces become underscores, then
# everything is lower-cased (e.g. "Crime Date Time" -> "crime_date_time").
data.columns = [header.replace(" ", "_").lower() for header in data.columns]

In [5]:
# Preview the first ten rows.
data.head(10)

Unnamed: 0,file_number,date_of_report,crime_date_time,crime,reporting_area,neighborhood,location
0,2009-01323,02/21/2009 09:53:00 AM,02/21/2009 09:20 - 09:30,Threats,105.0,East Cambridge,"100 OTIS ST, Cambridge, MA"
1,2009-01324,02/21/2009 09:59:00 AM,02/20/2009 22:30 - 02/21/2009 10:00,Auto Theft,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA"
2,2009-01327,02/21/2009 12:32:00 PM,02/19/2009 21:00 - 02/21/2009 12:00,Hit and Run,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA"
3,2009-01331,02/21/2009 03:05:00 PM,02/21/2009 15:00 - 15:10,Larceny (Misc),1303.0,Strawberry Hill,"0 NORUMBEGA ST, Cambridge, MA"
4,2009-01346,02/22/2009 05:02:00 AM,02/22/2009 05:02,OUI,105.0,East Cambridge,"FIFTH ST & GORE ST, Cambridge, MA"
5,2009-01357,02/22/2009 09:39:00 PM,02/22/2009 21:39 - 21:45,Aggravated Assault,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA"
6,2009-01363,02/23/2009 10:19:00 AM,02/20/2009 20:00 - 02/23/2009 10:00,Commercial Break,501.0,Cambridgeport,"600 Massachusetts Ave, Cambridge, MA"
7,2009-01365,02/23/2009 11:24:00 AM,02/23/2009 11:00 - 11:20,Street Robbery,501.0,Cambridgeport,"600 Massachusetts Ave, Cambridge, MA"
8,2009-01385,02/23/2009 08:16:00 PM,02/23/2009 20:16,Housebreak,1108.0,North Cambridge,"100 CLIFTON ST, Cambridge, MA"
9,2009-01391,02/24/2009 09:02:00 AM,02/22/2009 19:30 - 02/23/2009 20:30,Hit and Run,105.0,East Cambridge,"100 SEVENTH ST, Cambridge, MA"


In [6]:
# Count missing values per column.
data.isna().sum()

file_number          0
date_of_report       0
crime_date_time     11
crime                0
reporting_area       8
neighborhood         8
location           295
dtype: int64

In [7]:
# Keep only the rows that have both a crime timestamp and a reporting area.
# `notna()` expresses the mask directly, avoiding unary `-` on a boolean Series.
has_required_fields = data['crime_date_time'].notna() & data['reporting_area'].notna()

# Take an explicit copy: the filtered result is modified below, and writing to
# a slice of the original frame is what triggers SettingWithCopyWarning.
data = data.loc[has_required_fields].copy()

# Rows with a missing location are kept and labelled "UNKNOWN".
# The filled column is assigned back rather than mutated through
# `data.location.replace(..., inplace=True)`: that chained in-place call acts
# on a temporary Series and does not update `data` under Copy-on-Write.
data['location'] = data['location'].fillna("UNKNOWN")

In [8]:
# Re-count missing values per column after the cleaning step.
data.isna().sum()

file_number        0
date_of_report     0
crime_date_time    0
crime              0
reporting_area     0
neighborhood       0
location           0
dtype: int64

# Process date_of_report

In [9]:
# Split "MM/DD/YYYY hh:mm:ss AM" at the first space only, so the AM/PM suffix
# stays attached to the time part. The split is computed once and reused.
report_parts = data['date_of_report'].str.split(' ', n=1, expand=True)
data['report_date'] = report_parts[0]
data['report_time'] = report_parts[1]

In [10]:
# Preview the first two rows, including the new report_date / report_time columns.
data.head(2)

Unnamed: 0,file_number,date_of_report,crime_date_time,crime,reporting_area,neighborhood,location,report_date,report_time
0,2009-01323,02/21/2009 09:53:00 AM,02/21/2009 09:20 - 09:30,Threats,105.0,East Cambridge,"100 OTIS ST, Cambridge, MA",02/21/2009,09:53:00 AM
1,2009-01324,02/21/2009 09:59:00 AM,02/20/2009 22:30 - 02/21/2009 10:00,Auto Theft,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",02/21/2009,09:59:00 AM


In [11]:
def convert_time(time):
    """
    Returns time in a decimal format. 
    4:35 PM ==> 16.35
    """
    hour = int(time.split(':')[0])
    minute = int(time.split(":")[1])/100
    
    if 'PM' in time:
        if hour < 12:
            return hour+12 + minute 
        return hour+minute
    else:
        if hour == 12:
            return minute ## 12 AM is really 0 AM, then 1 AM, ...
        return hour+minute

# Replace the clock strings with their hour.minute decimal encoding, element by element.
data['report_time'] = data['report_time'].map(convert_time)

In [12]:
# Preview the first five rows; report_time now holds the decimal encoding.
data.head(5)

Unnamed: 0,file_number,date_of_report,crime_date_time,crime,reporting_area,neighborhood,location,report_date,report_time
0,2009-01323,02/21/2009 09:53:00 AM,02/21/2009 09:20 - 09:30,Threats,105.0,East Cambridge,"100 OTIS ST, Cambridge, MA",02/21/2009,9.53
1,2009-01324,02/21/2009 09:59:00 AM,02/20/2009 22:30 - 02/21/2009 10:00,Auto Theft,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",02/21/2009,9.59
2,2009-01327,02/21/2009 12:32:00 PM,02/19/2009 21:00 - 02/21/2009 12:00,Hit and Run,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",02/21/2009,12.32
3,2009-01331,02/21/2009 03:05:00 PM,02/21/2009 15:00 - 15:10,Larceny (Misc),1303.0,Strawberry Hill,"0 NORUMBEGA ST, Cambridge, MA",02/21/2009,15.05
4,2009-01346,02/22/2009 05:02:00 AM,02/22/2009 05:02,OUI,105.0,East Cambridge,"FIFTH ST & GORE ST, Cambridge, MA",02/22/2009,5.02


In [13]:
# Parse the report date with its explicit month/day/year layout instead of
# relying on format inference, so parsing is deterministic and cannot read an
# ambiguous value such as 03/04/2009 as day-first.
data['report_date'] = pd.to_datetime(data['report_date'], format='%m/%d/%Y')

# Break the parsed date into separate numeric calendar features.
data['report_day'] = data['report_date'].dt.day
data['report_month'] = data['report_date'].dt.month
data['report_year'] = data['report_date'].dt.year

In [14]:
# The raw timestamp text is now redundant with report_date / report_time, so
# remove it. The result is rebound instead of mutating the frame in place.
data = data.drop(columns=['date_of_report'])

In [15]:
# Preview the first ten rows with the report date/time features in place.
data.head(10)

Unnamed: 0,file_number,crime_date_time,crime,reporting_area,neighborhood,location,report_date,report_time,report_day,report_month,report_year
0,2009-01323,02/21/2009 09:20 - 09:30,Threats,105.0,East Cambridge,"100 OTIS ST, Cambridge, MA",2009-02-21,9.53,21,2,2009
1,2009-01324,02/20/2009 22:30 - 02/21/2009 10:00,Auto Theft,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",2009-02-21,9.59,21,2,2009
2,2009-01327,02/19/2009 21:00 - 02/21/2009 12:00,Hit and Run,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",2009-02-21,12.32,21,2,2009
3,2009-01331,02/21/2009 15:00 - 15:10,Larceny (Misc),1303.0,Strawberry Hill,"0 NORUMBEGA ST, Cambridge, MA",2009-02-21,15.05,21,2,2009
4,2009-01346,02/22/2009 05:02,OUI,105.0,East Cambridge,"FIFTH ST & GORE ST, Cambridge, MA",2009-02-22,5.02,22,2,2009
5,2009-01357,02/22/2009 21:39 - 21:45,Aggravated Assault,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",2009-02-22,21.39,22,2,2009
6,2009-01363,02/20/2009 20:00 - 02/23/2009 10:00,Commercial Break,501.0,Cambridgeport,"600 Massachusetts Ave, Cambridge, MA",2009-02-23,10.19,23,2,2009
7,2009-01365,02/23/2009 11:00 - 11:20,Street Robbery,501.0,Cambridgeport,"600 Massachusetts Ave, Cambridge, MA",2009-02-23,11.24,23,2,2009
8,2009-01385,02/23/2009 20:16,Housebreak,1108.0,North Cambridge,"100 CLIFTON ST, Cambridge, MA",2009-02-23,20.16,23,2,2009
9,2009-01391,02/22/2009 19:30 - 02/23/2009 20:30,Hit and Run,105.0,East Cambridge,"100 SEVENTH ST, Cambridge, MA",2009-02-24,9.02,24,2,2009


# Process crime_date_time

In [16]:
# crime_date_time looks like "MM/DD/YYYY HH:MM - [MM/DD/YYYY ]HH:MM".
# Splitting at the first space separates the first date from the remainder,
# which is a mix of the first time and an optional end date/time.
crime_parts = data['crime_date_time'].str.split(' ', n=1, expand=True)

data['crime_date'] = crime_parts[0]  ## First date of crime

# Keep only what precedes the first '-', i.e. the first time of the crime.
data['crime_time'] = crime_parts[1].str.split('-', expand=True)[0]

In [17]:
def format_time(time):
    """
    Encode a 24-hour ``"HH:MM"`` string as an ``hour.minute`` decimal.

    The minutes are stored in the two decimal digits (minute / 100), e.g.
    ``"09:20"`` -> ``9.2`` and ``"22:30"`` -> ``22.3``.

    Parameters
    ----------
    time : str or missing value
        Clock text, optionally padded with whitespace. ``None``, ``NaN`` and
        empty or whitespace-only strings are all treated as "no time recorded".

    Returns
    -------
    float
        ``hour + minute / 100``, or ``np.nan`` when no time is available.
    """
    ### Some time values are empty strings OR None (15 entries total).
    ### pd.isna also covers NaN, which has no .split and would otherwise raise.
    if pd.isna(time):
        return np.nan

    # Whitespace-only text carries no time either; int() would fail on it.
    time = time.strip()
    if not time:
        return np.nan

    parts = time.split(":")
    hour = int(parts[0])
    minute = int(parts[1]) / 100
    return hour + minute

# Convert each first-crime-time string to its hour.minute decimal encoding;
# entries without a usable time become NaN.
data['crime_time'] = data['crime_time'].map(format_time)

# Discard every row that still contains a missing value in any column.
data = data.dropna()

In [18]:
# Preview the first row, including the new crime_date / crime_time columns.
data.head(1)

Unnamed: 0,file_number,crime_date_time,crime,reporting_area,neighborhood,location,report_date,report_time,report_day,report_month,report_year,crime_date,crime_time
0,2009-01323,02/21/2009 09:20 - 09:30,Threats,105.0,East Cambridge,"100 OTIS ST, Cambridge, MA",2009-02-21,9.53,21,2,2009,02/21/2009,9.2


In [19]:
# Parse the first crime date with its explicit month/day/year layout instead
# of relying on format inference, so parsing is deterministic and a malformed
# value fails loudly rather than being guessed at.
data['crime_date'] = pd.to_datetime(data['crime_date'], format='%m/%d/%Y')

# Break the parsed date into separate numeric calendar features.
data['crime_day'] = data['crime_date'].dt.day
data['crime_month'] = data['crime_date'].dt.month
data['crime_year'] = data['crime_date'].dt.year

In [20]:
# The raw crime timestamp text has been decomposed into crime_date / crime_time,
# so remove it. The result is rebound instead of mutating the frame in place.
data = data.drop(columns=['crime_date_time'])

# Encode Features

In [21]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column and keep them all. Refitting a single
# shared encoder overwrites its `classes_` each time, which would leave no way
# to map the crime / neighborhood codes back to their labels.
encoders = {}
for column in ('crime', 'neighborhood', 'location'):
    encoders[column] = LabelEncoder()
    data[column + '_encoded'] = encoders[column].fit_transform(data[column])

# Backward compatibility: `le` previously ended up fitted on `location`.
le = encoders['location']

In [22]:
# Preview the first five rows, including the integer-encoded feature columns.
data.head(5)

Unnamed: 0,file_number,crime,reporting_area,neighborhood,location,report_date,report_time,report_day,report_month,report_year,crime_date,crime_time,crime_day,crime_month,crime_year,crime_encoded,neighborhood_encoded,location_encoded
0,2009-01323,Threats,105.0,East Cambridge,"100 OTIS ST, Cambridge, MA",2009-02-21,9.53,21,2,2009,2009-02-21,9.2,21,2,2009,48,3,1401
1,2009-01324,Auto Theft,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",2009-02-21,9.59,21,2,2009,2009-02-20,22.3,20,2,2009,5,8,1919
2,2009-01327,Hit and Run,1109.0,North Cambridge,"400 RINDGE AVE, Cambridge, MA",2009-02-21,12.32,21,2,2009,2009-02-19,21.0,19,2,2009,19,8,1919
3,2009-01331,Larceny (Misc),1303.0,Strawberry Hill,"0 NORUMBEGA ST, Cambridge, MA",2009-02-21,15.05,21,2,2009,2009-02-21,15.0,21,2,2009,24,11,762
4,2009-01346,OUI,105.0,East Cambridge,"FIFTH ST & GORE ST, Cambridge, MA",2009-02-22,5.02,22,2,2009,2009-02-22,5.02,22,2,2009,36,3,3232


# Export Final Dataframe 

In [23]:
# Destination of the cleaned, feature-engineered dataset (relative to the working directory).
CLEAN_CSV_NAME = "cambridge_crime_clean.csv"

# Write the final frame without the index column.
data.to_csv(CLEAN_CSV_NAME, index=False)