## Libraries for Modeling

In [25]:
# Libraries
import pandas as pd
import numpy as np
from datetime import datetime, time

## Read CSV File

In [26]:
# Load the feature-engineered KCPD crime dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider making the
# data directory configurable so the notebook runs on other machines.
csv_file_path = '/Users/lindseysullivan/Documents/School/Kansas-City-Crime-Analysis/Data/Data_Sets/KCPD-5-Year-Analysis-Feature-Eng.CSV'
df = pd.read_csv(csv_file_path)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits an extra "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445289 entries, 0 to 445288
Data columns (total 15 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   Report_No                        445289 non-null  object
 1   Year                             445289 non-null  int64 
 2   Quarter                          445289 non-null  int64 
 3   Month                            445289 non-null  object
 4   Day_of_Week                      445289 non-null  object
 5   From_Date                        445289 non-null  object
 6   From_Time                        445289 non-null  object
 7   Adjusted_To_Date                 445289 non-null  object
 8   Adjusted_To_Time                 445289 non-null  object
 9   General _Offense_Categorization  445289 non-null  object
 10  Type_of_Crime                    445289 non-null  object
 11  UCR_Offense_Classification       445289 non-null  object
 12  NIBRS           

In [27]:
# Preview the first five rows to sanity-check the load.
print(df.head(5))

    Report_No  Year  Quarter  Month Day_of_Week From_Date From_Time  \
0  KC19020397  2019        1  March   Wednesday   3/20/19  09:00:00   
1  KC19025235  2019        2  April      Sunday    4/7/19  15:45:00   
2  KC19036511  2019        2    May    Thursday   5/16/19  20:30:00   
3  KC19024315  2019        2  April   Wednesday    4/3/19  21:30:00   
4  KC19035992  2019        2    May    Thursday   5/16/19  08:15:00   

  Adjusted_To_Date Adjusted_To_Time General _Offense_Categorization  \
0          3/20/19         13:55:00              Aggravated Assault   
1           4/7/19         15:52:00              Aggravated Assault   
2          5/16/19         21:00:00                         Robbery   
3           4/4/19         01:30:00              Aggravated Assault   
4          5/16/19         08:17:00                  Other Assaults   

       Type_of_Crime UCR_Offense_Classification NIBRS NIBRS Offense Group  \
0      Violent Crime                     Part I   13A             Gro

## Binary Encoding
Binary encoding represents each category as a binary digit (0 or 1).

**Type_of_Crime** will be encoded as follows:

- **Violent Crime** = 1
- **Non-Violent Crime** (any other value) = 0

In [None]:
# Binary flag: 1 when Type_of_Crime is exactly 'Violent Crime', 0 for every
# other value. A vectorized comparison replaces the per-row lambda; the
# explicit int64 cast keeps the integer dtype the apply() version produced.
df['Is_Violent'] = (df['Type_of_Crime'] == 'Violent Crime').astype('int64')

## Create Feature - Duration
Duration is the length of time, in minutes, between the start of an event (`From_Date` + `From_Time`) and its end (`Adjusted_To_Date` + `Adjusted_To_Time`).

In [28]:
# Combine each date/time string pair and parse it with an explicit format.
# The previewed rows look like '3/20/19' + '09:00:00' (month/day/2-digit year),
# so naming the format avoids per-row format guessing and makes to_datetime
# raise on any row that deviates instead of parsing it some other way.
# NOTE(review): assumes every row follows the format seen in the preview — confirm.
DATETIME_FORMAT = '%m/%d/%y %H:%M:%S'
df['From_DateTime'] = pd.to_datetime(
    df['From_Date'] + ' ' + df['From_Time'], format=DATETIME_FORMAT
)
df['Adjusted_To_DateTime'] = pd.to_datetime(
    df['Adjusted_To_Date'] + ' ' + df['Adjusted_To_Time'], format=DATETIME_FORMAT
)

# Duration in minutes from start to end; the value is negative when the
# recorded end timestamp precedes the start timestamp.
df['Duration'] = (df['Adjusted_To_DateTime'] - df['From_DateTime']).dt.total_seconds() / 60

# View the DataFrame
print(df[['From_DateTime', 'Adjusted_To_DateTime', 'Duration']])


             From_DateTime Adjusted_To_DateTime  Duration
0      2019-03-20 09:00:00  2019-03-20 13:55:00     295.0
1      2019-04-07 15:45:00  2019-04-07 15:52:00       7.0
2      2019-05-16 20:30:00  2019-05-16 21:00:00      30.0
3      2019-04-03 21:30:00  2019-04-04 01:30:00     240.0
4      2019-05-16 08:15:00  2019-05-16 08:17:00       2.0
...                    ...                  ...       ...
445284 2023-10-08 22:12:00  2023-10-08 22:13:00       1.0
445285 2023-11-01 22:00:00  2023-11-02 19:00:00    1260.0
445286 2023-10-08 23:19:00  2023-10-08 23:19:00       0.0
445287 2023-10-17 14:47:00  2023-10-17 14:46:00      -1.0
445288 2023-10-28 00:00:00  2023-10-30 00:00:00    2880.0

[445289 rows x 3 columns]


## Extract Hour to Capture Time of Day

In [29]:
# Replace each 'HH:MM:SS' time column with its integer hour of day.
# NOTE: the string columns are overwritten in place, so this cell expects the
# original time strings and must run after the Duration cell, which still
# concatenates From_Time / Adjusted_To_Time as text.
for time_col in ('From_Time', 'Adjusted_To_Time'):
    parsed_times = pd.to_datetime(df[time_col], format='%H:%M:%S')
    df[time_col] = parsed_times.dt.hour

print(df.head(n=5))

    Report_No  Year  Quarter  Month Day_of_Week From_Date  From_Time  \
0  KC19020397  2019        1  March   Wednesday   3/20/19          9   
1  KC19025235  2019        2  April      Sunday    4/7/19         15   
2  KC19036511  2019        2    May    Thursday   5/16/19         20   
3  KC19024315  2019        2  April   Wednesday    4/3/19         21   
4  KC19035992  2019        2    May    Thursday   5/16/19          8   

  Adjusted_To_Date  Adjusted_To_Time General _Offense_Categorization  \
0          3/20/19                13              Aggravated Assault   
1           4/7/19                15              Aggravated Assault   
2          5/16/19                21                         Robbery   
3           4/4/19                 1              Aggravated Assault   
4          5/16/19                 8                  Other Assaults   

       Type_of_Crime UCR_Offense_Classification NIBRS NIBRS Offense Group  \
0      Violent Crime                     Part I   13A    

## Cyclical Encoding
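Cyclical encoding of the `Month`, `Day_of_Week`, `From_Time`, and `Adjusted_To_Time` columns into sine and cosine values, stored as new columns in the DataFrame. The month and weekday names are first mapped to numbers so they can be placed on a cycle.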

In [32]:
# Dictionary to map Month string to numerical value
month_to_num = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,
    'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12
}

# Map the 'Month' column to numeric value.
# NOTE: Series.map yields NaN for any value that is not a key of the dict, and
# the column is overwritten in place, so this cell expects month names.
df['Month'] = df['Month'].map(month_to_num)

# Dictionary to map Day string to numerical value
day_to_num = {
    'Sunday': 0, 'Monday': 1, 'Tuesday': 2, 'Wednesday': 3,
    'Thursday': 4, 'Friday': 5, 'Saturday': 6
}

# Map the 'Day_of_Week' column to numeric value (same NaN caveat as above).
df['Day_of_Week'] = df['Day_of_Week'].map(day_to_num)


def add_cyclical_features(frame, column, period, suffix):
    """Add sine/cosine encodings of a periodic numeric column to ``frame``.

    Writes two new columns, ``sin_<suffix>`` and ``cos_<suffix>``, computed
    from the angle ``2 * pi * frame[column] / period``.

    Parameters
    ----------
    frame : pandas.DataFrame
        DataFrame that is modified in place.
    column : str
        Name of the numeric column to encode.
    period : float
        Divisor applied to the column values (12 for months, 7 for days of
        the week, 24 for hours in this notebook).
    suffix : str
        Suffix used to build the names of the two output columns.
    """
    angle = 2 * np.pi * frame[column] / period
    frame[f'sin_{suffix}'] = np.sin(angle)
    frame[f'cos_{suffix}'] = np.cos(angle)


# (source column, period, output-column suffix); the order fixes the order in
# which the new columns are appended to df.
cyclical_specs = (
    ('Month', 12.0, 'Months'),
    ('Day_of_Week', 7.0, 'DayOfWeek'),    # for Weekday vs. Weekend Analysis
    ('From_Time', 24, 'From_Time'),
    ('Adjusted_To_Time', 24, 'Adj_To_Time'),
)
for source_col, period, suffix in cyclical_specs:
    add_cyclical_features(df, source_col, period, suffix)

# DataFrame.info() prints its own summary and returns None, so it is called
# directly rather than wrapped in print(), which would add a "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445289 entries, 0 to 445288
Data columns (total 26 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   Report_No                        445289 non-null  object        
 1   Year                             445289 non-null  int64         
 2   Quarter                          445289 non-null  int64         
 3   Month                            445289 non-null  int64         
 4   Day_of_Week                      445289 non-null  int64         
 5   From_Date                        445289 non-null  object        
 6   From_Time                        445289 non-null  int64         
 7   Adjusted_To_Date                 445289 non-null  object        
 8   Adjusted_To_Time                 445289 non-null  int64         
 9   General _Offense_Categorization  445289 non-null  object        
 10  Type_of_Crime                    445289 non-

## Ordinal Encoding