<a href="https://colab.research.google.com/github/omo776/safeline/blob/main/Chicago_crime_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Analyzing Chicago Crime From 2010 to 2020
Using an Unsupervised Machine Learning Model

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt


In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
try:
    data = pd.read_csv('/content/drive/MyDrive/Crimes.csv', low_memory=False, on_bad_lines='warn')
    data.info()
except Exception as e:
    print(f"Error reading CSV file: {e}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5581 entries, 0 to 5580
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CASE#                   5581 non-null   object 
 1   DATE  OF OCCURRENCE     5581 non-null   object 
 2   BLOCK                   5581 non-null   object 
 3    IUCR                   5581 non-null   object 
 4    PRIMARY DESCRIPTION    5581 non-null   object 
 5    SECONDARY DESCRIPTION  5581 non-null   object 
 6    LOCATION DESCRIPTION   5560 non-null   object 
 7   ARREST                  5580 non-null   object 
 8   DOMESTIC                5580 non-null   object 
 9   BEAT                    5580 non-null   float64
 10  WARD                    5580 non-null   float64
 11  FBI CD                  5580 non-null   object 
 12  X COORDINATE            5573 non-null   float64
 13  Y COORDINATE            5573 non-null   float64
 14  LATITUDE                5573 non-null   

In [55]:
# Cleaning the dataset
# Removing null rows and unwanted columns

data_mod = data.dropna()
columns_to_drop = data.columns[[0, 3]].tolist() + data.columns[9:17].tolist() # Changed from 9:17 to 10:17
data_mod = data.drop(columns=columns_to_drop)
data_mod.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5581 entries, 0 to 5580
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   DATE  OF OCCURRENCE     5581 non-null   object
 1   BLOCK                   5581 non-null   object
 2    PRIMARY DESCRIPTION    5581 non-null   object
 3    SECONDARY DESCRIPTION  5581 non-null   object
 4    LOCATION DESCRIPTION   5560 non-null   object
 5   ARREST                  5580 non-null   object
 6   DOMESTIC                5580 non-null   object
dtypes: object(7)
memory usage: 305.3+ KB


In [56]:
# Rename the columns to remove leading/trailing spaces
data_mod = data_mod.rename(columns={
    'DATE  OF OCCURRENCE': 'DATE OF OCCURRENCE',
    'BLOCK': 'BLOCK',
    ' PRIMARY DESCRIPTION': 'PRIMARY DESCRIPTION',
    ' SECONDARY DESCRIPTION': 'SECONDARY DESCRIPTION',
    ' LOCATION DESCRIPTION': 'LOCATION DESCRIPTION',
    'ARREST': 'ARREST',
    'DOMESTIC': 'DOMESTIC',
})
print(data_mod.columns)

Index(['DATE OF OCCURRENCE', 'BLOCK', 'PRIMARY DESCRIPTION',
       'SECONDARY DESCRIPTION', 'LOCATION DESCRIPTION', 'ARREST', 'DOMESTIC'],
      dtype='object')


In [57]:
# Convert the 'DATE OF OCCURRENCE' column to datetime format
data_mod['DATE OF OCCURRENCE'] = pd.to_datetime(data_mod['DATE OF OCCURRENCE'], errors='coerce')

  data_mod['DATE OF OCCURRENCE'] = pd.to_datetime(data_mod['DATE OF OCCURRENCE'], errors='coerce')


In [58]:
# Drop any rows where the date conversion failed
data_mod = data_mod.dropna(subset=['DATE OF OCCURRENCE'])

In [59]:
# Create additional time-based columns
data_mod['Year'] = data_mod['DATE OF OCCURRENCE'].dt.year
data_mod['Month'] = data_mod['DATE OF OCCURRENCE'].dt.month
data_mod['Day'] = data_mod['DATE OF OCCURRENCE'].dt.day
data_mod['Weekday'] = data_mod['DATE OF OCCURRENCE'].dt.weekday
data_mod['Hour'] = data_mod['DATE OF OCCURRENCE'].dt.hour

In [60]:
# Convert 'ARREST' and 'DOMESTIC' columns to boolean type for consistency
data_mod['ARREST'] = data_mod['ARREST'].astype(bool)
data_mod['DOMESTIC'] = data_mod['DOMESTIC'].astype(bool)

In [61]:
data_mod.head()

Unnamed: 0,DATE OF OCCURRENCE,BLOCK,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,Year,Month,Day,Weekday,Hour
0,2024-03-07 18:38:00,021XX S PRINCETON AVE,CRIMINAL DAMAGE,TO VEHICLE,NURSING / RETIREMENT HOME,True,True,2024,3,7,3,18
1,2024-03-07 14:10:00,059XX N GLENWOOD AVE,BATTERY,SIMPLE,SCHOOL - PUBLIC BUILDING,True,True,2024,3,7,3,14
2,2024-03-07 04:45:00,026XX W ARMITAGE AVE,BATTERY,AGGRAVATED - HANDGUN,STREET,True,True,2024,3,7,3,4
3,2024-03-07 19:00:00,007XX W 74TH ST,THEFT,OVER $500,STREET,True,True,2024,3,7,3,19
4,2024-03-07 00:08:00,022XX S TRUMBULL AVE,OTHER OFFENSE,FALSE / STOLEN / ALTERED TRP,STREET,True,True,2024,3,7,3,0


In [62]:
data_mod.tail()


Unnamed: 0,DATE OF OCCURRENCE,BLOCK,PRIMARY DESCRIPTION,SECONDARY DESCRIPTION,LOCATION DESCRIPTION,ARREST,DOMESTIC,Year,Month,Day,Weekday,Hour
5576,2023-09-17 18:24:00,123XX S LOWE AVE,WEAPONS VIOLATION,UNLAWFUL POSSESSION - HANDGUN,STREET,True,True,2023,9,17,6,18
5577,2024-05-02 17:00:00,051XX S HARPER AVE,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,2024,5,2,3,17
5578,2023-09-03 21:00:00,009XX N LAWNDALE AVE,OTHER OFFENSE,TELEPHONE THREAT,APARTMENT,True,True,2023,9,3,6,21
5579,2023-07-29 11:30:00,131XX S RHODES AVE,ASSAULT,SIMPLE,RESIDENCE,True,True,2023,7,29,5,11
5580,2023-08-27 02:28:00,073XX S HOYNE AVE,BATTERY,AGGRAVATED DOMEST,,True,True,2023,8,27,6,2
