In [114]:
import pandas as pd
import numpy as np

## Data Cleaning

### Brief Summary: 
Due to the pre-cleaned nature of the data, the dataset utilized for this analysis consists of no duplicate observations as well as no missing values. Thus, the cleaning process only consists of selecting relevant rows and columns to conduct time series forecasting using both univariate and multivariate datasets.

### Step 1: Load the Data and remove irrelevant columns

In [126]:
#Load the data
data=pd.read_csv(r"C:\Users\hajar\Time-Series-Crime-Forecasting-Minneapolis-3\data\Crime_Data.csv")

#Remove irrelevant columns
cols_to_be_deleted = [
    "Case_Number", "Case_NumberAlt", "Occurred_Date",
    "NIBRS_Code","Problem_Initial", "Problem_Final","Address",
    "wgsXAnon","wgsYAnon", "X","Y","OBJECTID","Offense","NIBRS_Group"
]

data=data.drop(columns=cols_to_be_deleted)
data

Unnamed: 0,Type,Reported_Date,NIBRS_Crime_Against,Offense_Category,Precinct,Neighborhood,Ward,Latitude,Longitude,Crime_Count
0,Additional Crime Metrics,2019/01/05 02:50:00+00,Non NIBRS Data,Subset of NIBRS Assault Offenses,1.0,Elliot Park,6.0,44.96717,-93.26638,1
1,Additional Crime Metrics,2019/01/08 12:22:00+00,Non NIBRS Data,Subset of NIBRS Assault Offenses,4.0,Cleveland,4.0,45.01595,-93.31080,1
2,Additional Crime Metrics,2019/01/10 14:10:00+00,Non NIBRS Data,Subset of NIBRS Assault Offenses,4.0,Willard - Hay,5.0,44.99875,-93.30506,1
3,Additional Crime Metrics,2019/01/27 06:52:00+00,Non NIBRS Data,Subset of NIBRS Assault Offenses,4.0,Webber - Camden,4.0,45.03545,-93.30123,1
4,Additional Crime Metrics,2019/02/03 03:24:00+00,Non NIBRS Data,Subset of NIBRS Assault Offenses,2.0,Holland,1.0,45.01089,-93.24469,1
...,...,...,...,...,...,...,...,...,...,...
323390,Shots Fired Calls,2025/03/07 00:20:43+00,Non NIBRS Data,Shots Fired Calls,4.0,Willard - Hay,5.0,44.99687,-93.30444,1
323391,Shots Fired Calls,2025/03/07 14:24:33+00,Non NIBRS Data,Shots Fired Calls,4.0,Lind - Bohanon,4.0,45.04843,-93.29278,1
323392,Shots Fired Calls,2025/03/08 00:12:37+00,Non NIBRS Data,Shots Fired Calls,3.0,Seward,2.0,44.95918,-93.22761,1
323393,Shots Fired Calls,2025/03/09 19:44:43+00,Non NIBRS Data,Shots Fired Calls,4.0,Folwell,4.0,44.97179,-93.26176,1


### Step 2: Remove all rows that don't pretain to Crime Offenses (NIBRS) or are labled "Not a Crime"

In [127]:
data = data[data.Type =="Crime Offenses (NIBRS)"]
data = data[data.NIBRS_Crime_Against != "Not a Crime "]
#data.NIBRS_Crime_Against.value_counts()
data

Unnamed: 0,Type,Reported_Date,NIBRS_Crime_Against,Offense_Category,Precinct,Neighborhood,Ward,Latitude,Longitude,Crime_Count
15,Crime Offenses (NIBRS),2019/01/01 07:53:00+00,Person,Assault Offenses,5.0,Lowry Hill East,10.0,44.95643,-93.29186,1
16,Crime Offenses (NIBRS),2019/01/04 10:07:00+00,Property,Destruction/Damage/Vandalism of Property,1.0,North Loop,3.0,44.98658,-93.27071,1
17,Crime Offenses (NIBRS),2019/01/01 03:23:00+00,Property,Destruction/Damage/Vandalism of Property,2.0,Marshall Terrace,1.0,45.01407,-93.26314,1
33,Crime Offenses (NIBRS),2019/01/02 04:04:00+00,Property,Larceny/Theft Offenses,2.0,Como,1.0,44.98879,-93.22155,1
34,Crime Offenses (NIBRS),2019/01/04 05:27:00+00,Property,Stolen Property Offenses,4.0,Webber - Camden,4.0,45.02756,-93.28540,1
...,...,...,...,...,...,...,...,...,...,...
318591,Crime Offenses (NIBRS),2025/03/06 21:19:00+00,Property,Larceny/Theft Offenses,5.0,East Bde Maka Ska,10.0,44.93952,-93.30155,1
318592,Crime Offenses (NIBRS),2025/03/07 13:04:00+00,Property,Larceny/Theft Offenses,3.0,Hiawatha,12.0,44.91445,-93.21708,1
318593,Crime Offenses (NIBRS),2025/03/08 11:57:00+00,Property,Destruction/Damage/Vandalism of Property,5.0,Lyndale,8.0,44.94293,-93.28051,1
318594,Crime Offenses (NIBRS),2025/03/10 11:44:00+00,Property,Larceny/Theft Offenses,3.0,Regina,8.0,44.92152,-93.26829,1


### Step 3: Create a seperate Column for Time


In [128]:
#create two new columns for date and time
data[['Date','Time']]= data.Reported_Date.str.split(expand=True)

#remove type (since they all have the same type) and crime count since each observation has a crime count of 1
data= data.drop(columns=["Type","Crime_Count","Reported_Date"])
data

Unnamed: 0,NIBRS_Crime_Against,Offense_Category,Precinct,Neighborhood,Ward,Latitude,Longitude,Date,Time
15,Person,Assault Offenses,5.0,Lowry Hill East,10.0,44.95643,-93.29186,2019/01/01,07:53:00+00
16,Property,Destruction/Damage/Vandalism of Property,1.0,North Loop,3.0,44.98658,-93.27071,2019/01/04,10:07:00+00
17,Property,Destruction/Damage/Vandalism of Property,2.0,Marshall Terrace,1.0,45.01407,-93.26314,2019/01/01,03:23:00+00
33,Property,Larceny/Theft Offenses,2.0,Como,1.0,44.98879,-93.22155,2019/01/02,04:04:00+00
34,Property,Stolen Property Offenses,4.0,Webber - Camden,4.0,45.02756,-93.28540,2019/01/04,05:27:00+00
...,...,...,...,...,...,...,...,...,...
318591,Property,Larceny/Theft Offenses,5.0,East Bde Maka Ska,10.0,44.93952,-93.30155,2025/03/06,21:19:00+00
318592,Property,Larceny/Theft Offenses,3.0,Hiawatha,12.0,44.91445,-93.21708,2025/03/07,13:04:00+00
318593,Property,Destruction/Damage/Vandalism of Property,5.0,Lyndale,8.0,44.94293,-93.28051,2025/03/08,11:57:00+00
318594,Property,Larceny/Theft Offenses,3.0,Regina,8.0,44.92152,-93.26829,2025/03/10,11:44:00+00


### Step 5: create two cleaned data sets (univariate and multivariate)

In [132]:
univar_dat=data.drop(columns=["Time","Longitude","Latitude", "Ward", "Neighborhood","Precinct", "Offense_Category","NIBRS_Crime_Against"])
multvar_dat=data

### Step 6: Create two new CSV files for the cleaned datasets

In [None]:
#fix univariate data

#create column with each date
#create another column with counts of crimes for each date
#univar_dat['Crimes']= data.Date.value_counts()
#univar_dat.Date.drop_duplicates()
#univar_dat

univar_dat.to_csv(r"C:\Users\hajar\Time-Series-Crime-Forecasting-Minneapolis-3\data\Univ_Data.csv", index=False)
multvar_dat.to_csv(r"C:\Users\hajar\Time-Series-Crime-Forecasting-Minneapolis-3\data\Multiv_Data.csv", index=False)

Unnamed: 0,Date,Crimes
15,2019/01/01,
16,2019/01/04,
17,2019/01/01,
33,2019/01/02,
34,2019/01/04,
...,...,...
318591,2025/03/06,
318592,2025/03/07,
318593,2025/03/08,
318594,2025/03/10,


### Step 7: Visualize the first couple rows of the cleaned data

In [124]:
multvar_dat

Unnamed: 0,Reported_Date,NIBRS_Crime_Against,Offense_Category,Precinct,Neighborhood,Ward,Latitude,Longitude,Date,Time
15,2019/01/01 07:53:00+00,Person,Assault Offenses,5.0,Lowry Hill East,10.0,44.95643,-93.29186,2019/01/01,07:53:00+00
16,2019/01/04 10:07:00+00,Property,Destruction/Damage/Vandalism of Property,1.0,North Loop,3.0,44.98658,-93.27071,2019/01/04,10:07:00+00
17,2019/01/01 03:23:00+00,Property,Destruction/Damage/Vandalism of Property,2.0,Marshall Terrace,1.0,45.01407,-93.26314,2019/01/01,03:23:00+00
33,2019/01/02 04:04:00+00,Property,Larceny/Theft Offenses,2.0,Como,1.0,44.98879,-93.22155,2019/01/02,04:04:00+00
34,2019/01/04 05:27:00+00,Property,Stolen Property Offenses,4.0,Webber - Camden,4.0,45.02756,-93.28540,2019/01/04,05:27:00+00
...,...,...,...,...,...,...,...,...,...,...
318591,2025/03/06 21:19:00+00,Property,Larceny/Theft Offenses,5.0,East Bde Maka Ska,10.0,44.93952,-93.30155,2025/03/06,21:19:00+00
318592,2025/03/07 13:04:00+00,Property,Larceny/Theft Offenses,3.0,Hiawatha,12.0,44.91445,-93.21708,2025/03/07,13:04:00+00
318593,2025/03/08 11:57:00+00,Property,Destruction/Damage/Vandalism of Property,5.0,Lyndale,8.0,44.94293,-93.28051,2025/03/08,11:57:00+00
318594,2025/03/10 11:44:00+00,Property,Larceny/Theft Offenses,3.0,Regina,8.0,44.92152,-93.26829,2025/03/10,11:44:00+00
