In [None]:
# Standard Import Stuff

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score, silhouette_score)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.cluster import KMeans

from google.colab import drive
# Attach Google Drive so the cleaned dataset stored there can be read.
drive.mount('/content/drive')

path = "/content/drive/My Drive/Data Files/cleaned_rq1.csv"
rq1_df = pd.read_csv(path)

# Quick sanity check: frame dimensions first, then the default describe() summary.
print(rq1_df.shape, rq1_df.describe(), sep="\n")


In [None]:
# Some resolution times are negative, which is impossible; these are potentially
# datetime errors carried over from the pulled dataset.
# Floor them at 0, i.e. assume the case was resolved the same day.
# resolution_days has a mean of 11.97 against a stdev of 61.08, so it is heavily
# right-skewed by high outliers; a log transform will be applied to lessen their influence.

# clip(lower=0) raises every value below zero up to zero and leaves all other values
# (including missing ones) untouched.
rq1_df['resolution_days'] = rq1_df['resolution_days'].clip(lower=0)

rq1_df.describe(include='all')

In [None]:
# Post-pandemic records are assumed to differ greatly and would therefore skew a single
# train/test split, so the frame is divided into the two timelines suggested by the EDA:
# 2016-2019 and 2020-2023.

# Parse both timestamp columns so the .dt accessor can be used on them.
for datetime_col in ('occurred_datetime', 'report_datetime'):
    rq1_df[datetime_col] = pd.to_datetime(rq1_df[datetime_col])

# Year in which each incident occurred; this drives the period assignment.
occurred_year = rq1_df['occurred_datetime'].dt.year

# Series.between is inclusive on both ends by default.
rq1_time1_df = rq1_df[occurred_year.between(2016, 2019)]
rq1_time2_df = rq1_df[occurred_year.between(2020, 2023)]

print("2016-2019 Shape: ", rq1_time1_df.shape)
print("2020-2023 Shape: ", rq1_time2_df.shape)

In [None]:
# Prepare the modelling inputs: for each period build a one-hot encoded design matrix
# and a log-scaled target, then hold out 30% of the rows for testing (70/30 split) so
# that several models can be compared on the same unseen data.
# Two period dataframes are in play here; the original rq1_df should remain unchanged.
# Features to choose from are 'offense', 'family_violence', 'location_type', 'tract_geoid'
# Target variable is 'resolution_days'
# Random state is Pi, everyone loves Pi.

# --- Period 1 (2016-2019) ---
# `columns=` is passed explicitly so every listed feature is one-hot encoded whatever
# dtype read_csv inferred for it. Without it, get_dummies only converts
# object/string/category columns, so a numerically parsed 'tract_geoid' would pass
# through as a continuous number even though it is an identifier.
# NOTE(review): the dtype of 'tract_geoid' is not visible in this cell -- confirm.
period1_features = ['offense', 'location_type', 'tract_geoid']
X1 = pd.get_dummies(rq1_time1_df[period1_features], columns=period1_features, drop_first=True)
# log1p = log(1 + x): dampens the long right tail and keeps zero-day resolutions finite.
y1 = np.log1p(rq1_time1_df['resolution_days'])

# --- Period 2 (2020-2023) ---
# NOTE(review): only 'offense' is used here while period 1 uses three features --
# confirm this is intentional before comparing the two periods' models.
X2 = rq1_time2_df[['offense']]
y2 = np.log1p(rq1_time2_df['resolution_days'])

# drop_first=True mirrors the period-1 encoding: the first level becomes the reference
# category, so the dummies are not perfectly collinear with a model intercept.
X2_encoded = pd.get_dummies(X2, columns=['offense'], drop_first=True)

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=314)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_encoded, y2, test_size=0.3, random_state=314)