In [1]:
# Run to stitch the active dispatch tables together.  
# input: ../data/active_dispatch/[time code].csv  
# output: ../data/active_dispatch.csv  

In [3]:
import pandas as pd
import os
import re

In [5]:
# step 1 - merge!

# step 1a - loop through the files, merging them

In [7]:
ad = pd.DataFrame(columns = ['incident_type_code','incident_type_name','call_received_time','location',
                             'location_description','city_name','last_updated'])

In [9]:
folder = '../data/active_dispatch/'
files = [f for f in os.listdir(folder)]

print('starting...')

# processing file X of Y: filename.csv

x = 1
y = len(files)

for file in files :
    print(f"\rprocessing file {x} of {y}: {file}                  ", end="")
    next_df = pd.read_csv(folder + str(file))
    ad = pd.concat([ad, next_df])
    x += 1

print()
print('done!')

starting...
processing file 2088 of 2088: 1744387021000.csv                  done!


In [10]:
print(len(ad)) # how many records?

26184


In [11]:
# make sure I know which columns have nulls...
nulls_per_column = ad.isnull().sum()
print(nulls_per_column)

incident_type_code          0
incident_type_name          0
call_received_time          0
location                    0
location_description    26184
city_name                   0
last_updated                0
dtype: int64


In [12]:
# step 1b - group by to get rid of the duplicates, keeping only the latest last_updated record
# Group By [incident_type_code], [incident_type_name], etc... EVERYTHING EXCEPT [last_updated]... use the MAX([last_updated])

In [13]:
#grouped_sum = df.groupby(['Category', 'Subcategory'])['Value'].sum()
#ad.groupby(['incident_type_code','incident_type_name','call_received_time','location',
#                             'location_description','city_name'])['last_updated'].max()
ad = ad.groupby(['incident_type_code','incident_type_name','call_received_time',
                 'location','city_name'])['last_updated'].max().reset_index()

In [14]:
ad = ad.sort_values(by = 'call_received_time').reset_index()

In [15]:
ad

Unnamed: 0,index,incident_type_code,incident_type_name,call_received_time,location,city_name,last_updated
0,514,70A,RESIDENCE-BURGLARY ALARM,1743085667000,719 MYRTLE ST,EAST,1743103621000
1,1145,71A,NON-RESIDENCE-BURGLARY ALARM,1743085918000,3188 DICKERSON PIKE,EAST,1743103621000
2,2260,83P,SHOTS FIRED,1743093178000,1433 PENNOCK AVE,EAST,1743115680000
3,2223,71P,BURGLARY-NON-RESIDENCE BREAK-IN,1743094882000,360 WALLACE RD,PARAGON MILLS,1743110280000
4,515,70A,RESIDENCE-BURGLARY ALARM,1743095110000,1236 BRENTWOOD HIGHLANDS DR,NIPPERS CORNER,1743103621000
...,...,...,...,...,...,...,...
3012,2221,71A,NON-RESIDENCE-BURGLARY ALARM,1744384055000,2606 EUGENIA AVE,WOODBINE,1744387021000
3013,2222,71A,NON-RESIDENCE-BURGLARY ALARM,1744384891000,3106 BELMONT BLVD,BELMONT,1744385220000
3014,129,53A,ROBERRY/HOLD UP ALARM,1744385291000,800 FORT NEGLEY BLVD,FORT NEGLEY,1744386481000
3015,988,70A,RESIDENCE-BURGLARY ALARM,1744385848000,2927 GLENMEADE DR,INGLEWOOD,1744387021000


### step 2 - calculate and format!

In [28]:
calculated_ad = pd.DataFrame(columns = ['tencode',   # regex to only pull the number part
                                        'incident_type_name',
                                        'call_received_date_time', # convert from UNIX time stamp
                                        'cr_date',
                                        'cr_time',  #			5:45 PM
                                        'cr_display_month', # Jan, Feb, Mar, etc
                                        'cr_month_num', # 1, 2, 3
                                        'cr_year',
                                        'cr_weekday', # Sun, Mon, Tues, Wed
                                        'cr_weekday_num', # 1, 2, 3, 4
                                        'cr_display_hour', #		0-23
                                        'cr_hour', #			1-12
                                        'cr_min',
                                        'cr_ampm',
                                        'address', # -> location + ", NASHVILLE, TN"
                                        #	(Estimated Resolution Time: ert)
                                        #	add 240,000 to the UNIX time stamp of the last_updated time (4 minutes * 60 seconds * 1000 miliseconds)
                                        'ert_date',
                                        'ert_time',
                                        'ert_display_month',
                                        'ert_month_num',
                                        'ert_year',
                                        'ert_weekday',
                                        'ert_weekday_num',
                                        'ert_display_hour',
                                        'ert_hour',
                                        'ert_min',
                                        'ert_ampm'
                                       ])

In [32]:
# processing row X (index) of Y: row[call_received_time]

x = 1
y = len(ad)

for index, row in ad.iterrows() :
    print(f"\rprocessing row {x} (index {index}) of {y}: call received: {row['call_received_time']}          ", end="")
    code = row['incident_type_code']
    match = re.search(r'^(\d+)', code)    # search for a digit or multiple, followed by a space, followed by any number of any characters
    if match :
        code = match.group(1)
    else :
        print('Missed a regex match in a row... look into this and try again')
    
    new_row = {'tencode': code,
               'incident_type_name': row['incident_type_name']}
    calculated_ad.loc[len(calculated_ad)] = new_row
    x += 1

print()
print('done!')

processing row 3017 (index 3016) of 3017: call received: 1744386630000          
done!


In [None]:
# Continue from here!  Process the entirety of the row!

In [36]:
# look for nulls again...
nulls_per_column = calculated_ad.isnull().sum()
print(nulls_per_column)

tencode                       0
incident_type_name            0
call_received_date_time    6034
cr_date                    6034
cr_time                    6034
cr_display_month           6034
cr_month_num               6034
cr_year                    6034
cr_weekday                 6034
cr_weekday_num             6034
cr_display_hour            6034
cr_hour                    6034
cr_min                     6034
cr_ampm                    6034
address                    6034
ert_date                   6034
ert_time                   6034
ert_display_month          6034
ert_month_num              6034
ert_year                   6034
ert_weekday                6034
ert_weekday_num            6034
ert_display_hour           6034
ert_hour                   6034
ert_min                    6034
ert_ampm                   6034
dtype: int64


In [34]:
calculated_ad

Unnamed: 0,tencode,incident_type_name,call_received_date_time,cr_date,cr_time,cr_display_month,cr_month_num,cr_year,cr_weekday,cr_weekday_num,...,ert_time,ert_display_month,ert_month_num,ert_year,ert_weekday,ert_weekday_num,ert_display_hour,ert_hour,ert_min,ert_ampm
0,70,RESIDENCE-BURGLARY ALARM,,,,,,,,,...,,,,,,,,,,
1,71,NON-RESIDENCE-BURGLARY ALARM,,,,,,,,,...,,,,,,,,,,
2,83,SHOTS FIRED,,,,,,,,,...,,,,,,,,,,
3,71,BURGLARY-NON-RESIDENCE BREAK-IN,,,,,,,,,...,,,,,,,,,,
4,70,RESIDENCE-BURGLARY ALARM,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6029,71,NON-RESIDENCE-BURGLARY ALARM,,,,,,,,,...,,,,,,,,,,
6030,71,NON-RESIDENCE-BURGLARY ALARM,,,,,,,,,...,,,,,,,,,,
6031,53,ROBERRY/HOLD UP ALARM,,,,,,,,,...,,,,,,,,,,
6032,70,RESIDENCE-BURGLARY ALARM,,,,,,,,,...,,,,,,,,,,
