Project: Pre-process crime narratives from NYPD: crimes committed in parks
 - add variables useful for categorization and analysis
 - precincts and districts are Brooklyn-specific
Created By: Jasmine Soltani
Last modified: 3/10/2021

In [None]:
pip install PyPDF2

In [2]:
pip install textract

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from pandas import DataFrame

In [5]:
import PyPDF2

In [6]:
import textract

In [7]:
import csv

In [8]:
import datetime
import time

In [23]:
# Marker that opens every complaint narrative in the extracted PDF text.
_NARRATIVE_MARKER = "61#"

# Date/time layout used by the narratives, e.g. "9/20/2020 21:22" (the trailing
# " hrs" is removed before parsing).
_DTTM_FORMAT = '%m/%d/%Y %H:%M'

# Ordered (keywords, category) rules. The first rule with a keyword contained in
# the lower-cased crime description wins, so order matters: "deadly weapons" and
# "murder, unclassified" must be tested (as "Not Public") before the broader
# "weapons" and "murder" keywords.
_CRIME_CATEGORY_RULES = (
    (("assault", "harassment", "menacing", "strangulation"), "Assault"),
    (("jostling", "larceny"), "Larceny"),
    # crimes excluded from nyc open data complaints
    (("lost property", "unclassified complaint", "deadly weapons", "d.o.a.",
      "gun shot detection", "murder, unclassified", "unidentified person"), "Not Public"),
    (("mischief", "criminal contempt"), "Criminal mischief/contempt"),
    (("weapons",), "Weapons possession"),
    (("missing person",), "Missing Person"),
    (("controlled substance", "alcohol"), "Drugs/Alcohol"),
    (("sexual", "rape", "lewdness", "obscene material", "sodomy", "endangering welfare"), "Rape/Sex crimes"),
    (("manslaughter", "murder"), "Murder"),
    (("robbery",), "Robbery"),
    (("arson",), "Arson"),
    (("gambling",), "Gambling"),
    (("trespass",), "Trespassing"),
    (("reckless endangerment",), "Reckless Endangerment"),
)

# BK precinct -> district.
# NOTE(review): the previous if/elif chain had a second, unreachable branch
# mapping precinct "078" to district "19"; "078" keeps resolving to "06" here.
# Confirm which precinct (if any) should map to district "19".
_BK_PRECINCT_TO_DISTRICT = {
    "090": "01", "094": "01",
    "084": "02", "088": "02",
    "079": "03", "081": "03",
    "083": "04",
    "075": "05",
    "076": "06", "078": "06",
    "072": "07",
    "077": "08",
    "071": "09",
    "068": "10",
    "062": "11",
    "066": "12",
    "060": "13",
    "070": "14",
    "061": "15",
    "073": "16",
    "067": "17",
    "063": "18", "069": "18",
}


def _extract_field(segment, label, *terminators):
    """Return the stripped text that follows `label` inside one narrative.

    The value ends at the nearest terminator label found after `label`; when
    none of the terminators is present it runs to the end of the narrative.
    Returns '' when `label` itself is absent.
    """
    label_at = segment.find(label)
    if label_at == -1:
        return ""
    value_at = label_at + len(label)
    stops = [pos for pos in (segment.find(t, value_at) for t in terminators) if pos != -1]
    stop = min(stops) if stops else len(segment)
    return segment[value_at:stop].strip()


def _clean_dttm(raw):
    """Drop newlines and the surrounding ' hrs' decoration from a date/time value."""
    # str.strip(' hrs') removes any run of the characters ' ', 'h', 'r', 's'
    # from both ends; digits are untouched, so the timestamp itself survives.
    return raw.replace('\n', '').strip(' hrs')


def _time_of_day(tm):
    """Bucket a datetime.time into Morning / Afternoon / Evening / Night."""
    if tm < datetime.time(12, 0, 0):
        return "Morning"
    if tm < datetime.time(16, 0, 0):
        return "Afternoon"
    if tm < datetime.time(19, 0, 0):
        return "Evening"
    return "Night"


def _crime_category(crime):
    """Map a crime description onto its reporting category ('Other' if no rule matches)."""
    lowered = crime.lower()
    for keywords, category in _CRIME_CATEGORY_RULES:
        if any(keyword in lowered for keyword in keywords):
            return category
    return "Other"


def _parse_narrative(segment):
    """Turn the text of a single narrative (starting at '61#') into a dictionary."""
    cmplnt_num = _extract_field(segment, _NARRATIVE_MARKER, "Pct:")
    precinct = _extract_field(segment, "Pct:", "Sect:")
    status = _extract_field(segment, "Status:", "Domestic:")
    domestic = _extract_field(segment, "Domestic:", "Ttl Arrst:")
    arrest = _extract_field(segment, "Ttl Arrst:", "Juris:")
    rpt_dttm = _clean_dttm(_extract_field(segment, "Report Date/Time:", "From Date/Time:"))
    from_dttm = _clean_dttm(_extract_field(segment, "From Date/Time:", "To Date/Time:"))
    # BK North follows the "to" date with victim information; BK South goes
    # straight to the crime, so whichever label comes first ends the value.
    to_dttm = _clean_dttm(_extract_field(segment, "To Date/Time:", "Victim Name:", "Crime:"))
    crime = _extract_field(segment, "Crime:", "Location:")
    loc_addr = _extract_field(segment, "Location:", "Location Type:")
    loc_type = _extract_field(segment, "Location Type:", "Premise Name:")
    parks_nm = _extract_field(segment, "Premise Name:", "Weapon:")
    # BK South includes a PD Code before the property value; BK North does not.
    weapon = _extract_field(segment, "Weapon:", "PD Code:", "Prop. Value:")

    # separate victim sex/race/age into distinct variables
    victim_sex = victim_race = victim_age = ''
    if "Victim Name:" in segment:
        victim_sra = _extract_field(segment, "Victim S/R/A:", "Relationship:").split("/")
        if victim_sra[0]:
            # pad so a partial S/R/A value cannot raise IndexError
            victim_sra += [''] * (3 - len(victim_sra))
            victim_sex, victim_race, victim_age = victim_sra[:3]

    # format example: 9/20/2020 21:22 hrs
    rpt_obj = datetime.datetime.strptime(rpt_dttm, _DTTM_FORMAT)
    from_obj = datetime.datetime.strptime(from_dttm, _DTTM_FORMAT)

    # The "to" date is optional and occasionally free-form; default to None so a
    # bad value can never inherit the previous narrative's dates.
    to_dt = None
    to_tm = None
    if to_dttm:
        try:
            to_obj = datetime.datetime.strptime(to_dttm, _DTTM_FORMAT)
        except ValueError:
            print("Other to_dttm format")
        else:
            to_dt = to_obj.date()
            to_tm = to_obj.time()

    return {"cmplnt_num": cmplnt_num,
            "precinct": precinct,
            "district": _BK_PRECINCT_TO_DISTRICT.get(precinct, " "),
            "status": status,
            "domestic": domestic,
            "arrest": arrest,
            "rpt_dt": rpt_obj.date(),
            "rpt_tm": rpt_obj.time(),
            "rpt_mo": rpt_obj.month,
            "from_dt": from_obj.date(),
            "from_tm": from_obj.time(),
            "from_mo": from_obj.month,
            "from_timeOfDay": _time_of_day(from_obj.time()),
            "to_dt": to_dt,
            "to_tm": to_tm,
            "victim_sex": victim_sex,
            "victim_race": victim_race,
            "victim_age": victim_age,
            "crime": crime,
            "crime_cat": _crime_category(crime),
            "loc_addr": loc_addr,
            "loc_type": loc_type,
            "parks_nm": parks_nm,
            "weapon": weapon}


def narratives_to_data(text):
    """Parse extracted complaint-narrative text into a list of dictionaries.

    Every narrative starts with the marker "61#". Each one is cut out of `text`
    and parsed on its own, so a label missing from one narrative is never
    filled in from a later narrative.

    Parameters
    ----------
    text : str
        Text extracted from a complaint-narrative PDF.

    Returns
    -------
    list of dict
        One dictionary per narrative, in document order, with the keys
        cmplnt_num, precinct, district, status, domestic, arrest, rpt_dt,
        rpt_tm, rpt_mo, from_dt, from_tm, from_mo, from_timeOfDay, to_dt,
        to_tm, victim_sex, victim_race, victim_age, crime, crime_cat,
        loc_addr, loc_type, parks_nm and weapon. Returns an empty list when
        `text` contains no "61#" marker.

    Raises
    ------
    ValueError
        If a narrative's report or from date/time does not match
        "%m/%d/%Y %H:%M". An unparseable "to" date/time only prints a notice
        and yields to_dt = to_tm = None.
    """
    narratives = []
    start = text.find(_NARRATIVE_MARKER)
    while start != -1:
        # the next marker (if any) closes the current narrative
        end = text.find(_NARRATIVE_MARKER, start + 1)
        segment = text[start:] if end == -1 else text[start:end]
        narratives.append(_parse_narrative(segment))
        start = end
    return narratives

In [24]:
##requires PyPDF2 and pandas (imported above), plus narratives_to_data
def crimeExtract(infil, outfil):
    """Extract the complaint narratives from a PDF and export them as a CSV.

    The text of every page is concatenated, parsed with narratives_to_data,
    loaded into a pandas DataFrame, given a boolean "duplicate" column and
    written to `outfil` without the index.

    Parameters
    ----------
    infil : str
        Path of the PDF file holding the complaint narratives.
    outfil : str
        Path of the CSV file to write.
    """
    # Two rows are flagged as duplicates only when every one of these columns matches.
    duplicate_subset = ["precinct", "status", "domestic", "arrest", "rpt_dt", "rpt_tm", "from_dt", "from_tm", "victim_sex", "victim_race", "victim_age", "crime", "loc_addr", "loc_type", "parks_nm", "weapon"]

    # Read every page while the file is open; the handle is closed on exit
    # even if extraction fails.
    with open(infil, "rb") as pdfFileObj:
        if hasattr(PyPDF2, "PdfReader"):
            # current PyPDF2 API (PdfFileReader/getPage/extractText were removed in 3.x)
            pdfReader = PyPDF2.PdfReader(pdfFileObj)
            pageTexts = [pageObj.extract_text() or "" for pageObj in pdfReader.pages]
        else:
            # legacy PyPDF2 API
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            pageTexts = [pdfReader.getPage(pageNum).extractText() for pageNum in range(pdfReader.numPages)]

    #extract all text and store as text variable
    pdfText = "".join(pageTexts)

    #convert text to list of dictionaries for export
    data = narratives_to_data(pdfText)
    #convert list of dictionaries to pandas dataframe
    df = pd.DataFrame(data)
    #flag duplicates
    if df.empty:
        # no narratives found: the frame has no columns, so duplicated(subset=...)
        # would raise KeyError; export an empty "duplicate" column instead
        df["duplicate"] = pd.Series(dtype=bool)
    else:
        df["duplicate"] = df.duplicated(subset=duplicate_subset, keep=False)

    #export data
    df.to_csv(outfil, index=False)
        

In [20]:
# South Brooklyn narratives for 2020.09.20-2021.01.27: source PDF and CSV destination.
SBK_NARRATIVES_PDF = "J:/OPERATIONS/Analyst - Operations Folder/shared/Crime/Complaint Narratives/South BK/2020.09.20-2021.01.27_SBK.pdf"
SBK_EXTRACT_CSV = "J:/OPERATIONS/Analyst - Operations Folder/shared/Crime/Complaint Narratives/data_extracts/SBK_2020.09.20-2021.01.27.csv"

# Run the extraction: read the PDF and write the parsed narratives to the CSV.
crimeExtract(infil=SBK_NARRATIVES_PDF, outfil=SBK_EXTRACT_CSV)