In [1]:
import re
from glob import glob
from docx import Document

In [2]:
# Identify our crime reports.
# glob() returns paths in arbitrary, OS-dependent order; sort them so that
# files[0] and the order of the extracted rows are the same on every run.
files = sorted(glob('crime_reports/*.docx'))
len(files)

84

In [9]:
# Helper for pulling the text out of a Word document with the docx library
#  https://python-docx.readthedocs.io/en/latest/
def get_text(file):
    """Return the text of every paragraph in a .docx file as one string.

    Args:
        file: The document to read; passed straight to ``docx.Document``.

    Returns:
        The paragraph texts, in document order, joined by newlines.
    """
    paragraphs = Document(file).paragraphs
    lines = [paragraph.text for paragraph in paragraphs]
    return '\n'.join(lines)

In [10]:
# Load the first report's text as a sample to try the patterns against
text = get_text(files[0])

In [12]:
# Patterns created at regex101
# Coordinates look like "(lat, lon)" in decimal degrees. The integer part must
# accept the digit 0 (e.g. 40.44 or -80.0) and up to three digits for the
# longitude (e.g. -118.24); a [1-9]{1,2} class silently rejects both.
coord_pattern = re.compile(r'\(-?[0-9]{1,2}\.[0-9]{1,10}, -?[0-9]{1,3}\.[0-9]{1,10}\)')
# Dates look like M/D/2016 (only the year 2016 is matched).
date_pattern = re.compile(r'[0-9]{1,2}\/[0-9]{1,2}\/2016')
# 12-hour clock times such as "7:24 AM". The (AM|PM) group is capturing, so use
# .search(...).group() for the full match; findall() would return only AM/PM.
time_pattern = re.compile(r'[0-9]{1,2}:[0-9]{2} (AM|PM)')

In [28]:
# Sanity check: first time-of-day match in the sample text
time_pattern.search(text).group()

'7:24 AM'

In [29]:
# Apply our patterns to each file, and create one dictionary of results per file.
# Coordinates and date are required (they feed the geometry and the timestamp);
# the time of day is optional and is left out of "dtg" when it is not found.
results = []

for file in files:
    text = get_text(file)

    coord_match = coord_pattern.search(text)
    date_match = date_pattern.search(text)
    if coord_match is None or date_match is None:
        # Name the offending report instead of failing with a bare IndexError.
        missing = "coordinates" if coord_match is None else "date"
        raise ValueError(f"No {missing} found in {file}")

    # Strip the surrounding parentheses, then split the "lat, lon" pair.
    lat, lon = coord_match.group()[1:-1].split(', ')

    date = date_match.group()
    time_match = time_pattern.search(text)

    # Append the time only when present, so a missing time leaves no trailing space.
    dtg = date if time_match is None else date + " " + time_match.group()

    results.append({
        "lat": float(lat),
        "lon": float(lon),
        "dtg": dtg,
        "text": text
    })

In [33]:
import pandas as pd
import geopandas as gpd

In [34]:
# Turn the per-file results into a GeoDataFrame of points and save it.
# points_from_xy takes x first, so longitude goes before latitude; EPSG:4326 is WGS 84.
df = pd.DataFrame(results)
points = gpd.points_from_xy(df["lon"], df["lat"])
gdf = gpd.GeoDataFrame(df, geometry=points, crs=4326)

# NOTE(review): shapefile attribute (DBF) text fields are length-limited (254 chars),
# so the full report in the "text" column is likely truncated on write - confirm,
# and consider a format such as GeoPackage if the full text is needed.
gdf.to_file("CrimeReports.shp")