In [1]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from urllib.request import urlretrieve
from IPython.core.display import display, HTML
%matplotlib inline

In [2]:
# Notebook styling: seaborn plot defaults, no cap on displayed DataFrame
# columns, and full-width notebook cells.
sns.set()
pd.set_option('display.max_columns', None)
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
def ensure_dir_exists(DIR_PATH):
    """Create the directory ``DIR_PATH`` (and any missing parents) if needed.

    Parameters
    ----------
    DIR_PATH : str or os.PathLike
        Directory that must exist after the call.

    Raises
    ------
    FileExistsError
        If ``DIR_PATH`` already exists but is not a directory.
    """
    # exist_ok=True folds "check" and "create" into one call, so there is no
    # window in which another process can create the directory between the
    # existence test and makedirs (which would raise FileExistsError).
    os.makedirs(DIR_PATH, exist_ok=True)

# Raw data directory, relative to the notebook's working directory
# (one level up, in 'data_raw'); created here if it is missing.
DATA_DIR = os.path.join('..', 'data_raw')
ensure_dir_exists(DATA_DIR)

# Examining files in a zipped archive

In [4]:
from zipfile import ZipFile
import shutil

def download_city_TheTrace_data(data_url, DEBUG = False, DATA_DIR=DATA_DIR):
    """Download a zip archive into ``DATA_DIR`` and extract it there.

    The archive is stored as ``DATA_DIR/<text after the last '/' in data_url>``.
    The download is skipped when that file already exists; extraction is
    skipped when a directory named after the archive (without its ``.zip``
    extension) already exists in ``DATA_DIR``. Progress is reported via
    ``print``; nothing is returned.

    Parameters
    ----------
    data_url : str
        URL of the zip archive.
    DEBUG : bool, default False
        When true, print the resolved directory and file paths.
    DATA_DIR : str, default module-level ``DATA_DIR``
        Directory that receives the archive and its extracted contents.

    Raises
    ------
    ValueError
        If no file name can be derived from ``data_url`` (e.g. it ends in '/').
    """
    file_name = data_url.split('/')[-1]
    if not file_name:
        raise ValueError(f"Cannot derive a file name from URL: {data_url!r}")
    dir_path = os.fspath(DATA_DIR)
    file_path = os.path.join(dir_path, file_name)
    if DEBUG: print(f"dir_path:  {dir_path}")
    if DEBUG: print(f"file_path: {file_path}")
    ensure_dir_exists(dir_path)

    if not os.path.isfile(file_path):
        # Download to a temporary name and rename on success, so an
        # interrupted transfer never leaves a truncated zip that later runs
        # would mistake for a complete download.
        partial_path = file_path + '.part'
        try:
            urlretrieve(data_url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"{file_name} successfully downloaded")

    # Strip only the trailing extension (case-insensitively); a plain
    # str.replace would also remove '.zip' from the middle of the name.
    stem, extension = os.path.splitext(file_name)
    city_dir_name = stem if extension.lower() == '.zip' else file_name
    city_data_dir = os.path.join(dir_path, city_dir_name)

    if os.path.isfile(file_path) and not os.path.isdir(city_data_dir):
        with ZipFile(file_path, 'r') as zip_ref:
            # Skip the macOS resource-fork entries instead of extracting and
            # then deleting them; this also leaves any pre-existing
            # '__MACOSX' directory in dir_path untouched.
            wanted_members = [
                member for member in zip_ref.namelist()
                if member.split('/', 1)[0] != '__MACOSX'
            ]
            zip_ref.extractall(dir_path, members=wanted_members)
        print(f"{file_name} successfully extracted")
    else:
        print('Data already downloaded and extracted')

In [5]:
# Fetch and unpack the Chicago archive into DATA_DIR (skipped if already on disk).
download_city_TheTrace_data(data_url = 'https://freetoshootagain.s3.amazonaws.com/Chicago_IL.zip')

Data already downloaded and extracted


In [6]:
# Show what is currently in the raw-data directory.
os.listdir(DATA_DIR)

['Chicago_IL.zip', 'Chicago_IL']

The `download_city_TheTrace_data()` function has already extracted the zip file, but if you want to explore the archive's contents while it is still zipped, you can do the following.

In [7]:
# Path to the downloaded Chicago archive inside the raw-data directory.
ZIP_PATH = os.path.join(DATA_DIR, 'Chicago_IL.zip')

# Open the archive read-only and print every member name without extracting.
with ZipFile(ZIP_PATH, 'r') as zip_ref:
    # Kept as a list so the next cell can search the member names.
    zipped_file_names = zip_ref.namelist()
    for file_name in zipped_file_names:
        print(file_name)

Chicago_IL/
Chicago_IL/Case status definitions.pdf
__MACOSX/
__MACOSX/Chicago_IL/
__MACOSX/Chicago_IL/._Case status definitions.pdf
Chicago_IL/READ_ME.xlsx
__MACOSX/Chicago_IL/._READ_ME.xlsx
Chicago_IL/14370_P519598_Ryley_Homicides_and_Shootings_original_Jan_2001_thru_Aug_31_2019.xlsx
__MACOSX/Chicago_IL/._14370_P519598_Ryley_Homicides_and_Shootings_original_Jan_2001_thru_Aug_31_2019.xlsx


In [8]:
# Take the first archive member whose name contains 'Homicides_and_Shootings'
# and does not contain 'MACOSX' (this skips the '__MACOSX/...' duplicate).
# Raises IndexError if no member matches.
excel_file_name = [fn for fn in zipped_file_names if 'Homicides_and_Shootings' in fn and 'MACOSX' not in fn][0]
excel_file_name

'Chicago_IL/14370_P519598_Ryley_Homicides_and_Shootings_original_Jan_2001_thru_Aug_31_2019.xlsx'

In [9]:
from io import BytesIO

# Read the workbook straight out of the archive without extracting it to disk.
# The member's bytes are copied into an in-memory buffer first: `excel_file`
# is parsed in later cells, after this `with` block has closed the archive,
# and an Excel engine that reads sheets lazily would otherwise be left
# reading from a closed zip stream.
with ZipFile(ZIP_PATH, 'r') as zip_ref:
    excel_file = pd.ExcelFile(BytesIO(zip_ref.read(excel_file_name)))

# Names of the worksheets available in the workbook.
file_sheet_names = excel_file.sheet_names

In [10]:
# Display the worksheet names found in the workbook.
file_sheet_names

['Shooting Incidents', 'Homicides']

In [11]:
# Load the 'Shooting Incidents' sheet into a DataFrame and preview two rows.
shooting_df = excel_file.parse(sheet_name='Shooting Incidents')
shooting_df.head(2)

Unnamed: 0,RD,ID,Date,IUCR,IUCR Primary Description,Weapon Used,Status,Address,City,Beat,District
0,HS100027,7296966.0,2010-01-01,041A,BATTERY,FIREARM,1-SUSPENDED,78XX S WOOD ST,CHICAGO,611.0,6.0
1,HS100196,7297039.0,2010-01-01,041A,BATTERY,FIREARM,1-SUSPENDED,22XX W ARTHUR AVE,CHICAGO,2412.0,24.0


In [12]:
# Load the 'Homicides' sheet into a DataFrame and preview two rows.
homicide_df = excel_file.parse(sheet_name='Homicides')
homicide_df.head(2)

Unnamed: 0,RD,ID,Date,IUCR,IUCR Primary Description,Injury Type,Cleared?2,Date Cleared,Address,City,Beat,District,Death Cause Category,Victim Name,Victim Sex,Victim Age,Victim Race,Victim Rel to Offender
0,G057558,671.0,2001-01-01,110.0,HOMICIDE,STABBED,Y,2001-02-26,76XX S MAY ST,CHICAGO,612.0,6.0,ALTERCATION,QUANSON BUCKLEY,M,28.0,BLACK,SOME ACQUAINTANCE
1,G000705,634.0,2001-01-01,110.0,HOMICIDE,SHOT,N,NaT,24XX W MONROE ST,CHICAGO,1125.0,11.0,ALTERCATION,MARKITA RICHARDSON,F,24.0,BLACK,NOT ESTABLISHED
