In [1]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from urllib.request import urlretrieve
from IPython.core.display import display, HTML
%matplotlib inline

In [2]:
# Notebook-wide presentation settings.
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Never elide columns when a DataFrame is rendered.
pd.set_option("display.max_columns", None)
# Apply seaborn's default plot styling.
sns.set()

In [3]:
def ensure_dir_exists(DIR_PATH):
    """Make sure the directory ``DIR_PATH`` exists, creating it if needed.

    Missing parent directories are created as well. Calling this on a
    directory that already exists is a no-op.

    Args:
        DIR_PATH: Path of the directory that must exist after the call.

    Raises:
        FileExistsError: If ``DIR_PATH`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate isdir() check, so there is no window
    # between "check" and "create" in which another process could create the
    # directory and make makedirs() raise.
    os.makedirs(DIR_PATH, exist_ok=True)

# Root directory for raw data, resolved relative to the kernel's current
# working directory (one level up, in "data_raw").
ROOT_DATA_DIR = os.path.join('..', 'data_raw')
# Create it on first run so the cells below can rely on it being present.
ensure_dir_exists(ROOT_DATA_DIR)

In [5]:
# The "compressed" sub-directory of the raw-data root.
DATA_DIR = os.path.join(ROOT_DATA_DIR, "compressed")
# Display its entries as the cell output; raises FileNotFoundError if the
# directory does not exist (only ROOT_DATA_DIR itself is created above).
os.listdir(DATA_DIR)

['gz']

# Decompressing a gz file in memory (straight into a DataFrame)

In [10]:
import gzip

In [14]:
# Directory holding the gzip archives.
gz_dir = os.path.join(ROOT_DATA_DIR, "compressed", "gz")
# Keep only "*.gz" entries and sort them. A later cell writes the decompressed
# CSV into this same directory, and os.listdir() returns entries in arbitrary
# order, so without the filter a re-run could pick the extracted ".csv" as the
# "first gz file" and fail when gzip tries to read it.
gz_file_names = sorted(
    name for name in os.listdir(gz_dir) if name.endswith(".gz")
)
gz_file_names

['NYC_Open_Data_Film_Permits_sample5k.csv.gz']

In [16]:
# Full path of the first listed file. Indexing with [0] raises IndexError if
# the listing is empty.
gz_file_path = os.path.join(gz_dir, gz_file_names[0])
gz_file_path

'../data_raw/compressed/gz/NYC_Open_Data_Film_Permits_sample5k.csv.gz'

In [15]:
# Parse the gzip-compressed CSV into a DataFrame; pandas opens and
# decompresses the file itself, so no decompressed copy is written to disk.
df = pd.read_csv(gz_file_path, compression="gzip")

In [13]:
# Preview the first two rows to check that the CSV parsed as expected.
df.head(2)

Unnamed: 0,EventID,EventType,StartDateTime,EndDateTime,EnteredOn,EventAgency,ParkingHeld,Borough,CommunityBoard(s),PolicePrecinct(s),Category,SubCategoryName,Country,ZipCode(s)
0,446040,Shooting Permit,10/19/2018 02:00:00 PM,10/20/2018 04:00:00 AM,10/16/2018 11:57:27 AM,"Mayor's Office of Film, Theatre & Broadcasting",THOMPSON STREET between PRINCE STREET and SPRI...,Manhattan,2,1,Television,Cable-episodic,United States of America,10012
1,446168,Shooting Permit,10/19/2018 02:00:00 PM,10/20/2018 02:00:00 AM,10/16/2018 07:03:56 PM,"Mayor's Office of Film, Theatre & Broadcasting",MARBLE HILL AVENUE between WEST 227 STREET an...,Manhattan,"12, 8","34, 50",Film,Feature,United States of America,"10034, 10463"


# Decompressing a gz file to a file on disk

In [17]:
import shutil

In [24]:
# Work out which archive to decompress and where the result should be written.
gz_dir = os.path.join(ROOT_DATA_DIR, "compressed", "gz")
# Only "*.gz" entries, sorted: the decompressed CSV is written into this same
# directory, and os.listdir() order is arbitrary, so an unfiltered listing can
# return the already-extracted ".csv" first on a re-run.
gz_file_names = sorted(
    name for name in os.listdir(gz_dir) if name.endswith(".gz")
)
print(f"gz_file_names: \n    {gz_file_names}")
if not gz_file_names:
    # Fail with a clear message instead of a bare IndexError below.
    raise FileNotFoundError(f"No .gz files found in {gz_dir}")
gz_file_name = gz_file_names[0]
print(f"gz_file_name: \n    {gz_file_name}")
gz_file_path = os.path.join(gz_dir, gz_file_name)
print(f"gz_file_path: \n    {gz_file_path}")
# Drop only the trailing ".gz" ("name.csv.gz" -> "name.csv").
output_data_file_name = os.path.splitext(gz_file_name)[0]
print(f"output_data_file_name: \n    {output_data_file_name}")
# The decompressed file is placed next to the archive.
output_data_file_path = os.path.join(gz_dir, output_data_file_name)
print(f"output_data_file_path: \n    {output_data_file_path}")

gz_file_names: 
    ['NYC_Open_Data_Film_Permits_sample5k.csv.gz']
gz_file_name: 
    NYC_Open_Data_Film_Permits_sample5k.csv.gz
gz_file_path: 
    ../data_raw/compressed/gz/NYC_Open_Data_Film_Permits_sample5k.csv.gz
output_data_file_name: 
    NYC_Open_Data_Film_Permits_sample5k.csv
output_data_file_path: 
    ../data_raw/compressed/gz/NYC_Open_Data_Film_Permits_sample5k.csv


In [25]:
# Stream the decompressed bytes from the archive into the output file.
# copyfileobj copies in chunks, so the whole payload is never held in memory.
with gzip.open(gz_file_path, 'rb') as f_in, open(output_data_file_path, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [26]:
# List the directory again to confirm the decompressed file now sits next to
# the archive. os.listdir() does not guarantee any particular order.
os.listdir(gz_dir)

['NYC_Open_Data_Film_Permits_sample5k.csv',
 'NYC_Open_Data_Film_Permits_sample5k.csv.gz']