# Kaggle dataset extractor and organizer

In [8]:
import os

os.environ['KAGGLE_CONFIG_DIR'] = 'C:/Users/afons/.kaggle/'

import zipfile
import pandas as pd
import kaggle

### Remove feather files, if they exist

In [9]:
data_folder = 'data'

# List all files in the data folder
files = os.listdir(data_folder)

# Remove Feather files (files with a ".feather" extension)
for file in files:
    if file.endswith(".feather"):
        file_path = os.path.join(data_folder, file)
        os.remove(file_path)
        print(f"Removed {file_path}")

print("Feather files removed from the 'data' folder.")

Removed data\Crime_Data_from_2010_to_2019_0.feather
Removed data\Crime_Data_from_2010_to_2019_1.feather
Removed data\Crime_Data_from_2010_to_2019_2.feather
Removed data\Crime_Data_from_2010_to_2019_3.feather
Removed data\Crime_Data_from_2010_to_2019_4.feather
Removed data\Crime_Data_from_2010_to_2019_5.feather
Removed data\Crime_Data_from_2010_to_2019_6.feather
Removed data\Crime_Data_from_2010_to_2019_7.feather
Removed data\Crime_Data_from_2010_to_2019_8.feather
Removed data\Crime_Data_from_2010_to_2019_9.feather
Removed data\Crime_Data_from_2020_to_Present.feather
Feather files removed from the 'data' folder.


## Download the kaggle dataset

In [10]:
# set the path to the dataset on Kaggle
dataset_path = 'sumaiaparveenshupti/los-angeles-crime-data-20102020'

# download the dataset to the data folder
kaggle.api.dataset_download_files(dataset_path, path='data')


## Extract the files from zip, get into feather files and remove csv and zip file

In [11]:

# set the path to the zip file
zip_path = 'data/los-angeles-crime-data-20102020.zip'

# extract all csv files from the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('data')

In [12]:
import csv
import os

# Create the "data" directory if it doesn't exist
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# Specify the input CSV file and the number of partitions
input_file = 'data/Crime_Data_from_2010_to_2019.csv'
num_partitions = 10  # Change this to the desired number of partitions

# Initialize a list of output CSV writers and files
output_writers = []
output_files = []

# Open the input file
with open(input_file, 'r') as input_csv:
    # Create a CSV reader for the input file
    csv_reader = csv.reader(input_csv)

    # Read the CSV header
    header = next(csv_reader)

    # Open output CSV files and writers for each partition in the "data" directory
    for i in range(num_partitions):
        partition_file = os.path.join(data_dir, f'Crime_Data_from_2010_to_2019_{i}.csv')
        output_file = open(partition_file, 'w', newline='')
        output_files.append(output_file)
        output_writers.append(csv.writer(output_file))

        # Write the header to each output file
        output_writers[i].writerow(header)

    # Iterate through the input CSV and distribute rows to output partitions
    current_partition = 0
    for row in csv_reader:
        output_writers[current_partition].writerow(row)
        current_partition = (current_partition + 1) % num_partitions  # Cycle through partitions

# Close all output CSV files
for output_file in output_files:
    output_file.close()

print(f'Partitioned into {num_partitions} smaller CSV files in the "data" folder.')


Partitioned into 10 smaller CSV files in the "data" folder.


In [13]:
# iterate over all csv files in the data folder
for file_name in os.listdir('data'):
    if file_name.endswith('.csv'):
        print(f'Processing {file_name}...')
        # read the csv file into a pandas dataframe
        df = pd.read_csv(os.path.join('data', file_name))
        
        # transform the dataframe into more space-efficient datatypes
        for col in df.columns:
            if df[col].dtype == 'int64':
                df[col] = pd.to_numeric(df[col], downcast='integer')
            elif df[col].dtype == 'float64':
                df[col] = pd.to_numeric(df[col], downcast='float')
        
        # save the transformed dataframe as a feather file
        feather_path = os.path.join('data', os.path.splitext(file_name)[0] + '.feather')
        df.to_feather(feather_path)

Processing Crime_Data_from_2010_to_2019.csv...
Processing Crime_Data_from_2010_to_2019_0.csv...
Processing Crime_Data_from_2010_to_2019_1.csv...
Processing Crime_Data_from_2010_to_2019_2.csv...
Processing Crime_Data_from_2010_to_2019_3.csv...
Processing Crime_Data_from_2010_to_2019_4.csv...
Processing Crime_Data_from_2010_to_2019_5.csv...
Processing Crime_Data_from_2010_to_2019_6.csv...
Processing Crime_Data_from_2010_to_2019_7.csv...
Processing Crime_Data_from_2010_to_2019_8.csv...
Processing Crime_Data_from_2010_to_2019_9.csv...
Processing Crime_Data_from_2020_to_Present.csv...


In [14]:
# delete the zip file and the csv files
os.remove('data/los-angeles-crime-data-20102020.zip')
for file_name in os.listdir('data'):
    if file_name.endswith('.csv'):
        os.remove(os.path.join('data', file_name))
os.remove('data/Crime_Data_from_2010_to_2019.feather')

### Folder size

In [15]:
def get_folder_size(path):
    total_size = 0

    # Walk through the directory tree and add up the sizes of all files and subdirectories
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            total_size += os.path.getsize(file_path)

    return total_size

# Specify the path to the folder you want to measure
folder_path = 'data'

# Get the folder size in bytes
size_in_bytes = get_folder_size(folder_path)

# Convert the size to a more human-readable format (e.g., MB, GB)
def convert_bytes_to_readable(size_in_bytes):
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_in_bytes < 1024.0:
            break
        size_in_bytes /= 1024.0
    return f"{size_in_bytes:.2f} {unit}"

folder_size_readable = convert_bytes_to_readable(size_in_bytes)

print(f"The folder size is: {folder_size_readable}")

The folder size is: 247.90 MB


## Spread the feathers into dataframes, after merging

In [16]:
cd1 = pd.read_feather('data/Crime_Data_from_2010_to_2019_0.feather')
cd2 = pd.read_feather('data/Crime_Data_from_2010_to_2019_1.feather')
cd3 = pd.read_feather('data/Crime_Data_from_2010_to_2019_2.feather')
cd4 = pd.read_feather('data/Crime_Data_from_2010_to_2019_3.feather')
cd5 = pd.read_feather('data/Crime_Data_from_2010_to_2019_4.feather')
cd6 = pd.read_feather('data/Crime_Data_from_2010_to_2019_5.feather')
cd7 = pd.read_feather('data/Crime_Data_from_2010_to_2019_6.feather')
cd8 = pd.read_feather('data/Crime_Data_from_2010_to_2019_7.feather')
cd9 = pd.read_feather('data/Crime_Data_from_2010_to_2019_8.feather')
cd10 = pd.read_feather('data/Crime_Data_from_2010_to_2019_9.feather')
cd11 = pd.read_feather('data/Crime_Data_from_2020_to_Present.feather')

cd = pd.concat([cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8, cd9, cd10, cd11], axis=0)

cd.columns

Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA ', 'AREA NAME',
       'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes',
       'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc',
       'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1',
       'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT',
       'LON', 'AREA'],
      dtype='object')

In [17]:
cdc = cd[['DR_NO','Crm Cd', 'Crm Cd Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'Status', 'Status Desc']]
cdd = cd[['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC']]
cda = cd[['DR_NO', 'AREA', 'AREA NAME', 'Rpt Dist No', 'LOCATION', 'Cross Street', 'LAT', 'LON']]
cdv = cd[['DR_NO', 'Vict Age', 'Vict Sex', 'Vict Descent']]
cds = cd[['DR_NO', 'Premis Cd', 'Premis Desc', 'Mocodes', 'Weapon Used Cd', 'Weapon Desc']]

cdc.head()

Unnamed: 0,DR_NO,Crm Cd,Crm Cd Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,Status,Status Desc
0,1307355,900,VIOLATION OF COURT ORDER,900.0,,,,AA,Adult Arrest
1,100100521,624,BATTERY - SIMPLE ASSAULT,624.0,,,,IC,Invest Cont
2,100100562,626,INTIMATE PARTNER - SIMPLE ASSAULT,626.0,,,,IC,Invest Cont
3,100100581,624,BATTERY - SIMPLE ASSAULT,624.0,,,,AA,Adult Arrest
4,100100628,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",230.0,,,,AA,Adult Arrest
