In [None]:
# One-time environment setup: uncomment and run the lines below if any package is missing.
# !pip install kaggle
# !pip install numpy pandas numerize lets_plot
# !pip install python-dotenv
# !pip install plotnine

In [2]:
import kaggle
import csv
import os
import json
import numpy as np
import pandas as pd

from numerize import numerize as nz

from lets_plot import *
from lets_plot.mapping import *
LetsPlot.setup_html()
from dotenv import dotenv_values

# Part I: Data Collection

## Setting up credentials

- We aim to collect raw crime data from Kaggle.
- To download the datasets, we first need to obtain API credentials from Kaggle.

In [3]:
# Read the Kaggle API credentials from the local .env file.
config = dotenv_values(".env")

# Fail fast with an actionable message: a missing key would otherwise surface as a bare
# KeyError, and a key declared without a value (parsed as None) as an opaque TypeError
# when it is assigned to os.environ below.
missing_keys = [key for key in ("KAGGLE_USERNAME", "KAGGLE_KEY") if not config.get(key)]
if missing_keys:
    raise KeyError(f"Missing or empty entries in .env: {', '.join(missing_keys)}")

kaggle_username = config["KAGGLE_USERNAME"]
kaggle_key = config["KAGGLE_KEY"]

# Expose the credentials as environment variables for the Kaggle client
os.environ["KAGGLE_USERNAME"] = kaggle_username
os.environ["KAGGLE_KEY"] = kaggle_key

- Now we can use the Kaggle API to download Kaggle datasets. We extract crime data for both Baltimore and Vancouver as our raw data and store them separately in two dataframes named `df_BALTIMORE` and `df_VANCOUVER`.

In [4]:
# --- Kaggle credentials file -------------------------------------------------
# Resolve the credentials location once instead of re-expanding it at every use.
kaggle_config_dir = os.path.expanduser("~/.kaggle")
kaggle_config_file = os.path.join(kaggle_config_dir, "kaggle.json")
os.makedirs(kaggle_config_dir, exist_ok=True)

# Save the credentials to the kaggle.json file
api_token = {"username": kaggle_username, "key": kaggle_key}
with open(kaggle_config_file, "w") as file:
    json.dump(api_token, file)

# Restrict the token file to owner read/write only
os.chmod(kaggle_config_file, 0o600)

# `kaggle` was imported before the credentials above were set, so authenticate again
# to make sure the client uses the values loaded from .env.
kaggle.api.authenticate()

# --- Download ----------------------------------------------------------------
# NOTE: despite its ".csv" suffix, this path is used as a *directory* that receives
# the unzipped dataset files. It is kept unchanged so existing data layouts still match.
download_path = r'../data/raw/baltimore.csv'

# Create the download directory if it does not exist
os.makedirs(download_path, exist_ok=True)

# Download the dataset to the specified path
kaggle.api.dataset_download_files('sohier/crime-in-baltimore', path=download_path, unzip=True)

# --- Load --------------------------------------------------------------------
# os.listdir() returns entries in arbitrary order, so sort to make the choice of file
# deterministic, and raise a clear error instead of an IndexError when nothing was extracted.
csv_files = sorted(file for file in os.listdir(download_path) if file.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {download_path!r} after download")
csv_file = csv_files[0]
csv_path = os.path.join(download_path, csv_file)

# Drop the columns that are not used in the analysis, parse the crime date and tag
# every row with its city.
df_BALTIMORE = (
    pd.read_csv(csv_path)
    .drop(columns=['CrimeCode', 'Inside/Outside', 'Weapon', 'Post', 'District', 'Location 1', 'Premise'])
    .assign(
        CrimeDate=lambda frame: pd.to_datetime(frame['CrimeDate']),
        City='Baltimore',
    )
)
df_BALTIMORE

Dataset URL: https://www.kaggle.com/datasets/sohier/crime-in-baltimore


Unnamed: 0,CrimeDate,CrimeTime,Location,Description,Neighborhood,Longitude,Latitude,Total Incidents,City
0,2017-09-02,23:30:00,4200 AUDREY AVE,ROBBERY - RESIDENCE,Brooklyn,-76.60541,39.22951,1,Baltimore
1,2017-09-02,23:00:00,800 NEWINGTON AVE,AUTO THEFT,Reservoir Hill,-76.63217,39.31360,1,Baltimore
2,2017-09-02,22:53:00,600 RADNOR AV,SHOOTING,Winston-Govans,-76.60697,39.34768,1,Baltimore
3,2017-09-02,22:50:00,1800 RAMSAY ST,AGG. ASSAULT,Carrollton Ridge,-76.64526,39.28315,1,Baltimore
4,2017-09-02,22:31:00,100 LIGHT ST,COMMON ASSAULT,Downtown West,-76.61365,39.28756,1,Baltimore
...,...,...,...,...,...,...,...,...,...
276524,2012-01-01,00:00:00,1400 JOH AVE,LARCENY,Violetville,-76.67195,39.26132,1,Baltimore
276525,2012-01-01,00:00:00,5500 SINCLAIR LN,LARCENY,Frankford,-76.53829,39.32493,1,Baltimore
276526,2012-01-01,00:00:00,400 N PATTERSON PK AV,LARCENY,CARE,-76.58497,39.29573,1,Baltimore
276527,2012-01-01,00:00:00,5800 LILLYAN AV,BURGLARY,Glenham-Belhar,-76.54578,39.34701,1,Baltimore


In [5]:
# NOTE: despite its ".csv" suffix, this path is used as a *directory* that receives
# the unzipped dataset files.
download_path = r'../data/raw/vancouver.csv'

# Create the download directory if it does not exist
os.makedirs(download_path, exist_ok=True)

# Download the dataset to the specified path
kaggle.api.dataset_download_files('wosaku/crime-in-vancouver', path=download_path, unzip=True)

# List the extracted CSV files in a deterministic order and fail with a clear message
# (rather than an IndexError) when the download produced none.
csv_files = sorted(file for file in os.listdir(download_path) if file.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {download_path!r} after download")

# Prefer crime.csv when the archive contains it; otherwise fall back to the first CSV found.
csv_file = 'crime.csv' if 'crime.csv' in csv_files else csv_files[0]
csv_path = os.path.join(download_path, csv_file)

# Read the file that was just downloaded instead of a hard-coded path that is
# unrelated to download_path.
df_VANCOUVER = pd.read_csv(csv_path)
df_VANCOUVER.head()

Dataset URL: https://www.kaggle.com/datasets/wosaku/crime-in-vancouver


Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16.0,15.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15.0,20.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16.0,40.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11.0,15.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17.0,45.0,9XX TERMINAL AVE,Strathcona,493906.5,5457452.47,49.269802,-123.083763


In [None]:
# Save the collected dataframes to data/raw.
# ../data/raw/baltimore.csv and ../data/raw/vancouver.csv are the directories holding the
# raw Kaggle downloads, so the exports below use distinct file names.
os.makedirs("../data/raw", exist_ok=True)

# index=False: the default integer index carries no information and would otherwise
# come back as an extra unnamed column when the files are reloaded.
df_BALTIMORE.to_csv("../data/raw/baltimore_crime.csv", index=False)
df_VANCOUVER.to_csv("../data/raw/vancouver_crime.csv", index=False)