In [18]:
%load_ext autoreload
%autoreload 2

# Processing each task

from pandas import DataFrame
from pathlib import Path
import sys

# Add the project directory to sys.path
project_dir = Path.cwd().parent  # Adjust as needed to point to your project root
sys.path.append(str(project_dir))

# Import the module
from utils.dslabs_functions import get_variable_types, encode_cyclic_variables, dummify
from utils.data_loader import DataLoader


dataloader = DataLoader()
data, target = dataloader.get_security_classification_dataset_and_target()
# data: DataFrame = read_csv("data/stroke_mvi.csv", index_col="id", na_values="")
vars: dict[str, list] = get_variable_types(data)

yes_no: dict[str, int] = {"no": 0, "No": 0, "yes": 1, "Yes": 1}
residence_type_values: dict[str, int] = {"Rural": 0, "Urban": 1}

encoding: dict[str, dict[str, int]] = {
    # "LAW_CAT_CD": residence_type_values,
    # "hypertension": yes_no,
}
df: DataFrame = data.replace(encoding, inplace=False)
df.head()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,CLASS__security
0,167627343,2017-08-01,779.0,"PUBLIC ADMINISTRATION,UNCLASSIFIED FELONY",126.0,MISCELLANEOUS PENAL LAW,PL 215510D,F,K,62,0.0,25-44,M,WHITE,987500.0,157358.0,40.598595,-73.988298,NY
1,140205878,2014-12-16,750.0,RESISTING ARREST,359.0,OFFENSES AGAINST PUBLIC ADMINISTRATION,PL 2053000,M,M,25,4.0,25-44,M,BLACK,1004138.0,226326.0,40.787875,-73.928182,nonNY
2,141497929,2015-03-19,639.0,AGGRAVATED HARASSMENT 2,361.0,OFF. AGNST PUB ORD SENSBLTY & RGHTS TO PRIV,PL 2403002,M,M,30,0.0,18-24,M,WHITE HISPANIC,999751.0,241188.0,40.828675,-73.94399,NY
3,61262720,2009-04-29,969.0,"TRAFFIC,UNCLASSIFIED INFRACTION",881.0,OTHER TRAFFIC INFRACTION,VTL051101A,M,M,28,0.0,25-44,M,BLACK,999537.0,232842.0,40.805768,-73.944782,NY
4,71392833,2010-02-18,567.0,"MARIJUANA, POSSESSION 4 & 5",235.0,DANGEROUS DRUGS,PL 2211001,M,S,120,0.0,25-44,M,WHITE,944398.0,165687.0,40.621368,-74.143557,NY


In [19]:

for v in vars["symbolic"]:
    print(v, data[v].unique())

PD_DESC ['PUBLIC ADMINISTRATION,UNCLASSIFIED FELONY' 'RESISTING ARREST'
 'AGGRAVATED HARASSMENT 2' 'TRAFFIC,UNCLASSIFIED INFRACTION'
 'MARIJUANA, POSSESSION 4 & 5' 'PUBLIC ADMINISTRATION,UNCLASSI'
 'CONTEMPT,CRIMINAL' 'CONTROLLED SUBSTANCE, POSSESSION 7'
 'IDENTITY THFT-1' 'CONTROLLED SUBSTANCE, INTENT TO SELL 5'
 'THEFT OF SERVICES, UNCLASSIFIED' 'RECKLESS ENDANGERMENT 1'
 'INTOXICATED DRIVING,ALCOHOL' 'ASSAULT 3' 'TRESPASS 2, CRIMINAL'
 'LARCENY,GRAND FROM OPEN AREAS,UNCLASSIFIED'
 'MISCHIEF,CRIMINAL     UNCLASSIFIED 4TH DEG' 'ASSAULT 2,1,UNCLASSIFIED'
 'BRIBERY,PUBLIC ADMINISTRATION' 'LARCENY,PETIT FROM OPEN AREAS,'
 'ROBBERY,UNCLASSIFIED,OPEN AREAS' 'TRAFFIC,UNCLASSIFIED MISDEMEAN'
 'MARIJUANA, SALE 4 & 5' 'FORGERY,ETC.-MISD.' 'WEAPONS, POSSESSION, ETC'
 'TRESPASS 3, CRIMINAL' 'LARCENY,GRAND FROM PERSON,UNCLASSIFIED'
 'LARCENY,PETIT FROM OPEN AREAS,UNCLASSIFIED' 'UNAUTHORIZED USE VEHICLE 2'
 'CONTROLLED SUBSTANCE,POSSESS. 3' 'BURGLARY,UNCLASSIFIED,UNKNOWN TIME'
 'TRAFFIC,UNCLASSIFI

## because there are so many symbolic variables with multiple values, we dummify the variables and encode them as numbers.

In [20]:
df = dummify(data, vars["symbolic"])


In [21]:
# Age group mapping
age_map = {"18-24": 1, "25-44": 2, "45-64": 3, "65+": 4, "<18": 0}
df["AGE_GROUP"] = data["AGE_GROUP"].map(age_map).fillna(-1)

# Encode race
race_encoder = {race: idx for idx, race in enumerate(data["PERP_RACE"].unique())}
df["PERP_RACE"] = data["PERP_RACE"].map(race_encoder)

df.head()

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,KY_CD,LAW_CAT_CD,ARREST_PRECINCT,JURISDICTION_CODE,PERP_SEX,X_COORD_CD,Y_COORD_CD,...,AGE_GROUP_<18,PERP_RACE_AMERICAN INDIAN/ALASKAN NATIVE,PERP_RACE_ASIAN / PACIFIC ISLANDER,PERP_RACE_BLACK,PERP_RACE_BLACK HISPANIC,PERP_RACE_UNKNOWN,PERP_RACE_WHITE,PERP_RACE_WHITE HISPANIC,AGE_GROUP,PERP_RACE
0,167627343,2017-08-01,779.0,126.0,F,62,0.0,M,987500.0,157358.0,...,False,False,False,False,False,False,True,False,2,0
1,140205878,2014-12-16,750.0,359.0,M,25,4.0,M,1004138.0,226326.0,...,False,False,False,True,False,False,False,False,2,1
2,141497929,2015-03-19,639.0,361.0,M,30,0.0,M,999751.0,241188.0,...,False,False,False,False,False,False,False,True,1,2
3,61262720,2009-04-29,969.0,881.0,M,28,0.0,M,999537.0,232842.0,...,False,False,False,True,False,False,False,False,2,1
4,71392833,2010-02-18,567.0,235.0,M,120,0.0,M,944398.0,165687.0,...,False,False,False,False,False,False,True,False,2,0
