In [2]:
import pandas as pd
import numpy as np

In [3]:
# Show every column when displaying wide frames. `None` is the documented
# "no limit" value; the previous `False` only worked because it is treated as 0.
pd.options.display.max_columns = None

## 1. Clean tbl_JuvenileHistory

### Initial Data Inspection


The dataset contains approximately 3 million rows. To avoid memory issues and reduce unnecessary processing, only the first 100 rows will be inspected initially:

- Get an overview of the data types (`dtypes`)
- Identify columns/features worth keeping for the EDA
- Skip any columns that appear to be irrelevant or redundant

This initial check will help streamline the analysis and focus only on useful information.

In [4]:
# Raw export is tab-separated; read only a 100-row sample for the first look.
juvenile_history_path = "tbl_JuvenileHistory.csv"
juvenile_history = pd.read_csv(juvenile_history_path, sep="\t", nrows=100)

In [5]:
juvenile_history.head()

Unnamed: 0,idnJuvenileHistory,idnCase,idnProceeding,idnJuvenile,DATCREATEDON,DATMODIFIEDON
0,5,2046990,3200129,1,2014-09-06 19:24:46.373,
1,6,2047179,3199488,1,2014-09-06 19:24:46.373,
2,7,2047179,3199489,1,2014-09-06 19:24:46.373,
3,8,2047199,3199497,1,2014-09-06 19:24:46.373,
4,9,2047199,3199498,1,2014-09-06 19:24:46.373,


In [6]:
juvenile_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   idnJuvenileHistory  100 non-null    int64  
 1   idnCase             100 non-null    int64  
 2   idnProceeding       100 non-null    int64  
 3   idnJuvenile         100 non-null    int64  
 4   DATCREATEDON        100 non-null    object 
 5   DATMODIFIEDON       0 non-null      float64
dtypes: float64(1), int64(4), object(1)
memory usage: 4.8+ KB


In [7]:
juvenile_history.columns

Index(['idnJuvenileHistory', 'idnCase', 'idnProceeding', 'idnJuvenile',
       'DATCREATEDON', 'DATMODIFIEDON'],
      dtype='object')

**`idnJuvenile`**: Although stored as `int64`, this column only contains values from 1 to 6 (as per `Lookup/tblLookup_Juvenile.csv`). It should be treated as a categorical feature.

**`DATCREATEDON`** and **`DATMODIFIEDON`**:
  - `DATCREATEDON` likely reflects the date when the record was entered into the system, not when the case itself was initiated.
  - `DATMODIFIEDON` contains only null values and may be dropped.

In [8]:
# Column -> dtype for the four columns kept; the keys double as the `usecols` list.
juvenile_history_dtypes = {
    "idnJuvenileHistory": "Int64",
    "idnCase": "Int64",
    "idnProceeding": "Int64",
    "idnJuvenile": "category",
}

# Full load, restricted to the selected columns.
juvenile_history = pd.read_csv(
    juvenile_history_path,
    sep="\t",
    usecols=list(juvenile_history_dtypes),
    dtype=juvenile_history_dtypes,
    low_memory=False,
)

### Data Inspection

In [9]:
juvenile_history.head()

Unnamed: 0,idnJuvenileHistory,idnCase,idnProceeding,idnJuvenile
0,5,2046990,3200129,1
1,6,2047179,3199488,1
2,7,2047179,3199489,1
3,8,2047199,3199497,1
4,9,2047199,3199498,1


In [10]:
juvenile_history.shape

(2857093, 4)

In [11]:
juvenile_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2857093 entries, 0 to 2857092
Data columns (total 4 columns):
 #   Column              Dtype   
---  ------              -----   
 0   idnJuvenileHistory  Int64   
 1   idnCase             Int64   
 2   idnProceeding       Int64   
 3   idnJuvenile         category
dtypes: Int64(3), category(1)
memory usage: 76.3 MB


In [12]:
juvenile_history.isna().sum()

idnJuvenileHistory      0
idnCase                 0
idnProceeding          98
idnJuvenile           999
dtype: int64

In [13]:
juvenile_history.duplicated().sum()

0

#### `idnCase` List

Extracted unique `idnCase` values from `tbl_JuvenileHistory`  
to filter `A_TblCase` and retain only juvenile-related records  
without performing a full table join.

In [14]:
# Distinct case ids referenced by the juvenile history table; used later to filter
# A_TblCase. `idnCase` has no missing values (see the null counts above).
juvenile_case_ids = juvenile_history["idnCase"].unique()

#### `idnProceeding` List

Extracted unique `idnProceeding` values from `tbl_JuvenileHistory`  
to filter `B_TblProceeding` and retain only juvenile-related records  
without performing a full table join.

In [15]:
# Distinct proceeding ids referenced by the juvenile history table; missing ids are
# dropped first so the list holds only real keys.
juvenile_proceeding_ids = juvenile_history["idnProceeding"].dropna().unique()

In [16]:
# Persist the trimmed table as a gzip-compressed CSV without the index column.
juvenile_history.to_csv(
    path_or_buf="../outputs/juvenile_history_cleaned.csv.gz",
    compression="gzip",
    index=False,
)

### Cleaned `tbl_JuvenileHistory`

- Loaded ~2.86M rows from the raw CSV file.
- Dropped irrelevant or fully null columns.
- Converted `idnJuvenile` to a categorical variable.
- Retained only 4 key fields:
  - `idnJuvenileHistory`: primary key
  - `idnCase`: foreign key to `A_TblCase`
  - `idnProceeding`: foreign key to `B_TblProceeding`
  - `idnJuvenile`: foreign key to `tblLookup_Juvenile`
- Missing values:
  - `idnProceeding`: 98 missing
  - `idnJuvenile`: 999 missing

Saved cleaned file as:
- `juvenile_history_cleaned.csv.gz` 

## 2. Clean A_TblCase

### Initial Data Inspection


The dataset contains approximately 12 million rows. To avoid memory issues and reduce unnecessary processing, only the first 1000 rows will be inspected initially:

- Get an overview of the data types (`dtypes`)
- Identify columns/features worth keeping for the EDA
- Skip any columns that appear to be irrelevant or redundant

This initial check will help streamline the analysis and focus only on useful information.

In [17]:
# Tab-separated case table; read a 1000-row sample to inspect columns and dtypes.
case_path = "A_TblCase.csv"
cases = pd.read_csv(case_path, sep="\t", nrows=1000)

In [18]:
cases.head()

Unnamed: 0,IDNCASE,ALIEN_CITY,ALIEN_STATE,ALIEN_ZIPCODE,UPDATED_ZIPCODE,UPDATED_CITY,NAT,LANG,CUSTODY,SITE_TYPE,E_28_DATE,ATTY_NBR,CASE_TYPE,UPDATE_SITE,LATEST_HEARING,LATEST_TIME,LATEST_CAL_TYPE,UP_BOND_DATE,UP_BOND_RSN,CORRECTIONAL_FAC,RELEASE_MONTH,RELEASE_YEAR,INMATE_HOUSING,DATE_OF_ENTRY,C_ASY_TYPE,C_BIRTHDATE,C_RELEASE_DATE,UPDATED_STATE,ADDRESS_CHANGEDON,ZBOND_MRG_FLAG,GENDER,DATE_DETAINED,DATE_RELEASED,LPR,DETENTION_DATE,DETENTION_LOCATION,DCO_LOCATION,DETENTION_FACILITY_TYPE,CASEPRIORITY_CODE
0,11782069,SAINT CHARLES,IL,60174,,,VE,SP,N,M,,,RMV,CHI,2026-01-20 00:00:00.000,900.0,M,,,,,,,2023-05-04 00:00:00.000,,11/1999,,,,,F,,,,,,,,
1,11782070,PLYMOUTH,MA,2360,,,EC,SP,D,M,,,RMV,CHE,,,,,,,,,,2023-04-30 00:00:00.000,,11/1983,,,2024-11-19 08:32:39.000,,M,2024-11-14 00:00:00.000,,,,,,,
2,11782071,WEST SACRAMENTO,CA,95691,,,RU,RUS,N,M,,,RMV,SFR,2026-08-19 00:00:00.000,1330.0,M,,,,,,,2023-05-04 00:00:00.000,E,5/1985,,,2024-11-01 12:22:02.000,,F,,,,,,,,
3,11782072,SAN JOSE,CA,95117,,,MX,SP,N,M,2024-04-19 11:32:17.227,0.0,RMV,SFR,2028-02-17 00:00:00.000,1330.0,M,,,,,,,2023-05-03 00:00:00.000,E,9/1999,,,2025-03-17 14:07:53.000,,F,,,,,,,,
4,11782074,KEARNS,UT,84118,,,VE,SP,N,M,,,RMV,SLC,2025-11-25 00:00:00.000,1400.0,M,,,,,,,2023-04-30 00:00:00.000,E,9/2004,,,,,F,,,,,,,,


In [19]:
cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   IDNCASE                  1000 non-null   int64  
 1   ALIEN_CITY               958 non-null    object 
 2   ALIEN_STATE              974 non-null    object 
 3   ALIEN_ZIPCODE            974 non-null    object 
 4   UPDATED_ZIPCODE          0 non-null      float64
 5   UPDATED_CITY             0 non-null      float64
 6   NAT                      1000 non-null   object 
 7   LANG                     1000 non-null   object 
 8   CUSTODY                  1000 non-null   object 
 9   SITE_TYPE                878 non-null    object 
 10  E_28_DATE                271 non-null    object 
 11  ATTY_NBR                 526 non-null    float64
 12  CASE_TYPE                1000 non-null   object 
 13  UPDATE_SITE              1000 non-null   object 
 14  LATEST_HEARING           

In [20]:
cases.columns

Index(['IDNCASE', 'ALIEN_CITY', 'ALIEN_STATE', 'ALIEN_ZIPCODE',
       'UPDATED_ZIPCODE', 'UPDATED_CITY', 'NAT', 'LANG', 'CUSTODY',
       'SITE_TYPE', 'E_28_DATE', 'ATTY_NBR', 'CASE_TYPE', 'UPDATE_SITE',
       'LATEST_HEARING', 'LATEST_TIME', 'LATEST_CAL_TYPE', 'UP_BOND_DATE',
       'UP_BOND_RSN', 'CORRECTIONAL_FAC', 'RELEASE_MONTH', 'RELEASE_YEAR',
       'INMATE_HOUSING', 'DATE_OF_ENTRY', 'C_ASY_TYPE', 'C_BIRTHDATE',
       'C_RELEASE_DATE', 'UPDATED_STATE', 'ADDRESS_CHANGEDON',
       'ZBOND_MRG_FLAG', 'GENDER', 'DATE_DETAINED', 'DATE_RELEASED', 'LPR',
       'DETENTION_DATE', 'DETENTION_LOCATION', 'DCO_LOCATION',
       'DETENTION_FACILITY_TYPE', 'CASEPRIORITY_CODE'],
      dtype='object')

### Selected Features for EDA – `A_TblCase`

The list of selected columns below was discussed earlier in the documentation for the source dataset. It represents the core case-level features relevant to our analysis of juvenile immigration cases.

These fields include:

- Demographic information (e.g., `GENDER`, `NAT`, `LANG`)  
- Case characteristics (e.g., `CASE_TYPE`, `CUSTODY`, `LATEST_HEARING`)  
- Key dates (e.g., `DATE_OF_ENTRY`, `DETENTION_DATE`, `C_BIRTHDATE`)

In [21]:
# Columns of A_TblCase kept for the EDA. The list is used both to trim the sample
# frame and as `usecols` for the full load.
# NOTE: the name `selected_columns` is rebound in section 3 for B_TblProceeding.
selected_columns = [
    "IDNCASE",
    "NAT",
    "LANG",
    "CUSTODY",
    "CASE_TYPE",
    "LATEST_HEARING",
    "LATEST_CAL_TYPE",
    "DATE_OF_ENTRY",
    "C_BIRTHDATE",
    "GENDER",
    "DATE_DETAINED",
    "DATE_RELEASED",
    "DETENTION_DATE",
]

In [22]:
# Keep only the selected columns, in the order listed.
cases = cases.loc[:, selected_columns]

In [23]:
cases.head()

Unnamed: 0,IDNCASE,NAT,LANG,CUSTODY,CASE_TYPE,LATEST_HEARING,LATEST_CAL_TYPE,DATE_OF_ENTRY,C_BIRTHDATE,GENDER,DATE_DETAINED,DATE_RELEASED,DETENTION_DATE
0,11782069,VE,SP,N,RMV,2026-01-20 00:00:00.000,M,2023-05-04 00:00:00.000,11/1999,F,,,
1,11782070,EC,SP,D,RMV,,,2023-04-30 00:00:00.000,11/1983,M,2024-11-14 00:00:00.000,,
2,11782071,RU,RUS,N,RMV,2026-08-19 00:00:00.000,M,2023-05-04 00:00:00.000,5/1985,F,,,
3,11782072,MX,SP,N,RMV,2028-02-17 00:00:00.000,M,2023-05-03 00:00:00.000,9/1999,F,,,
4,11782074,VE,SP,N,RMV,2025-11-25 00:00:00.000,M,2023-04-30 00:00:00.000,9/2004,F,,,


In [24]:
cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   IDNCASE          1000 non-null   int64  
 1   NAT              1000 non-null   object 
 2   LANG             1000 non-null   object 
 3   CUSTODY          1000 non-null   object 
 4   CASE_TYPE        1000 non-null   object 
 5   LATEST_HEARING   859 non-null    object 
 6   LATEST_CAL_TYPE  854 non-null    object 
 7   DATE_OF_ENTRY    643 non-null    object 
 8   C_BIRTHDATE      558 non-null    object 
 9   GENDER           554 non-null    object 
 10  DATE_DETAINED    39 non-null     object 
 11  DATE_RELEASED    5 non-null      object 
 12  DETENTION_DATE   0 non-null      float64
dtypes: float64(1), int64(1), object(11)
memory usage: 101.7+ KB


#### Note on `DETENTION_DATE`

This field is entirely null in the sample, but we'll reassess after  
loading the full dataset.

In [25]:
cases.dtypes

IDNCASE              int64
NAT                 object
LANG                object
CUSTODY             object
CASE_TYPE           object
LATEST_HEARING      object
LATEST_CAL_TYPE     object
DATE_OF_ENTRY       object
C_BIRTHDATE         object
GENDER              object
DATE_DETAINED       object
DATE_RELEASED       object
DETENTION_DATE     float64
dtype: object

#### Specifying Column Data Types

- `Int64`: Used for `IDNCASE` to allow nullable integer values.
- `category`: Applied to string columns with repeated values  
  (e.g., `NAT`, `LANG`, `CUSTODY`, `CASE_TYPE`, `LATEST_CAL_TYPE`, `GENDER`)  
  for efficient storage and faster processing.
- `string`: Used for `C_BIRTHDATE` since it uses a partial date format (`MM/YYYY`)  
  and may contain nulls.
- `float64`: Used for `DETENTION_DATE`, which appears to contain only nulls in the  
  sample but may include numeric timestamps or intervals in the full dataset.

In [26]:
# Explicit dtypes for the full A_TblCase load. Date columns are intentionally
# absent from this mapping.
dtype = {
    "IDNCASE": "Int64",
    # Code-like string columns share the `category` dtype.
    **dict.fromkeys(
        ["NAT", "LANG", "CUSTODY", "CASE_TYPE", "LATEST_CAL_TYPE", "GENDER"],
        "category",
    ),
    "C_BIRTHDATE": "string",
    "DETENTION_DATE": "float64",
}

In [27]:
# Full load of A_TblCase, restricted to the selected columns.
#
# The four date columns (LATEST_HEARING, DATE_OF_ENTRY, DATE_DETAINED,
# DATE_RELEASED) are read as raw strings on purpose: they contain non-date
# values, so asking read_csv to parse them left them as `object` anyway and only
# added a parsing pass over the whole file. They are validated and converted
# explicitly further down with `pd.to_datetime(..., errors="coerce")`.
cases = pd.read_csv(
    filepath_or_buffer=case_path,
    delimiter="\t",
    # NOTE(review): lines with too many fields are dropped silently here;
    # consider "warn" if the number of dropped lines needs to be audited.
    on_bad_lines="skip",
    usecols=selected_columns,
    dtype=dtype,
    low_memory=False,
    skiprows=[11711221],  # skip malformed row: C error - EOF inside string on this line
)

### Data Inspection

In [28]:
cases.head()

Unnamed: 0,IDNCASE,NAT,LANG,CUSTODY,CASE_TYPE,LATEST_HEARING,LATEST_CAL_TYPE,DATE_OF_ENTRY,C_BIRTHDATE,GENDER,DATE_DETAINED,DATE_RELEASED,DETENTION_DATE
0,11782069,VE,SP,N,RMV,2026-01-20 00:00:00.000,M,2023-05-04 00:00:00.000,11/1999,F,,,
1,11782070,EC,SP,D,RMV,,,2023-04-30 00:00:00.000,11/1983,M,2024-11-14 00:00:00.000,,
2,11782071,RU,RUS,N,RMV,2026-08-19 00:00:00.000,M,2023-05-04 00:00:00.000,5/1985,F,,,
3,11782072,MX,SP,N,RMV,2028-02-17 00:00:00.000,M,2023-05-03 00:00:00.000,9/1999,F,,,
4,11782074,VE,SP,N,RMV,2025-11-25 00:00:00.000,M,2023-04-30 00:00:00.000,9/2004,F,,,


In [29]:
cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11711220 entries, 0 to 11711219
Data columns (total 13 columns):
 #   Column           Dtype   
---  ------           -----   
 0   IDNCASE          Int64   
 1   NAT              category
 2   LANG             category
 3   CUSTODY          category
 4   CASE_TYPE        category
 5   LATEST_HEARING   object  
 6   LATEST_CAL_TYPE  category
 7   DATE_OF_ENTRY    object  
 8   C_BIRTHDATE      string  
 9   GENDER           category
 10  DATE_DETAINED    object  
 11  DATE_RELEASED    object  
 12  DETENTION_DATE   float64 
dtypes: Int64(1), category(6), float64(1), object(4), string(1)
memory usage: 726.0+ MB


In [30]:
cases.shape

(11711220, 13)

#### Filtering for Juvenile Cases

The current `cases` DataFrame includes both adult and juvenile records.  
To isolate only juvenile cases, we will filter it using the list of  
`idnCase` values from `tbl_JuvenileHistory`, without performing a full merge.

In [31]:
# Boolean mask of the cases whose id appears in tbl_JuvenileHistory.
is_juvenile_case = cases["IDNCASE"].isin(juvenile_case_ids)
juvenile_cases = cases.loc[is_juvenile_case].reset_index(drop=True)

#### Category Cleanup

Removed unused category levels after filtering to ensure all categorical columns reflect only the values present in the `juvenile_cases` subset.

In [32]:
# After filtering, trim each categorical column to the levels still present.
categorical_columns = juvenile_cases.select_dtypes(include="category").columns
for col in categorical_columns:
    trimmed = juvenile_cases[col].cat.remove_unused_categories()
    juvenile_cases[col] = trimmed

In [33]:
juvenile_cases.head()

Unnamed: 0,IDNCASE,NAT,LANG,CUSTODY,CASE_TYPE,LATEST_HEARING,LATEST_CAL_TYPE,DATE_OF_ENTRY,C_BIRTHDATE,GENDER,DATE_DETAINED,DATE_RELEASED,DETENTION_DATE
0,13758313,GT,SP,N,RMV,,,2024-01-21 00:00:00.000,2/2008,F,,,
1,14870586,MX,SP,D,RFR,2025-02-04 00:00:00.000,I,,,,2025-01-22 00:00:00.000,,
2,14870588,GT,SP,R,WHO,2025-07-31 00:00:00.000,M,,6/1997,F,2025-01-29 00:00:00.000,2025-02-06 00:00:00.000,
3,13816559,MX,SP,N,RMV,2027-02-18 00:00:00.000,I,,5/2020,M,,,
4,13816560,MX,SP,N,RMV,2027-02-18 00:00:00.000,I,,5/2021,F,,,


In [34]:
juvenile_cases.shape

(1858773, 13)

In [35]:
juvenile_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1858773 entries, 0 to 1858772
Data columns (total 13 columns):
 #   Column           Dtype   
---  ------           -----   
 0   IDNCASE          Int64   
 1   NAT              category
 2   LANG             category
 3   CUSTODY          category
 4   CASE_TYPE        category
 5   LATEST_HEARING   object  
 6   LATEST_CAL_TYPE  category
 7   DATE_OF_ENTRY    object  
 8   C_BIRTHDATE      string  
 9   GENDER           category
 10  DATE_DETAINED    object  
 11  DATE_RELEASED    object  
 12  DETENTION_DATE   float64 
dtypes: Int64(1), category(6), float64(1), object(4), string(1)
memory usage: 115.2+ MB


### Missing Values Summary

Counts and percentages of missing values were calculated for each column in the `juvenile_cases` dataset to assess data completeness.  
`.isna().sum()` computes the total number of missing entries, and the percentage is derived by dividing by the total number of rows.  
Columns are sorted by missing count to highlight those requiring attention during data cleaning.

In [36]:
# Missing entries per column, as absolute counts and as a share of all rows.
null_counts = juvenile_cases.isna().sum()
percent_missing = null_counts.div(len(juvenile_cases)).mul(100)

# One row per column, worst offenders first.
missing_summary = pd.concat(
    {"Missing Count": null_counts, "Missing %": percent_missing.round(2)},
    axis=1,
).sort_values(by="Missing Count", ascending=False)

display(missing_summary)

Unnamed: 0,Missing Count,Missing %
DETENTION_DATE,1858773,100.0
DATE_RELEASED,1411804,75.95
DATE_DETAINED,1053320,56.67
GENDER,488535,26.28
DATE_OF_ENTRY,480722,25.86
C_BIRTHDATE,446724,24.03
LATEST_CAL_TYPE,303538,16.33
LATEST_HEARING,296813,15.97
NAT,3057,0.16
LANG,1692,0.09


The `DETENTION_DATE` column is entirely null (100% missing) and therefore will be dropped from the dataset

In [37]:
# DETENTION_DATE carries no information (100% missing), so remove the column.
juvenile_cases = juvenile_cases.drop(columns=["DETENTION_DATE"])

#### Datetime Format Validation

The following date-related features will be the focus of the next stage of preprocessing:  
- `LATEST_HEARING`  
- `DATE_OF_ENTRY`  
- `C_BIRTHDATE`  
- `DATE_DETAINED`  
- `DATE_RELEASED`  

These features may require format standardization and conversion to datetime objects to enable accurate temporal analysis.

Every datetime feature (except `C_BIRTHDATE`) follows the format `'YYYY-MM-DD 00:00:00.000'` (e.g., `'2025-02-04 00:00:00.000'`).

Before conversion, each feature will be tested against this pattern to ensure values are valid.  
All non-null entries will be checked to avoid unintended data loss during transformation with `pd.to_datetime()`.

Only the **`YYYY-MM-DD`** portion of each timestamp will be retained.

In [38]:
def find_invalid_dates(df, column, pattern=r"\d{4}-\d{2}-\d{2}"):
    """
    Return the non-null rows of `df[column]` that do not start with a date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to validate.
    column : str
        Name of the column to check.
    pattern : str, optional
        Regular expression the value must match from its first character.
        Defaults to the `YYYY-MM-DD` prefix used by the raw timestamps.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (`column`) with the offending rows, keeping the
        original index so the rows can be located in `df`.
    """
    values = df[column]
    # `str.match` anchors at the start of the string, so a value that merely
    # contains a date somewhere in the middle is still reported as invalid.
    starts_with_date = values.astype(str).str.match(pattern)
    return df.loc[values.notna() & ~starts_with_date, [column]]

In [39]:
# Run the same format check on each raw date column; one result frame per column.
(
    invalid_latest_hearing,
    invalid_date_of_entry,
    invalid_date_detained,
    invalid_date_released,
) = (
    find_invalid_dates(juvenile_cases, date_column)
    for date_column in (
        "LATEST_HEARING",
        "DATE_OF_ENTRY",
        "DATE_DETAINED",
        "DATE_RELEASED",
    )
)

In [40]:
def report_invalid(name, df):
    """
    Print how many invalid entries `df` holds for `name`; show `df` unless it is empty.
    """
    count = len(df)
    print(f"{name}: {count} invalid entr{'y' if count == 1 else 'ies'}")
    # Nothing to show for an empty frame.
    if count == 0:
        return
    display(df)

In [41]:
# Report each validation result in the same order as the checks were run.
for label, invalid_rows in (
    ("LATEST_HEARING", invalid_latest_hearing),
    ("DATE_OF_ENTRY", invalid_date_of_entry),
    ("DATE_DETAINED", invalid_date_detained),
    ("DATE_RELEASED", invalid_date_released),
):
    report_invalid(label, invalid_rows)

LATEST_HEARING: 9 invalid entries


Unnamed: 0,LATEST_HEARING
1676217,SFR
1678888,PHI
1701983,NEW
1774914,NYV
1775104,NYV
1788271,PHI
1790694,HAR
1796974,DAL
1842725,SNA


DATE_OF_ENTRY: 0 invalid entries
DATE_DETAINED: 7 invalid entries


Unnamed: 0,DATE_DETAINED
1676217,M
1678888,F
1774914,M
1775104,M
1788271,M
1790694,F
1842725,M


DATE_RELEASED: 0 invalid entries


The invalid values reported above are not removed by hand: `errors='coerce'` is passed to `pd.to_datetime()`,  
which converts any value that cannot be parsed as a date to `NaT`, simplifying the cleaning process.

In [42]:
date_cols = ["LATEST_HEARING", "DATE_OF_ENTRY", "DATE_DETAINED", "DATE_RELEASED"]

# Convert column by column; values that cannot be parsed become NaT.
for date_col in date_cols:
    juvenile_cases[date_col] = pd.to_datetime(
        juvenile_cases[date_col], errors="coerce"
    )

Identified non-null `C_BIRTHDATE` values that do not match the expected `MM/YYYY` format.  
This helps reveal alternate formats and prevents unintended data loss during conversion.

In [43]:
# Non-null birthdates that are not exactly `M/YYYY` or `MM/YYYY`.
birthdate = juvenile_cases["C_BIRTHDATE"]
looks_like_month_year = birthdate.astype(str).str.contains(
    r"^\d{1,2}/\d{4}$", regex=True
)
invalid_birthdates = juvenile_cases.loc[
    birthdate.notna() & ~looks_like_month_year, ["C_BIRTHDATE"]
]

In [44]:
print(f"Invalid C_BIRTHDATE entries: {len(invalid_birthdates)}")
display(invalid_birthdates)

Invalid C_BIRTHDATE entries: 7


Unnamed: 0,C_BIRTHDATE
1678888,E
1774914,E
1775104,E
1788271,E
1790694,E
1796974,E
1842725,E


In [45]:
# Parse the month/year strings; anything that does not fit `%m/%Y` becomes NaT.
birthdate_text = juvenile_cases["C_BIRTHDATE"]
juvenile_cases["C_BIRTHDATE"] = pd.to_datetime(
    birthdate_text, errors="coerce", format="%m/%Y"
)

#### Date Conversion Check

Confirmed that all date columns were successfully converted to `datetime64[ns]` format.

In [46]:
date_cols = [
    "LATEST_HEARING",
    "DATE_OF_ENTRY",
    "DATE_DETAINED",
    "DATE_RELEASED",
    "C_BIRTHDATE",
]
print(juvenile_cases[date_cols].dtypes)

LATEST_HEARING    datetime64[ns]
DATE_OF_ENTRY     datetime64[ns]
DATE_DETAINED     datetime64[ns]
DATE_RELEASED     datetime64[ns]
C_BIRTHDATE       datetime64[ns]
dtype: object


In [47]:
juvenile_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1858773 entries, 0 to 1858772
Data columns (total 12 columns):
 #   Column           Dtype         
---  ------           -----         
 0   IDNCASE          Int64         
 1   NAT              category      
 2   LANG             category      
 3   CUSTODY          category      
 4   CASE_TYPE        category      
 5   LATEST_HEARING   datetime64[ns]
 6   LATEST_CAL_TYPE  category      
 7   DATE_OF_ENTRY    datetime64[ns]
 8   C_BIRTHDATE      datetime64[ns]
 9   GENDER           category      
 10  DATE_DETAINED    datetime64[ns]
 11  DATE_RELEASED    datetime64[ns]
dtypes: Int64(1), category(6), datetime64[ns](5)
memory usage: 101.0 MB


In [48]:
juvenile_cases.head()

Unnamed: 0,IDNCASE,NAT,LANG,CUSTODY,CASE_TYPE,LATEST_HEARING,LATEST_CAL_TYPE,DATE_OF_ENTRY,C_BIRTHDATE,GENDER,DATE_DETAINED,DATE_RELEASED
0,13758313,GT,SP,N,RMV,NaT,,2024-01-21,2008-02-01,F,NaT,NaT
1,14870586,MX,SP,D,RFR,2025-02-04,I,NaT,NaT,,2025-01-22,NaT
2,14870588,GT,SP,R,WHO,2025-07-31,M,NaT,1997-06-01,F,2025-01-29,2025-02-06
3,13816559,MX,SP,N,RMV,2027-02-18,I,NaT,2020-05-01,M,NaT,NaT
4,13816560,MX,SP,N,RMV,2027-02-18,I,NaT,2021-05-01,F,NaT,NaT


#### Categorical Value Cleanup

Inspected the values of the key categorical features (`NAT`, `LANG`, `CUSTODY`, `CASE_TYPE`, `LATEST_CAL_TYPE`, `GENDER`)  
to spot typos, stray codes, and rare variants. Only `LATEST_CAL_TYPE` and `GENDER` are cleaned below; the other columns are left unchanged.

`NAT` and `LANG` contain a large number of unique values.  
These were retained in full to preserve detail and will be grouped or simplified as needed during analysis.

In [49]:
juvenile_cases["CUSTODY"].value_counts()

CUSTODY
N      1010878
R       471998
D       375884
SP           8
POR          1
Name: count, dtype: int64

In [50]:
juvenile_cases["CASE_TYPE"].value_counts()

CASE_TYPE
RMV    1721295
CFR      88087
RFR      21395
WHO      21372
AOC       4358
DEP       1830
REC        218
EXC        153
CSR         51
0            5
AOL          2
NAC          2
DCC          1
Name: count, dtype: int64

In [51]:
juvenile_cases["LATEST_CAL_TYPE"].value_counts()

LATEST_CAL_TYPE
M       1069154
I        486049
N            22
0900          4
1300          2
R             2
0830          1
1030          1
Name: count, dtype: int64

Cleaned `LATEST_CAL_TYPE` by keeping known values (`M` = Master, `I` = Individual) and replacing unexpected entries (e.g., time strings or unknown codes) with `NaN`.

In [52]:
valid_types = ["M", "I"]

# Keep the known calendar types and turn everything else (e.g. the time strings
# seen in the value counts) into missing values. `where` + `isin` is vectorized,
# unlike a per-row Python lambda, and the unused levels are dropped in one chain.
juvenile_cases["LATEST_CAL_TYPE"] = (
    juvenile_cases["LATEST_CAL_TYPE"]
    .where(lambda s: s.isin(valid_types))
    .astype("category")
    .cat.remove_unused_categories()
)

In [53]:
juvenile_cases["GENDER"].value_counts()

GENDER
M    875507
F    494724
N         6
U         1
Name: count, dtype: int64

Cleaned `GENDER` by keeping known values (`M` = Male, `F` = Female) and replaced rare or unclear codes (`N`, `U`) with `NaN` to ensure consistency and avoid ambiguity in gender-related analysis.

In [54]:
# Keep `M` / `F` and turn every other code into a missing value. `where` + `isin`
# is vectorized, unlike a per-row Python lambda, and the unused levels are
# dropped in the same chain.
juvenile_cases["GENDER"] = (
    juvenile_cases["GENDER"]
    .where(lambda s: s.isin(["M", "F"]))
    .astype("category")
    .cat.remove_unused_categories()
)

In [55]:
juvenile_cases.head()

Unnamed: 0,IDNCASE,NAT,LANG,CUSTODY,CASE_TYPE,LATEST_HEARING,LATEST_CAL_TYPE,DATE_OF_ENTRY,C_BIRTHDATE,GENDER,DATE_DETAINED,DATE_RELEASED
0,13758313,GT,SP,N,RMV,NaT,,2024-01-21,2008-02-01,F,NaT,NaT
1,14870586,MX,SP,D,RFR,2025-02-04,I,NaT,NaT,,2025-01-22,NaT
2,14870588,GT,SP,R,WHO,2025-07-31,M,NaT,1997-06-01,F,2025-01-29,2025-02-06
3,13816559,MX,SP,N,RMV,2027-02-18,I,NaT,2020-05-01,M,NaT,NaT
4,13816560,MX,SP,N,RMV,2027-02-18,I,NaT,2021-05-01,F,NaT,NaT


In [56]:
# Persist the cleaned juvenile cases as a gzip-compressed CSV without the index column.
juvenile_cases.to_csv(
    path_or_buf="../outputs/juvenile_cases_cleaned.csv.gz",
    compression="gzip",
    index=False,
)

### Cleaned `tbl_Case`

- Filtered to ~1.86M juvenile-related cases based on `IDNCASE`.
- Reset index after filtering.
- Removed unused categories from categorical columns.
- Converted date fields to `datetime64[ns]`:
  - `LATEST_HEARING`
  - `DATE_OF_ENTRY`
  - `C_BIRTHDATE` (parsed from `MM/YYYY` format)
  - `DATE_DETAINED`
  - `DATE_RELEASED`
- Cleaned categorical features:
  - `LATEST_CAL_TYPE`: kept only `M` (Master) and `I` (Individual); others set to `NaN`
  - `GENDER`: kept only `M` and `F`; others set to `NaN`

Saved cleaned file as:  
`juvenile_cases_cleaned.csv.gz`

## 3. Clean B_TblProceeding

### Initial Data Inspection


The dataset contains approximately 12 million rows. To avoid memory issues and reduce unnecessary processing, only the first 1000 rows will be inspected initially:

- Get an overview of the data types (`dtypes`)
- Identify columns/features worth keeping for the EDA
- Skip any columns that appear to be irrelevant or redundant

This initial check will help streamline the analysis and focus only on useful information.

In [57]:
# Tab-separated proceedings table; read a 1000-row sample to inspect columns and dtypes.
proceedings_path = "B_TblProceeding.csv"
proceedings = pd.read_csv(proceedings_path, sep="\t", nrows=1000)

In [58]:
proceedings.head()

Unnamed: 0,IDNPROCEEDING,IDNCASE,OSC_DATE,INPUT_DATE,BASE_CITY_CODE,HEARING_LOC_CODE,IJ_CODE,TRANS_IN_DATE,PREV_HEARING_LOC,PREV_HEARING_BASE,PREV_IJ_CODE,TRANS_NBR,HEARING_DATE,HEARING_TIME,DEC_TYPE,DEC_CODE,DEPORTED_1,DEPORTED_2,OTHER_COMP,APPEAL_RSVD,APPEAL_NOT_FILED,COMP_DATE,ABSENTIA,VENUE_CHG_GRANTED,TRANSFER_TO,DATE_APPEAL_DUE_STATUS,TRANSFER_STATUS,CUSTODY,CASE_TYPE,NAT,LANG,SCHEDULED_HEAR_LOC,CORRECTIONAL_FAC,CRIM_IND,IHP,AGGRAVATE_FELON,DATE_DETAINED,DATE_RELEASED
0,1895123,3562707,1996-03-11 00:00:00.000,1996-05-08 00:00:00.000,DET,DET,MKS,,,,,0.0,1997-01-16 00:00:00.000,100.0,W,D,HO,,,,,1997-01-16 00:00:00.000,Y,,,1997-02-18 00:00:00.000,,N,DEP,HO,SP,DET,,N,,,,
1,1895124,3562708,1996-03-13 00:00:00.000,1996-05-08 00:00:00.000,DET,DET,MKS,,,,,0.0,1997-01-16 00:00:00.000,100.0,W,D,ES,,,,,1997-01-16 00:00:00.000,Y,,,1997-02-18 00:00:00.000,,N,DEP,ES,SP,DET,,N,,,,
2,1895125,3562709,1996-03-13 00:00:00.000,1996-05-02 00:00:00.000,OAK,ADC,CAW,,,,,0.0,1996-05-09 00:00:00.000,100.0,7,D,GT,,,,,1996-05-09 00:00:00.000,N,,,,,D,DEP,GT,SP,ADC,,Y,,,,
3,1895126,3562710,1997-02-10 00:00:00.000,1997-03-24 00:00:00.000,DET,DET,JFW,,,,,0.0,1997-06-17 00:00:00.000,900.0,O,D,MX,,,,,1997-06-17 00:00:00.000,Y,,,1997-07-17 00:00:00.000,,N,DEP,MX,SP,DET,,N,,,,
4,1895127,3562711,1996-03-18 00:00:00.000,1996-05-09 00:00:00.000,OAK,ADC,JAD,,,,,0.0,1996-05-13 00:00:00.000,1000.0,7,D,MX,,,,,1996-05-13 00:00:00.000,N,,,,,D,DEP,MX,SP,ADC,,Y,,,,


In [59]:
proceedings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   IDNPROCEEDING           1000 non-null   int64  
 1   IDNCASE                 1000 non-null   int64  
 2   OSC_DATE                988 non-null    object 
 3   INPUT_DATE              986 non-null    object 
 4   BASE_CITY_CODE          1000 non-null   object 
 5   HEARING_LOC_CODE        1000 non-null   object 
 6   IJ_CODE                 981 non-null    object 
 7   TRANS_IN_DATE           152 non-null    object 
 8   PREV_HEARING_LOC        326 non-null    object 
 9   PREV_HEARING_BASE       326 non-null    object 
 10  PREV_IJ_CODE            321 non-null    object 
 11  TRANS_NBR               358 non-null    float64
 12  HEARING_DATE            954 non-null    object 
 13  HEARING_TIME            954 non-null    float64
 14  DEC_TYPE                826 non-null    o

In [60]:
proceedings.columns

Index(['IDNPROCEEDING', 'IDNCASE', 'OSC_DATE', 'INPUT_DATE', 'BASE_CITY_CODE',
       'HEARING_LOC_CODE', 'IJ_CODE', 'TRANS_IN_DATE', 'PREV_HEARING_LOC',
       'PREV_HEARING_BASE', 'PREV_IJ_CODE', 'TRANS_NBR', 'HEARING_DATE',
       'HEARING_TIME', 'DEC_TYPE', 'DEC_CODE', 'DEPORTED_1', 'DEPORTED_2',
       'OTHER_COMP', 'APPEAL_RSVD', 'APPEAL_NOT_FILED', 'COMP_DATE',
       'ABSENTIA', 'VENUE_CHG_GRANTED', 'TRANSFER_TO',
       'DATE_APPEAL_DUE_STATUS', 'TRANSFER_STATUS', 'CUSTODY', 'CASE_TYPE',
       'NAT', 'LANG', 'SCHEDULED_HEAR_LOC', 'CORRECTIONAL_FAC', 'CRIM_IND',
       'IHP', 'AGGRAVATE_FELON', 'DATE_DETAINED', 'DATE_RELEASED'],
      dtype='object')

### Selected Features for EDA – `B_TblProceeding`

The selected columns below are core proceeding-level features relevant to analyzing juvenile immigration cases. They were chosen based on the source dataset documentation and their importance for understanding case timelines, decisions, and access to justice.

These fields include:

- **Case timeline**: `OSC_DATE`, `COMP_DATE`, `INPUT_DATE`
- **Case outcomes**: `DEC_CODE`, `ABSENTIA`, `CRIM_IND`
- **Demographics & access**: `NAT`, `LANG`, `CUSTODY`, `CASE_TYPE`
- **Court geography**: `BASE_CITY_CODE`, `HEARING_LOC_CODE`

These variables allow us to track the duration of proceedings, categorize outcomes, and evaluate regional disparities and vulnerability factors.

In [61]:
# Columns of B_TblProceeding kept for the EDA. The list is used both to trim the
# sample frame and as `usecols` for the full load.
# NOTE: this rebinds `selected_columns`, which earlier held the A_TblCase columns.
selected_columns = [
    "IDNPROCEEDING",
    "IDNCASE",
    "OSC_DATE",
    "INPUT_DATE",
    "COMP_DATE",
    "BASE_CITY_CODE",
    "HEARING_LOC_CODE",
    "DEC_CODE",
    "ABSENTIA",
    "CRIM_IND",
    "NAT",
    "LANG",
    "CASE_TYPE",
    "CUSTODY",
    "DATE_DETAINED",
    "DATE_RELEASED",
]

In [62]:
# Keep only the selected columns, in the order listed.
proceedings = proceedings.loc[:, selected_columns]

In [63]:
proceedings.head()

Unnamed: 0,IDNPROCEEDING,IDNCASE,OSC_DATE,INPUT_DATE,COMP_DATE,BASE_CITY_CODE,HEARING_LOC_CODE,DEC_CODE,ABSENTIA,CRIM_IND,NAT,LANG,CASE_TYPE,CUSTODY,DATE_DETAINED,DATE_RELEASED
0,1895123,3562707,1996-03-11 00:00:00.000,1996-05-08 00:00:00.000,1997-01-16 00:00:00.000,DET,DET,D,Y,N,HO,SP,DEP,N,,
1,1895124,3562708,1996-03-13 00:00:00.000,1996-05-08 00:00:00.000,1997-01-16 00:00:00.000,DET,DET,D,Y,N,ES,SP,DEP,N,,
2,1895125,3562709,1996-03-13 00:00:00.000,1996-05-02 00:00:00.000,1996-05-09 00:00:00.000,OAK,ADC,D,N,Y,GT,SP,DEP,D,,
3,1895126,3562710,1997-02-10 00:00:00.000,1997-03-24 00:00:00.000,1997-06-17 00:00:00.000,DET,DET,D,Y,N,MX,SP,DEP,N,,
4,1895127,3562711,1996-03-18 00:00:00.000,1996-05-09 00:00:00.000,1996-05-13 00:00:00.000,OAK,ADC,D,N,Y,MX,SP,DEP,D,,


In [64]:
proceedings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   IDNPROCEEDING     1000 non-null   int64 
 1   IDNCASE           1000 non-null   int64 
 2   OSC_DATE          988 non-null    object
 3   INPUT_DATE        986 non-null    object
 4   COMP_DATE         922 non-null    object
 5   BASE_CITY_CODE    1000 non-null   object
 6   HEARING_LOC_CODE  1000 non-null   object
 7   DEC_CODE          717 non-null    object
 8   ABSENTIA          906 non-null    object
 9   CRIM_IND          973 non-null    object
 10  NAT               983 non-null    object
 11  LANG              995 non-null    object
 12  CASE_TYPE         1000 non-null   object
 13  CUSTODY           1000 non-null   object
 14  DATE_DETAINED     271 non-null    object
 15  DATE_RELEASED     124 non-null    object
dtypes: int64(2), object(14)
memory usage: 125.1+ KB


In [65]:
proceedings.dtypes

IDNPROCEEDING        int64
IDNCASE              int64
OSC_DATE            object
INPUT_DATE          object
COMP_DATE           object
BASE_CITY_CODE      object
HEARING_LOC_CODE    object
DEC_CODE            object
ABSENTIA            object
CRIM_IND            object
NAT                 object
LANG                object
CASE_TYPE           object
CUSTODY             object
DATE_DETAINED       object
DATE_RELEASED       object
dtype: object

#### Specifying Column Data Types

- `Int64`: Used for `IDNPROCEEDING`, `IDNCASE` to support nullable integer IDs.  
- `category`: Applied to repeated string fields for memory efficiency:
  `BASE_CITY_CODE`, `HEARING_LOC_CODE`, `DEC_CODE`, `ABSENTIA`, `CRIM_IND`,
  `NAT`, `LANG`, `CASE_TYPE`, `CUSTODY`.  
- `object`: Used for date fields (`OSC_DATE`, `INPUT_DATE`, `COMP_DATE`,
  `DATE_DETAINED`, `DATE_RELEASED`) to preserve original formats before validation
  and conversion to `datetime`.

In [66]:
# Target dtypes for B_TblProceeding, grouped by kind rather than by column order.
dtypes = {
    # Identifiers: nullable integers.
    **dict.fromkeys(("IDNPROCEEDING", "IDNCASE"), "Int64"),
    # Dates stay as raw text until they are validated and converted.
    **dict.fromkeys(
        ("OSC_DATE", "INPUT_DATE", "COMP_DATE", "DATE_DETAINED", "DATE_RELEASED"),
        "object",
    ),
    # Repeated code-like strings.
    **dict.fromkeys(
        (
            "BASE_CITY_CODE",
            "HEARING_LOC_CODE",
            "DEC_CODE",
            "ABSENTIA",
            "CRIM_IND",
            "NAT",
            "LANG",
            "CASE_TYPE",
            "CUSTODY",
        ),
        "category",
    ),
}

### Skipping Malformed Rows

Some lines in the dataset have an incorrect number of tab-separated fields, which can cause parsing errors during loading.  
The script first determines the expected number of columns from the header, then identifies and skips any rows that don't match.  
This approach ensures only properly structured records are read without triggering errors or discarding valid data.

In [97]:
import csv

# Scan the raw file once: the header fixes the expected number of tab-separated
# fields, and every later line with a different count is recorded by its
# 0-based line index (the header is line 0), which is the form `skiprows` expects.
with open(proceedings_path, "r", encoding="utf-8", errors="ignore") as f:
    header = f.readline().rstrip("\n").split("\t")
    n_fields = len(header)
    bad_rows = [
        line_index
        for line_index, raw_line in enumerate(f, start=1)
        if len(raw_line.rstrip("\n").split("\t")) != n_fields
    ]

print("Rows to skip:", bad_rows)

Rows to skip: [117890, 563337, 563338, 722315, 794097, 938519, 1366545, 1373710, 1395626, 1511267, 1562922, 1649431, 2050527, 2170523, 2170524, 2838351, 2855408, 2859489, 2904629, 2904634, 3063925, 3103156, 3123548, 3126517, 3357244, 3357245, 3515658, 3574849, 3892279, 4044809, 4222977, 4222978, 4313871, 4407579, 4407580, 4470517, 4530822, 4596683, 4596684, 4650381, 4650382, 4849910, 5135396, 5351734, 5360366, 5529260, 5590946, 5764513, 5841776, 5855546, 5860447, 6191226, 6191227, 6426355, 6437813, 6791634, 6791635, 6973432, 7595760, 7595761, 7975619, 7975620, 8188518, 8188519, 8274309, 8392830, 8674736, 8736603, 8818448, 8830778, 8835154, 8901758, 9026644, 10005142, 10098223, 10209459, 10382229, 10396299, 10536022, 10663477, 10913832, 11001843, 11001844, 11057429, 11433037, 11433038, 11538809, 11538810, 11582439, 11606069, 11651505, 11665575, 11709931, 11729020, 11794080, 13676775, 13679075, 13679076, 13693685, 13693686, 14286701, 14286702, 14572463, 14572464, 14630377, 14732890, 1473

In [98]:
# Parser options for the proceedings table, collected in one place.
read_options = {
    "sep": "\t",
    "usecols": selected_columns,
    # Read every column as text; the cast to the target dtypes is a separate step.
    "dtype": str,
    # Line indices of rows whose field count does not match the header.
    "skiprows": bad_rows,
    # Treat quote characters as ordinary data rather than field delimiters.
    "quoting": csv.QUOTE_NONE,
}

proceedings = pd.read_csv(proceedings_path, **read_options)

In [99]:
# Cast the all-string frame to the per-column dtypes declared in `dtypes`
# (nullable Int64 IDs, categoricals for codes, date fields left as objects).
proceedings = proceedings.astype(dtypes)

In [100]:
# Keep only proceedings whose ID appears among the juvenile proceeding keys,
# and renumber the rows from zero.
is_juvenile_proceeding = proceedings["IDNPROCEEDING"].isin(juvenile_proceeding_ids)
juvenile_proceedings = proceedings.loc[is_juvenile_proceeding].reset_index(drop=True)

In [101]:
# Compare the number of lookup keys with the number of rows actually matched;
# a difference means some keys have no row in the loaded proceedings table.
key_count = len(juvenile_proceeding_ids)
matched_row_count = juvenile_proceedings.shape[0]

print(
    f"Number of idnProceeding keys in tbl_JuvenileHistory.csv table: {key_count:,}",
    f"Number of matched juvenile rows in B_TblProceeding.csv based on those keys: {matched_row_count:,}",
    sep="\n",
)

Number of idnProceeding keys in tbl_JuvenileHistory.csv table: 2,801,192
Number of matched juvenile rows in B_TblProceeding.csv based on those keys: 2,801,190


In [102]:
juvenile_proceedings.head()

Unnamed: 0,IDNPROCEEDING,IDNCASE,OSC_DATE,INPUT_DATE,BASE_CITY_CODE,HEARING_LOC_CODE,DEC_CODE,COMP_DATE,ABSENTIA,CUSTODY,CASE_TYPE,NAT,LANG,CRIM_IND,DATE_DETAINED,DATE_RELEASED
0,12885856,9825244,2021-03-13 00:00:00.000,2023-08-30 00:00:00.000,POR,H99,,,,N,RMV,HO,SP,N,,
1,12885874,12304962,2023-08-14 00:00:00.000,2023-08-30 00:00:00.000,NEW,NEW,,,,R,RMV,GT,SP,N,2023-08-14 00:00:00.000,2023-08-28 00:00:00.000
2,12885882,12397184,2023-08-18 00:00:00.000,2023-08-30 00:00:00.000,PSD,PSD,,2023-12-27 00:00:00.000,N,D,RMV,ES,SP,N,2023-08-18 00:00:00.000,
3,12885885,12369932,2023-08-18 00:00:00.000,2023-08-28 00:00:00.000,LVG,LVG,,2024-05-20 00:00:00.000,N,R,RMV,EC,SP,N,2023-08-25 00:00:00.000,2023-08-28 00:00:00.000
4,12885886,12198504,2023-07-06 00:00:00.000,2023-08-30 00:00:00.000,NYB,NYB,,2024-03-06 00:00:00.000,N,R,RMV,MR,AR,N,2023-07-06 00:00:00.000,2023-08-29 00:00:00.000


In [103]:
juvenile_proceedings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2801190 entries, 0 to 2801189
Data columns (total 16 columns):
 #   Column            Dtype   
---  ------            -----   
 0   IDNPROCEEDING     Int64   
 1   IDNCASE           Int64   
 2   OSC_DATE          object  
 3   INPUT_DATE        object  
 4   BASE_CITY_CODE    category
 5   HEARING_LOC_CODE  category
 6   DEC_CODE          category
 7   COMP_DATE         object  
 8   ABSENTIA          category
 9   CUSTODY           category
 10  CASE_TYPE         category
 11  NAT               category
 12  LANG              category
 13  CRIM_IND          category
 14  DATE_DETAINED     object  
 15  DATE_RELEASED     object  
dtypes: Int64(2), category(9), object(5)
memory usage: 187.1+ MB


#### Datetime Format Validation

The following date-related features will be the focus of the next stage of preprocessing:  
- `OSC_DATE`  
- `INPUT_DATE`  
- `COMP_DATE`  
- `DATE_DETAINED`  
- `DATE_RELEASED`  

These features may require format standardization and conversion to datetime objects to enable accurate temporal analysis.

Every datetime feature (except `C_BIRTHDATE`) follows the format `'YYYY-MM-DD 00:00:00.000'` (e.g., `'2025-02-04 00:00:00.000'`).

Before conversion, each feature will be tested against this pattern to ensure values are valid.  
All non-null entries will be checked to avoid unintended data loss during transformation with `pd.to_datetime()`.

Only the **`YYYY-MM-DD`** portion of each timestamp will be retained.

In [104]:
# Run the date-format check on each date column, in the same order as before.
# NOTE(review): find_invalid_dates is defined earlier in the notebook; presumably
# it returns the entries that do not match the expected pattern — confirm there.
(
    invalid_osc_date,
    invalid_input_date,
    invalid_comp_date,
    invalid_date_detained,
    invalid_date_released,
) = [
    find_invalid_dates(juvenile_proceedings, date_column)
    for date_column in (
        "OSC_DATE",
        "INPUT_DATE",
        "COMP_DATE",
        "DATE_DETAINED",
        "DATE_RELEASED",
    )
]

In [105]:
# Report the validation result for each date column, one line per column.
invalid_by_column = {
    "OSC_DATE": invalid_osc_date,
    "INPUT_DATE": invalid_input_date,
    "COMP_DATE": invalid_comp_date,
    "DATE_DETAINED": invalid_date_detained,
    "DATE_RELEASED": invalid_date_released,
}

for column_name, invalid_entries in invalid_by_column.items():
    report_invalid(column_name, invalid_entries)

OSC_DATE: 0 invalid entries
INPUT_DATE: 0 invalid entries
COMP_DATE: 0 invalid entries
DATE_DETAINED: 0 invalid entries
DATE_RELEASED: 0 invalid entries


In [106]:
date_cols = ["OSC_DATE", "INPUT_DATE", "COMP_DATE", "DATE_DETAINED", "DATE_RELEASED"]

# Convert each date column from text to datetime.
# errors="coerce" turns any value that cannot be parsed into NaT instead of
# raising, so this step relies on the format validation performed beforehand.
for date_col in date_cols:
    juvenile_proceedings[date_col] = pd.to_datetime(
        juvenile_proceedings[date_col], errors="coerce"
    )

In [107]:
juvenile_proceedings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2801190 entries, 0 to 2801189
Data columns (total 16 columns):
 #   Column            Dtype         
---  ------            -----         
 0   IDNPROCEEDING     Int64         
 1   IDNCASE           Int64         
 2   OSC_DATE          datetime64[ns]
 3   INPUT_DATE        datetime64[ns]
 4   BASE_CITY_CODE    category      
 5   HEARING_LOC_CODE  category      
 6   DEC_CODE          category      
 7   COMP_DATE         datetime64[ns]
 8   ABSENTIA          category      
 9   CUSTODY           category      
 10  CASE_TYPE         category      
 11  NAT               category      
 12  LANG              category      
 13  CRIM_IND          category      
 14  DATE_DETAINED     datetime64[ns]
 15  DATE_RELEASED     datetime64[ns]
dtypes: Int64(2), category(9), datetime64[ns](5)
memory usage: 187.1 MB


In [108]:
# Per-column count and percentage of missing values, most incomplete first.
row_count = len(juvenile_proceedings)
null_counts = juvenile_proceedings.isna().sum()
percent_missing = (null_counts / row_count) * 100

missing_summary = pd.DataFrame(
    {
        "Missing Count": null_counts,
        "Missing %": percent_missing.round(2),
    }
)
missing_summary = missing_summary.sort_values(by="Missing Count", ascending=False)

display(missing_summary)

Unnamed: 0,Missing Count,Missing %
DATE_RELEASED,1783737,63.68
DATE_DETAINED,1335076,47.66
DEC_CODE,1331104,47.52
ABSENTIA,458485,16.37
COMP_DATE,436265,15.57
CRIM_IND,170702,6.09
OSC_DATE,22470,0.8
INPUT_DATE,10757,0.38
NAT,7927,0.28
LANG,1759,0.06


### Categorical Value Cleanup

This section reviews the categorical columns to identify and clean inconsistent or invalid entries.  
Suspicious values (e.g., empty strings, control characters, or rare invalid codes) will be converted to `NaN` for clarity and downstream compatibility.

Target columns:
- BASE_CITY_CODE
- HEARING_LOC_CODE
- DEC_CODE
- ABSENTIA
- CUSTODY
- CASE_TYPE
- NAT
- LANG
- CRIM_IND

This step ensures categorical data is standardized and ready for analysis.

In [None]:
juvenile_proceedings["BASE_CITY_CODE"].value_counts()

BASE_CITY_CODE
MIA               130120
NYC               121116
SFR               104828
SNA               102935
DAL                95197
                   ...  
FCI                    0
ELC                    0
TST                    0
BDC                    0
NYD                    0
Name: count, Length: 88, dtype: int64

In [None]:
juvenile_proceedings["HEARING_LOC_CODE"].value_counts()

HEARING_LOC_CODE
MIA                 122951
NYC                 117270
SFR                  82003
DAL                  75773
CHL                  74774
                     ...  
EPZ                      0
EPO                      0
EOL                      0
RA3                      0
1AC                      0
Name: count, Length: 818, dtype: int64

In [111]:
# Show CASE_TYPE next to DEC_CODE for the rows where both are present.
has_case_type_and_decision = (
    juvenile_proceedings["CASE_TYPE"].notna()
    & juvenile_proceedings["DEC_CODE"].notna()
)

display(juvenile_proceedings[has_case_type_and_decision][["CASE_TYPE", "DEC_CODE"]])

Unnamed: 0,CASE_TYPE,DEC_CODE
5,CFR,A
6,RMV,X
7,RMV,T
9,RMV,X
11,RMV,X
...,...,...
2801156,RMV,X
2801158,RMV,X
2801176,RMV,T
2801179,CFR,A


In [None]:
juvenile_proceedings["DEC_CODE"].value_counts()

DEC_CODE
X           696714
U           284055
T           179644
V           104375
A            83552
R            80902
             14761
D            11646
W             5351
G             2807
O             2087
Z             1417
E             1410
L              953
H              337
J               67
S                4
C                3
K                1
2                0
Name: count, dtype: int64

In [None]:
# The DEC_CODE counts show a blank-looking code on 14,761 rows in addition to the
# stray "2" category. Replacing only "2" leaves those blank codes in place, so
# both are set to NaN here, in line with the stated cleanup policy for empty
# strings. Whitespace-only labels are detected with str.strip(); read_csv already
# parses truly empty fields as NaN, so the blank label is presumably whitespace.
# NOTE(review): if the blank label is a non-whitespace control character it will
# not be caught here — confirm with repr() of the categories.
dec_code = juvenile_proceedings["DEC_CODE"]
blank_dec_codes = [
    code for code in dec_code.dropna().unique() if not str(code).strip()
]
invalid_dec_codes = ["2", *blank_dec_codes]

# where() keeps the column dtype and avoids replace() on a categorical, whose
# category-changing behaviour is deprecated in recent pandas releases.
juvenile_proceedings["DEC_CODE"] = dec_code.where(
    ~dec_code.isin(invalid_dec_codes), np.nan
)

In [None]:
juvenile_proceedings["ABSENTIA"].value_counts()

ABSENTIA
N           1944976
Y            397715
                 14
5                 0
Name: count, dtype: int64

In [None]:
# Keep only the two valid ABSENTIA flags and set everything else to NaN, using
# the same whitelist approach as CUSTODY.
# An exact-match replace of "" is unreliable here: read_csv parses empty fields
# as NaN by default, so the 14 blank-looking values in the counts are presumably
# whitespace (or another invisible character) and would not equal "".
valid_absentia = ["N", "Y"]
juvenile_proceedings["ABSENTIA"] = juvenile_proceedings["ABSENTIA"].where(
    juvenile_proceedings["ABSENTIA"].isin(valid_absentia), np.nan
)

In [None]:
juvenile_proceedings["CUSTODY"].value_counts()

CUSTODY
N          1253877
R          1071870
D           475427
                1
                0
                0
                0
C                0
Name: count, dtype: int64

In [117]:
# Whitelist of accepted CUSTODY codes; every other value (including the
# blank-looking categories) is masked to NaN.
valid_custody = ["N", "R", "D"]

custody = juvenile_proceedings["CUSTODY"]
is_invalid_custody = ~custody.isin(valid_custody)
juvenile_proceedings["CUSTODY"] = custody.mask(is_invalid_custody, np.nan)

In [118]:
# Use Series.value_counts() like the other categorical columns: on a categorical
# Series it also lists categories with a zero count, which is how stray codes
# were spotted in the other columns.
juvenile_proceedings["CRIM_IND"].value_counts()

CRIM_IND
N           2572330
Y             58142
                 16
Name: count, dtype: int64

In [119]:
# Matches values that are empty or consist solely of whitespace.
blank_pattern = r"^\s*$"

crim_ind = juvenile_proceedings["CRIM_IND"]
juvenile_proceedings["CRIM_IND"] = crim_ind.replace(
    to_replace=blank_pattern, value=np.nan, regex=True
)

In [123]:
juvenile_proceedings["LANG"].value_counts()

LANG
SP     2212406
ENG     120922
POR      62403
PUN      48400
SP       29962
        ...   
BMY          0
BMO          0
BLE          0
FAA          0
GA           0
Name: count, Length: 561, dtype: int64

In [124]:
juvenile_proceedings["NAT"].value_counts()

NAT
GT    595840
HO    530506
MX    368362
ES    332344
CU    108914
       ...  
PC         0
PF         0
RQ         0
SB         0
b6         0
Name: count, Length: 264, dtype: int64

With invalid values replaced by `NaN` in the cells above, the now-unused categories are removed below for cleaner analysis.

In [127]:
categorical_cols = [
    "BASE_CITY_CODE", "HEARING_LOC_CODE", "DEC_CODE",
    "ABSENTIA", "CUSTODY", "CASE_TYPE",
    "NAT", "LANG", "CRIM_IND",
]

# Drop category labels that no longer occur in the data (for example codes whose
# values were set to NaN). Columns that are not categorical are left untouched.
# NOTE(review): the LANG counts list "SP" twice, presumably a padded or otherwise
# invisible variant of the same label; this loop does not merge such labels —
# TODO confirm and normalize if so.
for col in categorical_cols:
    column = juvenile_proceedings[col]
    if not isinstance(column.dtype, pd.CategoricalDtype):
        continue
    juvenile_proceedings[col] = column.cat.remove_unused_categories()

In [129]:
juvenile_proceedings.duplicated().sum()

0

In [130]:
juvenile_proceedings.head()

Unnamed: 0,IDNPROCEEDING,IDNCASE,OSC_DATE,INPUT_DATE,BASE_CITY_CODE,HEARING_LOC_CODE,DEC_CODE,COMP_DATE,ABSENTIA,CUSTODY,CASE_TYPE,NAT,LANG,CRIM_IND,DATE_DETAINED,DATE_RELEASED
0,12885856,9825244,2021-03-13,2023-08-30,POR,H99,,NaT,,N,RMV,HO,SP,N,NaT,NaT
1,12885874,12304962,2023-08-14,2023-08-30,NEW,NEW,,NaT,,R,RMV,GT,SP,N,2023-08-14,2023-08-28
2,12885882,12397184,2023-08-18,2023-08-30,PSD,PSD,,2023-12-27,N,D,RMV,ES,SP,N,2023-08-18,NaT
3,12885885,12369932,2023-08-18,2023-08-28,LVG,LVG,,2024-05-20,N,R,RMV,EC,SP,N,2023-08-25,2023-08-28
4,12885886,12198504,2023-07-06,2023-08-30,NYB,NYB,,2024-03-06,N,R,RMV,MR,AR,N,2023-07-06,2023-08-29


In [131]:
# Persist the cleaned table as a gzip-compressed CSV without the index column.
cleaned_output_path = "../outputs/juvenile_proceedings_cleaned.csv.gz"
juvenile_proceedings.to_csv(cleaned_output_path, index=False, compression="gzip")