# Juvenile Immigration Case Analysis – Data Exploration
This notebook analyzes a large EOIR (Executive Office for Immigration Review) history dataset to explore patterns in juvenile immigration cases.

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
# Location of the EOIR history extract. The value below is a placeholder:
# point it at the real CSV before running the notebook.
csv_path = "YourFilePath.csv"

# Load the full extract into memory; every later cell reads from `df`.
df = pd.read_csv(csv_path)

# Print a concise structural summary of the frame (columns, dtypes, memory use).
df.info()


In [None]:
# Headline counts: how many rows and how many distinct cases are flagged as
# juvenile, and what share of all distinct cases that represents.

# Build the juvenile filter once and reuse it, instead of materialising the
# filtered frame twice on a large dataset.
# NOTE(review): assumes idnJuvenile == 1.0 is the juvenile marker — confirm
# against the dataset's code lookup.
is_juvenile = df["idnJuvenile"] == 1.0

# Row-level count (a case may contribute several rows).
total_juvenile_rows = int(is_juvenile.sum())

# Case-level counts; nunique() ignores missing idnCase values.
unique_juvenile_cases = df.loc[is_juvenile, "idnCase"].nunique()
total_cases = df["idnCase"].nunique()

# The share is undefined when there are no cases at all (empty frame or an
# all-missing idnCase column); report NaN rather than raising
# ZeroDivisionError and aborting the notebook run.
if total_cases:
    juvenile_ratio = round((unique_juvenile_cases / total_cases) * 100, 2)
else:
    juvenile_ratio = float("nan")

print(f"Total juvenile rows: {total_juvenile_rows}")
print(f"Unique juvenile cases: {unique_juvenile_cases}")
print(f"Total cases: {total_cases}")
print(f"Juvenile case ratio: {juvenile_ratio}%")

In [None]:
# For every case, count how many distinct proceedings are attached to it.
# The result is a Series indexed by idnCase and is reused by the histogram cell.
rows_by_case = df.groupby("idnCase")
proceedings_per_case = rows_by_case["idnProceeding"].nunique()

# Summary statistics (count, mean, spread, quartiles) of proceedings per case.
proceedings_per_case.describe()

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Count rows whose (idnCase, idnProceeding) pair has already appeared earlier
# in the frame. With pandas' default keep="first", the first occurrence of
# each pair is not counted as a duplicate.
key_columns = ["idnCase", "idnProceeding"]
repeated_key_rows = df.duplicated(subset=key_columns)
duplicates = repeated_key_rows.sum()
print(f"Duplicate rows: {duplicates}")

In [None]:
# Take an independent copy of the rows flagged as juvenile so later edits to
# juvenile_df cannot affect (or warn about) the original frame.
juvenile_rows = df["idnJuvenile"] == 1.0
juvenile_df = df.loc[juvenile_rows].copy()

# Create the output folder if it is missing, then write the subset without
# the pandas index column.
os.makedirs("output_data", exist_ok=True)
juvenile_df.to_csv("output_data/juvenile_cases_only.csv", index=False)

In [None]:
# Histogram of how many distinct proceedings each case has.
# Series.hist returns the Axes it drew on, so label that Axes directly
# instead of going through pyplot's implicit "current axes".
ax = proceedings_per_case.hist(bins=20, figsize=(8, 5))
ax.set_title("Distribution of Proceedings per Case")
ax.set_xlabel("Number of Proceedings")
ax.set_ylabel("Number of Cases")
ax.grid(True)
plt.show()

In [None]:
# Pie chart of distinct cases: juvenile vs. everything else.
# The second slice is the remainder, so the two slices sum to total_cases.
labels = ["Juvenile Cases", "Non-Juvenile Cases"]
sizes = [unique_juvenile_cases, total_cases - unique_juvenile_cases]

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=140)
ax.axis("equal")  # equal axis scaling keeps the pie circular
ax.set_title("Proportion of Juvenile vs Non-Juvenile Cases")
plt.show()

### Conclusion
- Juvenile cases make up around 6.6% of all unique cases (distinct `idnCase` values).
- Most cases involve 1–2 distinct proceedings, with a few cases having as many as 16.
- There are data quality issues, including duplicates and missing values, that must be considered in downstream analysis.