In [None]:
#%%

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load the table; `pop` removes the "first_pf" column from the frame and returns
# it, so that column becomes the target and the remaining columns the features.
X_train_data = pd.read_csv("law_data.csv")
y_train_data = X_train_data.pop("first_pf")

# Encode categorical columns as indicator columns *before* splitting, so the
# train and test partitions end up with the same set of columns.
X_train_data = pd.get_dummies(X_train_data)
X_train, X_test, y_train, y_test = train_test_split(
    X_train_data,
    y_train_data,
    test_size=0.3,
    random_state=42,
)

# The seed is fixed on both the split and the forest so reruns give the same numbers.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Overall test-set metrics, printed as a markdown table (one row per class/average).
predictions = model.predict(X_test)
overall_report = classification_report(y_test, predictions, output_dict=True)
print(pd.DataFrame(overall_report).transpose().to_markdown())

|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0.0          |    0.538217 | 0.246715 |   0.338338 |  685        |
| 1.0          |    0.917095 | 0.975226 |   0.945268 | 5853        |
| accuracy     |    0.898899 | 0.898899 |   0.898899 |    0.898899 |
| macro avg    |    0.727656 | 0.610971 |   0.641803 | 6538        |
| weighted avg |    0.877399 | 0.898899 |   0.881679 | 6538        |


In [None]:
# %%

# Prediction per sex
# Evaluate the already-fitted model separately on each "sex" subgroup of the test set.
sex = X_test.groupby("sex")
for sex_value, rows in sex:
    pred = model.predict(rows)
    # Header line: subgroup key followed by the number of test rows in it.
    print("\n", sex_value, len(rows))
    # Look up the true labels by index so they line up with the subgroup's rows.
    subgroup_truth = y_test.loc[rows.index]
    subgroup_report = classification_report(subgroup_truth, pred, output_dict=True)
    print(pd.DataFrame(subgroup_report).transpose().to_markdown())


 1 2894
|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0.0          |    0.516854 | 0.282209 |   0.365079 |  326        |
| 1.0          |    0.913844 | 0.966511 |   0.93944  | 2568        |
| accuracy     |    0.889426 | 0.889426 |   0.889426 |    0.889426 |
| macro avg    |    0.715349 | 0.62436  |   0.65226  | 2894        |
| weighted avg |    0.869124 | 0.889426 |   0.87474  | 2894        |

 2 3644
|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0.0          |    0.566176 | 0.214485 |   0.311111 |  359        |
| 1.0          |    0.919612 | 0.98204  |   0.949801 | 3285        |
| accuracy     |    0.906422 | 0.906422 |   0.906422 |    0.906422 |
| macro avg    |    0.742894 | 0.598262 |   0.630456 | 3644        |
| weighted avg |    0.884792 | 0.906422 |   0.886879 | 3644        |


In [None]:
# %%

# Prediction per ethnicity
# Evaluate the already-fitted model separately on the test rows flagged by each
# one-hot "race_<ethnicity>" indicator column.
ethnicities = ["Amerindian", "Asian", "Black", "Hispanic", "Mexican", "Other", "Puertorican", "White"]
for ethnicity in ethnicities:
    # Select the flagged rows with a boolean mask rather than grouping on the
    # indicator and throwing away the False group. astype(bool) keeps the mask
    # valid whether the indicator column is stored as bool or as 0/1 integers.
    members = X_test[X_test["race_" + ethnicity].astype(bool)]
    if members.empty:
        # No test rows carry this flag: there is nothing to predict or report.
        continue
    pred = model.predict(members)
    # Header line: ethnicity name followed by the number of test rows in it.
    print("\n", ethnicity, members.shape[0])
    # True labels are looked up by index so they line up with the selected rows.
    print(pd.DataFrame(classification_report(y_test.loc[members.index], pred, output_dict=True)).T.to_markdown())


 Amerindian 28
|              |   precision |   recall |   f1-score |   support |
|:-------------|------------:|---------:|-----------:|----------:|
| 0.0          |    0.7      | 0.7      |   0.7      | 10        |
| 1.0          |    0.833333 | 0.833333 |   0.833333 | 18        |
| accuracy     |    0.785714 | 0.785714 |   0.785714 |  0.785714 |
| macro avg    |    0.766667 | 0.766667 |   0.766667 | 28        |
| weighted avg |    0.785714 | 0.785714 |   0.785714 | 28        |

 Asian 261
|              |   precision |   recall |   f1-score |    support |
|:-------------|------------:|---------:|-----------:|-----------:|
| 0.0          |    0.583333 | 0.152174 |   0.241379 |  46        |
| 1.0          |    0.843373 | 0.976744 |   0.905172 | 215        |
| accuracy     |    0.831418 | 0.831418 |   0.831418 |   0.831418 |
| macro avg    |    0.713353 | 0.564459 |   0.573276 | 261        |
| weighted avg |    0.797543 | 0.831418 |   0.788182 | 261        |

 Black 402
|              

In [None]:
# %%

# Prediction per region
# Evaluate the already-fitted model separately on the test rows flagged by each
# one-hot "region_first_<region>" indicator column.
regions = ["FW","GL","MS","MW","Mt","NE","NG","NW","PO","SC","SE"]
for region in regions:
    # Select the flagged rows with a boolean mask rather than grouping on the
    # indicator and throwing away the False group. astype(bool) keeps the mask
    # valid whether the indicator column is stored as bool or as 0/1 integers.
    members = X_test[X_test["region_first_" + region].astype(bool)]
    if members.empty:
        # No test rows carry this flag: there is nothing to predict or report.
        continue
    pred = model.predict(members)
    # Header line: region code followed by the number of test rows in it.
    print("\n", region, members.shape[0])
    # True labels are looked up by index so they line up with the selected rows.
    print(pd.DataFrame(classification_report(y_test.loc[members.index], pred, output_dict=True)).T.to_markdown())


 FW 905
|              |   precision |   recall |   f1-score |    support |
|:-------------|------------:|---------:|-----------:|-----------:|
| 0.0          |    0.574074 | 0.219858 |   0.317949 | 141        |
| 1.0          |    0.87074  | 0.969895 |   0.917647 | 764        |
| accuracy     |    0.853039 | 0.853039 |   0.853039 |   0.853039 |
| macro avg    |    0.722407 | 0.594877 |   0.617798 | 905        |
| weighted avg |    0.824519 | 0.853039 |   0.824213 | 905        |

 GL 1131
|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0.0          |    0.789474 | 0.189873 |   0.306122 |   79        |
| 1.0          |    0.942446 | 0.996198 |   0.968577 | 1052        |
| accuracy     |    0.939876 | 0.939876 |   0.939876 |    0.939876 |
| macro avg    |    0.86596  | 0.593036 |   0.63735  | 1131        |
| weighted avg |    0.931761 | 0.939876 |   0.922304 | 1131        |

 MS 701
|            