In [1]:
#import dependencies 
from path import Path
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt
from scipy import stats
import matplotlib.pyplot as plt

In [2]:
#import data
file_path = Path("./Resources/cbp_data.csv")
cbp_df = pd.read_csv(file_path, low_memory=False)
cbp_df

Unnamed: 0,date_in,date_out,app_date,hours_in_custody,age_group,gender,citizenship,border,sector,field_office,source
0,2017-01-20 00:10:00,2017-01-20 10:08:00,,9.950000,6-8 years,Female,EL SALVADOR,SBO,(b)(7)(E ),,BP
1,2017-01-20 00:15:00,2017-01-24 17:30:00,,113.233333,3-5 years,Female,GUATEMALA,SBO,(b)(7)(E ),,BP
2,2017-01-20 00:22:00,2017-01-24 17:47:00,,113.416667,3-5 years,Female,BRAZIL,SBO,(b)(7)(E ),,BP
3,2017-01-20 00:30:00,2017-01-21 06:35:00,,30.083333,12-14 years,Male,EL SALVADOR,SBO,(b)(7)(E ),,BP
4,2017-01-20 00:30:00,2017-01-21 13:03:00,,36.533333,3-5 years,Male,HONDURAS,SBO,(b)(7)(E ),,BP
...,...,...,...,...,...,...,...,...,...,...,...
583803,2020-06-18 14:00:55,2020-06-18 18:47:00,2020-06-18 16:03:56,4.770000,15-18 years,M - MALE,MEXICO,,,LAREDO,OFO
583804,2020-06-18 16:07:39,2020-06-19 13:45:00,2020-06-18 16:24:08,21.620000,15-18 years,M - MALE,MEXICO,,,SAN DIEGO,OFO
583805,2020-06-19 09:34:16,2020-06-19 12:23:00,2020-06-19 09:45:22,2.810000,12-14 years,F - FEMALE,MEXICO,,,SAN DIEGO,OFO
583806,2020-06-19 14:40:00,2020-06-20 11:25:45,2020-06-19 12:43:29,20.760000,15-18 years,F - FEMALE,MEXICO,,,SAN DIEGO,OFO


In [5]:
#remove initial columns I don't need
cbp_df = cbp_df.drop(['app_date', 'sector', 'field_office', 'source'], axis=1).dropna()
cbp_df
#ends up dropping more because I kept the border column, which has more null values

KeyError: "['app_date' 'sector' 'field_office' 'source'] not found in axis"

In [6]:
#Clean gender column

#create dictionary 
genders = {'M - MALE': 'Male', 'F - FEMALE': 'Female', 'U - UNKNOWN': 'Unknown'}

#replace values in column
cbp_df['gender'] = cbp_df.gender.replace(genders)

In [7]:
#round hours in custody 
cbp_df['hours_in_custody'] = cbp_df.hours_in_custody.round(2)
cbp_df

Unnamed: 0,date_in,date_out,hours_in_custody,age_group,gender,citizenship,border
0,2017-01-20 00:10:00,2017-01-20 10:08:00,9.95,6-8 years,Female,EL SALVADOR,SBO
1,2017-01-20 00:15:00,2017-01-24 17:30:00,113.23,3-5 years,Female,GUATEMALA,SBO
2,2017-01-20 00:22:00,2017-01-24 17:47:00,113.42,3-5 years,Female,BRAZIL,SBO
3,2017-01-20 00:30:00,2017-01-21 06:35:00,30.08,12-14 years,Male,EL SALVADOR,SBO
4,2017-01-20 00:30:00,2017-01-21 13:03:00,36.53,3-5 years,Male,HONDURAS,SBO
...,...,...,...,...,...,...,...
488813,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,6-8 years,Female,VENEZUELA,SBO
488814,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,12-14 years,Female,VENEZUELA,SBO
488815,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,9-11 years,Male,VENEZUELA,SBO
488816,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,12-14 years,Female,VENEZUELA,SBO


In [8]:
# Convert Hours in custody into days 
cbp_df['days_in_custody'] = cbp_df['hours_in_custody'] / 24
cbp_df

Unnamed: 0,date_in,date_out,hours_in_custody,age_group,gender,citizenship,border,days_in_custody
0,2017-01-20 00:10:00,2017-01-20 10:08:00,9.95,6-8 years,Female,EL SALVADOR,SBO,0.414583
1,2017-01-20 00:15:00,2017-01-24 17:30:00,113.23,3-5 years,Female,GUATEMALA,SBO,4.717917
2,2017-01-20 00:22:00,2017-01-24 17:47:00,113.42,3-5 years,Female,BRAZIL,SBO,4.725833
3,2017-01-20 00:30:00,2017-01-21 06:35:00,30.08,12-14 years,Male,EL SALVADOR,SBO,1.253333
4,2017-01-20 00:30:00,2017-01-21 13:03:00,36.53,3-5 years,Male,HONDURAS,SBO,1.522083
...,...,...,...,...,...,...,...,...
488813,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,6-8 years,Female,VENEZUELA,SBO,1.385417
488814,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,12-14 years,Female,VENEZUELA,SBO,1.385417
488815,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,9-11 years,Male,VENEZUELA,SBO,1.385417
488816,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,12-14 years,Female,VENEZUELA,SBO,1.385417


In [9]:
#round hours in custody 
cbp_df['days_in_custody'] = cbp_df.days_in_custody.round(2)
cbp_df

Unnamed: 0,date_in,date_out,hours_in_custody,age_group,gender,citizenship,border,days_in_custody
0,2017-01-20 00:10:00,2017-01-20 10:08:00,9.95,6-8 years,Female,EL SALVADOR,SBO,0.41
1,2017-01-20 00:15:00,2017-01-24 17:30:00,113.23,3-5 years,Female,GUATEMALA,SBO,4.72
2,2017-01-20 00:22:00,2017-01-24 17:47:00,113.42,3-5 years,Female,BRAZIL,SBO,4.73
3,2017-01-20 00:30:00,2017-01-21 06:35:00,30.08,12-14 years,Male,EL SALVADOR,SBO,1.25
4,2017-01-20 00:30:00,2017-01-21 13:03:00,36.53,3-5 years,Male,HONDURAS,SBO,1.52
...,...,...,...,...,...,...,...,...
488813,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,6-8 years,Female,VENEZUELA,SBO,1.39
488814,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,12-14 years,Female,VENEZUELA,SBO,1.39
488815,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,9-11 years,Male,VENEZUELA,SBO,1.39
488816,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,12-14 years,Female,VENEZUELA,SBO,1.39


In [10]:
#drop negative values 
cbp_df.drop(cbp_df.loc[cbp_df['hours_in_custody']<0].index, inplace=True)
cbp_df

Unnamed: 0,date_in,date_out,hours_in_custody,age_group,gender,citizenship,border,days_in_custody
0,2017-01-20 00:10:00,2017-01-20 10:08:00,9.95,6-8 years,Female,EL SALVADOR,SBO,0.41
1,2017-01-20 00:15:00,2017-01-24 17:30:00,113.23,3-5 years,Female,GUATEMALA,SBO,4.72
2,2017-01-20 00:22:00,2017-01-24 17:47:00,113.42,3-5 years,Female,BRAZIL,SBO,4.73
3,2017-01-20 00:30:00,2017-01-21 06:35:00,30.08,12-14 years,Male,EL SALVADOR,SBO,1.25
4,2017-01-20 00:30:00,2017-01-21 13:03:00,36.53,3-5 years,Male,HONDURAS,SBO,1.52
...,...,...,...,...,...,...,...,...
488813,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,6-8 years,Female,VENEZUELA,SBO,1.39
488814,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,12-14 years,Female,VENEZUELA,SBO,1.39
488815,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,9-11 years,Male,VENEZUELA,SBO,1.39
488816,2020-01-31 23:50:00,2020-02-02 09:06:00,33.25,12-14 years,Female,VENEZUELA,SBO,1.39


In [11]:
cbp_df.hours_in_custody.describe()

count    487761.000000
mean         69.592357
std          54.717287
min           0.000000
25%          35.880000
50%          58.930000
75%          89.220000
max         991.220000
Name: hours_in_custody, dtype: float64

In [12]:
# Export to csv
cbp_df.to_csv(r'/Users/ebonybrown/Desktop/My_Projects/CBP_Detentions/Resources/cbp_tab.csv', index_label="unique_id")
