In [2]:
#import dependencies 
from path import Path
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
import psycopg2
import time
from datetime import datetime as dt

In [3]:
# Load the raw CBP dataset from the Resources folder (path is relative to the
# notebook's working directory).
file_path = Path("./Resources/cbp_data.csv")

# low_memory=False asks pandas not to process the file in internal chunks,
# which avoids mixed-type inference within a column.
cbp_df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Preview the first five rows.
cbp_df.head(n=5)

Unnamed: 0,date_in,date_out,app_date,hours_in_custody,age_group,gender,citizenship,border,sector,field_office,source
0,2017-01-20 00:10:00,2017-01-20 10:08:00,,9.95,6-8 years,Female,EL SALVADOR,SBO,(b)(7)(E ),,BP
1,2017-01-20 00:15:00,2017-01-24 17:30:00,,113.233333,3-5 years,Female,GUATEMALA,SBO,(b)(7)(E ),,BP
2,2017-01-20 00:22:00,2017-01-24 17:47:00,,113.416667,3-5 years,Female,BRAZIL,SBO,(b)(7)(E ),,BP
3,2017-01-20 00:30:00,2017-01-21 06:35:00,,30.083333,12-14 years,Male,EL SALVADOR,SBO,(b)(7)(E ),,BP
4,2017-01-20 00:30:00,2017-01-21 13:03:00,,36.533333,3-5 years,Male,HONDURAS,SBO,(b)(7)(E ),,BP


In [4]:
# Discard the columns that are not needed for this analysis, then remove every
# row that still contains a missing value in any of the remaining columns.
cbp_df = (
    cbp_df
    .drop(columns=["app_date", "border", "sector", "field_office", "source"])
    .dropna()
)
cbp_df.head()

Unnamed: 0,date_in,date_out,hours_in_custody,age_group,gender,citizenship
0,2017-01-20 00:10:00,2017-01-20 10:08:00,9.95,6-8 years,Female,EL SALVADOR
1,2017-01-20 00:15:00,2017-01-24 17:30:00,113.233333,3-5 years,Female,GUATEMALA
2,2017-01-20 00:22:00,2017-01-24 17:47:00,113.416667,3-5 years,Female,BRAZIL
3,2017-01-20 00:30:00,2017-01-21 06:35:00,30.083333,12-14 years,Male,EL SALVADOR
4,2017-01-20 00:30:00,2017-01-21 13:03:00,36.533333,3-5 years,Male,HONDURAS


In [5]:
# Format date columns

# Parse each timestamp column exactly once and reuse the parsed Series for
# both derived fields; string-to-datetime parsing is the expensive step here.
parsed_date_in = pd.to_datetime(cbp_df["date_in"])
parsed_date_out = pd.to_datetime(cbp_df["date_out"])

# Break down months/years of detentions
cbp_df["year_in"] = parsed_date_in.dt.year
cbp_df["month_in"] = parsed_date_in.dt.month
cbp_df["year_out"] = parsed_date_out.dt.year
cbp_df["month_out"] = parsed_date_out.dt.month
cbp_df

Unnamed: 0,date_in,date_out,hours_in_custody,age_group,gender,citizenship,year_in,month_in,year_out,month_out
0,2017-01-20 00:10:00,2017-01-20 10:08:00,9.950000,6-8 years,Female,EL SALVADOR,2017,1,2017,1
1,2017-01-20 00:15:00,2017-01-24 17:30:00,113.233333,3-5 years,Female,GUATEMALA,2017,1,2017,1
2,2017-01-20 00:22:00,2017-01-24 17:47:00,113.416667,3-5 years,Female,BRAZIL,2017,1,2017,1
3,2017-01-20 00:30:00,2017-01-21 06:35:00,30.083333,12-14 years,Male,EL SALVADOR,2017,1,2017,1
4,2017-01-20 00:30:00,2017-01-21 13:03:00,36.533333,3-5 years,Male,HONDURAS,2017,1,2017,1
...,...,...,...,...,...,...,...,...,...,...
583803,2020-06-18 14:00:55,2020-06-18 18:47:00,4.770000,15-18 years,M - MALE,MEXICO,2020,6,2020,6
583804,2020-06-18 16:07:39,2020-06-19 13:45:00,21.620000,15-18 years,M - MALE,MEXICO,2020,6,2020,6
583805,2020-06-19 09:34:16,2020-06-19 12:23:00,2.810000,12-14 years,F - FEMALE,MEXICO,2020,6,2020,6
583806,2020-06-19 14:40:00,2020-06-20 11:25:45,20.760000,15-18 years,F - FEMALE,MEXICO,2020,6,2020,6


In [6]:
# The year/month breakdown columns replace the raw timestamps, so the
# original date_in / date_out columns are removed here.
cbp_df = cbp_df.drop(columns=["date_in", "date_out"])
cbp_df

Unnamed: 0,hours_in_custody,age_group,gender,citizenship,year_in,month_in,year_out,month_out
0,9.950000,6-8 years,Female,EL SALVADOR,2017,1,2017,1
1,113.233333,3-5 years,Female,GUATEMALA,2017,1,2017,1
2,113.416667,3-5 years,Female,BRAZIL,2017,1,2017,1
3,30.083333,12-14 years,Male,EL SALVADOR,2017,1,2017,1
4,36.533333,3-5 years,Male,HONDURAS,2017,1,2017,1
...,...,...,...,...,...,...,...,...
583803,4.770000,15-18 years,M - MALE,MEXICO,2020,6,2020,6
583804,21.620000,15-18 years,M - MALE,MEXICO,2020,6,2020,6
583805,2.810000,12-14 years,F - FEMALE,MEXICO,2020,6,2020,6
583806,20.760000,15-18 years,F - FEMALE,MEXICO,2020,6,2020,6


In [7]:
# Clean gender column

# Mapping from the "X - LABEL" style codes to the plain labels used by the
# rest of the data; values not listed here are left untouched by replace().
genders = {
    "M - MALE": "Male",
    "F - FEMALE": "Female",
    "U - UNKNOWN": "Unknown",
}

# Apply the mapping to the gender column.
cbp_df["gender"] = cbp_df["gender"].replace(genders)

In [8]:
# Keep hours_in_custody to two decimal places.
cbp_df["hours_in_custody"] = cbp_df["hours_in_custody"].round(decimals=2)
cbp_df

Unnamed: 0,hours_in_custody,age_group,gender,citizenship,year_in,month_in,year_out,month_out
0,9.95,6-8 years,Female,EL SALVADOR,2017,1,2017,1
1,113.23,3-5 years,Female,GUATEMALA,2017,1,2017,1
2,113.42,3-5 years,Female,BRAZIL,2017,1,2017,1
3,30.08,12-14 years,Male,EL SALVADOR,2017,1,2017,1
4,36.53,3-5 years,Male,HONDURAS,2017,1,2017,1
...,...,...,...,...,...,...,...,...
583803,4.77,15-18 years,Male,MEXICO,2020,6,2020,6
583804,21.62,15-18 years,Male,MEXICO,2020,6,2020,6
583805,2.81,12-14 years,Female,MEXICO,2020,6,2020,6
583806,20.76,15-18 years,Female,MEXICO,2020,6,2020,6


In [9]:
# Export
# Write the cleaned data into the same relative Resources folder the raw file
# is loaded from, instead of an absolute path under one user's home
# directory, so the notebook runs unchanged on other machines.
# NOTE(review): assumes the notebook is run from the project root, as the
# relative path in the load cell already requires.
cbp_df.to_csv('./Resources/cbp_clean.csv')