# Cargo Theft Data Pre-Processing

## Import and Check Out the Data

In [1]:
# Matplotlib inline magic
%matplotlib inline

# Import the necessary dependencies
import pandas as pd
import numpy as np
import matplotlib as plt
import json

In [2]:
# Load the cargo theft CSV into a DataFrame and preview the first rows.
raw_df = pd.read_csv(filepath_or_buffer="Resources/CT_2013_2021.csv")
raw_df.head(n=5)

Unnamed: 0,data_year,ori,pub_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,county_name,region_name,...,weapon_code,weapon_name,prop_desc_code,data_year.1,prop_desc_code.1,prop_desc_name,stolen_value,recovered_value,recovered_flag,date_recovered
0,2021,TX0030000,Angelina,,County,TX,Texas,West South Central,ANGELINA,South,...,,,20,2021,20,Money,375.0,0,False,
1,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,77,2021,77,Other,1.0,0,False,
2,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,65,2021,65,Identity Documents,0.0,0,False,
3,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,13,2021,13,Firearms,320.0,0,False,
4,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,77,2021,77,Other,1.0,0,False,


In [3]:
# What are the columns in this DataFrame, and which dtype was inferred for each?
raw_df.dtypes

data_year                  int64
ori                       object
pub_agency_name           object
pub_agency_unit           object
agency_type_name          object
state_abbr                object
state_name                object
division_name             object
county_name               object
region_name               object
population_group_code     object
population_group_desc     object
offense_code              object
offense_name              object
offender_race             object
offender_ethnicity        object
offender_age             float64
offender_sex              object
victim_type_code          object
victim_type_name          object
location_code              int64
location_name             object
weapon_code               object
weapon_name               object
prop_desc_code             int64
data_year.1                int64
prop_desc_code.1           int64
prop_desc_name            object
stolen_value             float64
recovered_value            int64
recovered_

In [4]:
# Count the distinct (non-null) values held by every column.
raw_df.nunique(axis=0, dropna=True)

data_year                   9
ori                      3912
pub_agency_name          2935
pub_agency_unit           147
agency_type_name            8
state_abbr                 50
state_name                 50
division_name              10
county_name              1227
region_name                 5
population_group_code      19
population_group_desc      19
offense_code               40
offense_name               40
offender_race               6
offender_ethnicity          4
offender_age               87
offender_sex                3
victim_type_code            9
victim_type_name            9
location_code              47
location_name              47
weapon_code                19
weapon_name                19
prop_desc_code             68
data_year.1                 9
prop_desc_code.1           68
prop_desc_name             68
stolen_value             5265
recovered_value          1144
recovered_flag              2
date_recovered           2431
dtype: int64

In [5]:
# Tally the null/NaN entries per column.
# (value_counts() skips nulls, so they have to be counted separately.)
raw_df.isna().sum(axis=0)

data_year                     0
ori                           0
pub_agency_name               0
pub_agency_unit          150070
agency_type_name              0
state_abbr                    0
state_name                    0
division_name                 0
county_name                   0
region_name                   0
population_group_code         6
population_group_desc         6
offense_code                  0
offense_name                  0
offender_race             68219
offender_ethnicity        68219
offender_age              72765
offender_sex              68219
victim_type_code              0
victim_type_name              0
location_code                 0
location_name                 0
weapon_code              138505
weapon_name              138505
prop_desc_code                0
data_year.1                   0
prop_desc_code.1              0
prop_desc_name                0
stolen_value                271
recovered_value               0
recovered_flag                0
date_rec

In [6]:
# Some columns look like duplicates of one another.
# Print True for each pair whose contents are identical.
for source_column, suspected_clone in (
    ("data_year", "data_year.1"),
    ("prop_desc_code", "prop_desc_code.1"),
):
    print(raw_df[source_column].equals(raw_df[suspected_clone]))

True
True


In [7]:
# First transformation plan - drop the unnecessary columns:
# the agency identifier, the cloned columns, and the code columns
# (their name/description counterparts are kept instead).
dropped = [
    "ori",
    "data_year.1",
    "prop_desc_code.1",
    "population_group_code",
    "offense_code",
    "victim_type_code",
    "location_code",
    "weapon_code",
    "prop_desc_code",
]

In [8]:
# Execute the first transformation: build df0 without the columns listed in `dropped`.
# (`columns=` already targets the column axis, so no separate `axis` argument is needed.)
df0 = raw_df.drop(columns=dropped)
df0.head()

Unnamed: 0,data_year,pub_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,county_name,region_name,population_group_desc,...,offender_age,offender_sex,victim_type_name,location_name,weapon_name,prop_desc_name,stolen_value,recovered_value,recovered_flag,date_recovered
0,2021,Angelina,,County,TX,Texas,West South Central,ANGELINA,South,"Non-MSA counties from 25,000 thru 99,999",...,,,Individual,Other/Unknown,,Money,375.0,0,False,
1,2021,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,"MSA counties 100,000 or over",...,0.0,U,Individual,Residence/Home,,Other,1.0,0,False,
2,2021,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,"MSA counties 100,000 or over",...,0.0,U,Individual,Residence/Home,,Identity Documents,0.0,0,False,
3,2021,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,"MSA counties 100,000 or over",...,0.0,U,Individual,Residence/Home,,Firearms,320.0,0,False,
4,2021,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,"MSA counties 100,000 or over",...,0.0,U,Individual,Residence/Home,,Other,1.0,0,False,


## Investigate the Null Values
### What are the non-null values in these columns?

In [9]:
# Frequency of each non-null value in the 'pub_agency_unit' column.
df0["pub_agency_unit"].value_counts()

New Castle County    119
Keyser               109
Princeton            103
Lake Norman           75
Lake James            60
                    ... 
Gibson County          1
Hamilton County        1
Grayson County         1
Crawford County        1
Berkeley County        1
Name: pub_agency_unit, Length: 147, dtype: int64

In [10]:
# Frequency of each non-null value in the 'population_group_desc' column.
df0["population_group_desc"].value_counts()

Cities from 500,000 thru 999,999            20448
Cities from 25,000 thru 49,999              16711
Cities from 10,000 thru 24,999              16546
MSA counties 100,000 or over                16005
Cities from 50,000 thru 99,999              13123
Cities from 100,000 thru 249,999            12959
Cities from 2,500 thru 9,999                10181
Cities from 250,000 thru 499,999            10016
MSA counties from 25,000 thru 99,999         9178
Cities 1,000,000 or over                     7178
Non-MSA counties from 25,000 thru 99,999     5477
Non-MSA counties from 10,000 thru 24,999     4442
Cities under 2,500                           4330
Non-MSA counties under 10,000                1884
MSA counties from 10,000 thru 24,999         1749
MSA counties under 10,000                    1195
Non-MSA State Police                           58
Non-MSA counties 100,000 or over                4
MSA State Police                                2
Name: population_group_desc, dtype: int64

In [11]:
# Frequency of each non-null value in the 'offender_race' column.
df0["offender_race"].value_counts()

White                                        34179
Black or African American                    24209
Unknown                                      23878
American Indian or Alaska Native               711
Asian                                          277
Native Hawaiian or Other Pacific Islander       19
Name: offender_race, dtype: int64

In [12]:
# Frequency of each non-null value in the 'offender_ethnicity' column.
df0["offender_ethnicity"].value_counts()

Not Hispanic or Latino    31554
Not Specified             23923
Unknown                   23702
Hispanic or Latino         4094
Name: offender_ethnicity, dtype: int64

In [13]:
# Frequency of each non-null 'offender_age', listed in ascending age order.
df0["offender_age"].value_counts().sort_index()

0.0     19106
1.0       170
4.0         4
5.0         7
6.0         3
        ...  
85.0        1
86.0        3
87.0        3
98.0       20
99.0      869
Name: offender_age, Length: 87, dtype: int64

In [14]:
# Frequency of each non-null value in the 'offender_sex' column.
df0["offender_sex"].value_counts()

M    49186
U    20894
F    13193
Name: offender_sex, dtype: int64

In [15]:
# Frequency of each non-null value in the 'weapon_name' column.
df0["weapon_name"].value_counts()

Handgun                           4369
Personal Weapons                  3117
None                              1677
Other                              715
Knife/Cutting Instrument           710
Firearm                            679
Unknown                            460
Handgun (Automatic)                332
Blunt Object                       229
Rifle                              214
Motor Vehicle/Vessel               174
Other Firearm                       89
Firearm (Automatic)                 88
Shotgun                             79
Asphyxiation                        22
Drugs/Narcotics/Sleeping Pills      18
Rifle (Automatic)                   11
Shotgun (Automatic)                  2
Poison                               2
Name: weapon_name, dtype: int64

In [16]:
# Frequency of each non-null 'stolen_value', listed in ascending value order.
df0["stolen_value"].value_counts().sort_index()

0.0            14676
1.0            20088
2.0             1346
3.0              510
4.0              322
               ...  
25000000.0         6
25496498.0         5
120029400.0        1
250000000.0        1
311128346.0        2
Name: stolen_value, Length: 5265, dtype: int64

In [17]:
# Frequency of each non-null value in the 'date_recovered' column.
df0["date_recovered"].value_counts()

2019-10-09    127
2016-10-07    121
2020-09-09    108
2020-12-28    104
2020-08-19     97
             ... 
2016-03-25      1
2016-10-31      1
2016-03-07      1
2016-02-08      1
2013-03-11      1
Name: date_recovered, Length: 2431, dtype: int64

In [18]:
# Second transformation - fix null values.
# Each column below is paired with the placeholder that replaces its nulls.
null_fillers = {
    # EASY - categorical nulls take the column's existing "unknown" label.
    "offender_race": "Unknown",
    "offender_ethnicity": "Unknown",
    "offender_sex": "U",
    "weapon_name": "Unknown",
    # Assume 'age 0' is the blanket bin for undefined ages.
    "offender_age": 0.0,
    # Assume a value of '0.0' is the blanket bin for undefined stolen values.
    "stolen_value": 0.0,
}

# Overwrite each column of df0 with its null-filled version.
for column_name, filler in null_fillers.items():
    df0[column_name] = df0[column_name].fillna(filler)

The `population_group_desc` column does not look like it will work with "Unknown" as a value, so take a closer look at the rows where it is null.

In [19]:
# Show the rows whose 'population_group_desc' value is null.
missing_population_group = df0["population_group_desc"].isna()
df0.loc[missing_population_group]

Unnamed: 0,data_year,pub_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,county_name,region_name,population_group_desc,...,offender_age,offender_sex,victim_type_name,location_name,weapon_name,prop_desc_name,stolen_value,recovered_value,recovered_flag,date_recovered
2057,2021,Federal Bureau of Investigation,,Federal,FS,Federal,Other,NOT SPECIFIED,Other,,...,0.0,U,Government,Commercial/Office Building,Unknown,Firearm Accessories,700.0,0,False,
24746,2021,United States Air Force Security Police,,Federal,FS,Federal,Other,NOT SPECIFIED,Other,,...,0.0,U,Individual,Residence/Home,Unknown,Other,350.0,0,False,
68199,2020,United States Agency for International Develop...,,Federal,FS,Federal,Other,NOT SPECIFIED,Other,,...,0.0,U,Government,Parking/Drop Lot/Garage,Unknown,Other,3000000.0,0,False,
73855,2020,United States Agency for International Develop...,,Federal,FS,Federal,Other,NOT SPECIFIED,Other,,...,42.0,M,Government,Parking/Drop Lot/Garage,Unknown,Other,3000000.0,3000000,True,2020-08-16
73856,2020,United States Agency for International Develop...,,Federal,FS,Federal,Other,NOT SPECIFIED,Other,,...,37.0,F,Government,Parking/Drop Lot/Garage,Unknown,Other,3000000.0,3000000,True,2020-08-16
73857,2020,United States Agency for International Develop...,,Federal,FS,Federal,Other,NOT SPECIFIED,Other,,...,25.0,M,Government,Parking/Drop Lot/Garage,Unknown,Other,3000000.0,3000000,True,2020-08-16


In [20]:
# This is a very tricky case... label the null population groups 'Nonlocal' for now.
df0.loc[df0["population_group_desc"].isna(), "population_group_desc"] = "Nonlocal"

In [21]:
# Progress check: how many nulls remain in each column?
df0.isna().sum(axis=0)

data_year                     0
pub_agency_name               0
pub_agency_unit          150070
agency_type_name              0
state_abbr                    0
state_name                    0
division_name                 0
county_name                   0
region_name                   0
population_group_desc         0
offense_name                  0
offender_race                 0
offender_ethnicity            0
offender_age                  0
offender_sex                  0
victim_type_name              0
location_name                 0
weapon_name                   0
prop_desc_name                0
stolen_value                  0
recovered_value               0
recovered_flag                0
date_recovered           130021
dtype: int64

The "easy" columns are done. The `date_recovered` column does not need to be handled yet, so the main focus is now on the `pub_agency_unit` column which will be more complicated.

In [22]:
# Are the public agency units maybe related to the public agency names?
# Count the agency names among rows that do have a 'pub_agency_unit'.
df0.loc[df0["pub_agency_unit"].notna(), "pub_agency_name"].value_counts()

State Police:                              950
State Park Rangers:                        277
University of North Carolina:               44
Independent School District:                23
University of Tennessee:                    21
University of Massachusetts:                17
North Carolina State University             13
University of Arkansas:                     11
West Virginia University:                    9
Division of Law Enforcement                  8
University of Alabama:                       7
Huron-Clinton Metropolitan Authority:        7
University of Texas:                         7
Highway Patrol:                              7
University of South Carolina:                5
Department of Game and Inland Fisheries      4
Port Authority                               3
University of Houston:                       2
University of Wisconsin:                     1
Drug Task Force:                             1
North Carolina State Port Authority:         1
Kaufman Count

So non-null public agency units fit into 25 public agency names. There seems to be some kind of pattern...

In [23]:
# Which agency types do the rows with a 'pub_agency_unit' belong to?
df0.loc[df0["pub_agency_unit"].notna(), "agency_type_name"].value_counts()

State Police             957
Other State Agency       289
University or College    140
Other                     36
Name: agency_type_name, dtype: int64

In [24]:
# For comparison: the agency type distribution across the entire DataFrame.
df0["agency_type_name"].value_counts()

City                     109372
County                    38808
State Police               1140
University or College       762
Other                       758
Other State Agency          570
Tribal                       68
Federal                      14
Name: agency_type_name, dtype: int64