# Cargo Theft Dataset Pre-Processing

In [1]:
# Import the necessary dependencies
import pandas as pd
import numpy as np

### Preprocessing the Data 

In [2]:
# Load the cargo theft records from the CSV export into a DataFrame,
# then preview the first five rows to sanity-check the parse.
theft_df = pd.read_csv("Resources/CT_2013_2021.csv")
theft_df.head(n=5)

Unnamed: 0,data_year,ori,pub_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,county_name,region_name,...,weapon_code,weapon_name,prop_desc_code,data_year.1,prop_desc_code.1,prop_desc_name,stolen_value,recovered_value,recovered_flag,date_recovered
0,2021,TX0030000,Angelina,,County,TX,Texas,West South Central,ANGELINA,South,...,,,20,2021,20,Money,375.0,0,False,
1,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,77,2021,77,Other,1.0,0,False,
2,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,65,2021,65,Identity Documents,0.0,0,False,
3,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,13,2021,13,Firearms,320.0,0,False,
4,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,77,2021,77,Other,1.0,0,False,


In [3]:
# Structural summary of the raw frame: row count, column names,
# non-null count per column, and each column's dtype.
theft_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151492 entries, 0 to 151491
Data columns (total 32 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   data_year              151492 non-null  int64  
 1   ori                    151492 non-null  object 
 2   pub_agency_name        151492 non-null  object 
 3   pub_agency_unit        1422 non-null    object 
 4   agency_type_name       151492 non-null  object 
 5   state_abbr             151492 non-null  object 
 6   state_name             151492 non-null  object 
 7   division_name          151492 non-null  object 
 8   county_name            151492 non-null  object 
 9   region_name            151492 non-null  object 
 10  population_group_code  151486 non-null  object 
 11  population_group_desc  151486 non-null  object 
 12  offense_code           151492 non-null  object 
 13  offense_name           151492 non-null  object 
 14  offender_race          83273 non-nul

In [4]:
# Number of distinct values in each column.
# dropna=True (the default, spelled out here) means missing values are
# not counted as a distinct value.
theft_df.nunique(axis=0, dropna=True)

data_year                   9
ori                      3912
pub_agency_name          2935
pub_agency_unit           147
agency_type_name            8
state_abbr                 50
state_name                 50
division_name              10
county_name              1227
region_name                 5
population_group_code      19
population_group_desc      19
offense_code               40
offense_name               40
offender_race               6
offender_ethnicity          4
offender_age               87
offender_sex                3
victim_type_code            9
victim_type_name            9
location_code              47
location_name              47
weapon_code                19
weapon_name                19
prop_desc_code             68
data_year.1                 9
prop_desc_code.1           68
prop_desc_name             68
stolen_value             5265
recovered_value          1144
recovered_flag              2
date_recovered           2431
dtype: int64

In [5]:
# Number of missing (null) values in each column.
theft_df.isna().sum()

data_year                     0
ori                           0
pub_agency_name               0
pub_agency_unit          150070
agency_type_name              0
state_abbr                    0
state_name                    0
division_name                 0
county_name                   0
region_name                   0
population_group_code         6
population_group_desc         6
offense_code                  0
offense_name                  0
offender_race             68219
offender_ethnicity        68219
offender_age              72765
offender_sex              68219
victim_type_code              0
victim_type_name              0
location_code                 0
location_name                 0
weapon_code              138505
weapon_name              138505
prop_desc_code                0
data_year.1                   0
prop_desc_code.1              0
prop_desc_name                0
stolen_value                271
recovered_value               0
recovered_flag                0
date_rec

In [6]:
# Number of rows that exactly repeat an earlier row across every column.
# keep='first' (the default, spelled out here) leaves the first occurrence
# unflagged, so only the later repeats are counted.
theft_df.duplicated(keep='first').sum()

14518

In [7]:
# Drop columns that are too sparse to use or that appear to be redundant.
# NOTE(review): `prop_desc_code.1` is kept while `prop_desc_code` is dropped,
# so the output retains the suffixed column name — confirm this is intended.
data_theft_df = theft_df.drop(
    columns=[
        'pub_agency_unit',   # populated in only 1,422 of 151,492 rows
        'weapon_code',       # null in 138,505 rows
        'weapon_name',       # null in 138,505 rows
        'prop_desc_code',    # appears to duplicate prop_desc_code.1
        'data_year.1',       # appears to duplicate data_year
        'date_recovered',    # blank in the previewed rows — TODO confirm rationale
    ]
)
print(data_theft_df.shape)
data_theft_df.head()

(151492, 26)


Unnamed: 0,data_year,ori,pub_agency_name,agency_type_name,state_abbr,state_name,division_name,county_name,region_name,population_group_code,...,offender_sex,victim_type_code,victim_type_name,location_code,location_name,prop_desc_code.1,prop_desc_name,stolen_value,recovered_value,recovered_flag
0,2021,TX0030000,Angelina,County,TX,Texas,West South Central,ANGELINA,South,8B,...,,I,Individual,25,Other/Unknown,20,Money,375.0,0,False
1,2021,AL0010000,Jefferson,County,AL,Alabama,East South Central,JEFFERSON,South,9A,...,U,I,Individual,20,Residence/Home,77,Other,1.0,0,False
2,2021,AL0010000,Jefferson,County,AL,Alabama,East South Central,JEFFERSON,South,9A,...,U,I,Individual,20,Residence/Home,65,Identity Documents,0.0,0,False
3,2021,AL0010000,Jefferson,County,AL,Alabama,East South Central,JEFFERSON,South,9A,...,U,I,Individual,20,Residence/Home,13,Firearms,320.0,0,False
4,2021,AL0010000,Jefferson,County,AL,Alabama,East South Central,JEFFERSON,South,9A,...,U,I,Individual,20,Residence/Home,77,Other,1.0,0,False


In [8]:
# Transformation: replace missing values with explicit placeholders.
#
# One DataFrame-level fillna (column -> fill value) is used instead of
# `data_theft_df[col].fillna(..., inplace=True)`. That chained in-place form
# operates on a temporary Series: pandas 2.x emits a FutureWarning for it and,
# with Copy-on-Write enabled (the default from pandas 3.0), the DataFrame is
# left unchanged, so the fills would silently not happen.
#
# NOTE(review): 0.0 / 0 are used as "not recorded" sentinels, so a filled
# offender_age or stolen_value cannot be told apart from a genuine zero —
# confirm downstream analysis accounts for this.
data_theft_df = data_theft_df.fillna({
    'offender_race': 'Unknown',
    'offender_ethnicity': 'Unknown',
    'offender_age': 0.0,
    'offender_sex': 'U',
    'stolen_value': 0,
})

In [9]:
# Discard the rows that lack a population group code or description.
# Only these two columns are checked; the per-column null tally that
# follows confirms no column retains missing values.
theft_cleaned_df = data_theft_df.dropna(
    axis=0,
    how='any',
    subset=['population_group_code', 'population_group_desc'],
)
theft_cleaned_df.isna().sum()

data_year                0
ori                      0
pub_agency_name          0
agency_type_name         0
state_abbr               0
state_name               0
division_name            0
county_name              0
region_name              0
population_group_code    0
population_group_desc    0
offense_code             0
offense_name             0
offender_race            0
offender_ethnicity       0
offender_age             0
offender_sex             0
victim_type_code         0
victim_type_name         0
location_code            0
location_name            0
prop_desc_code.1         0
prop_desc_name           0
stolen_value             0
recovered_value          0
recovered_flag           0
dtype: int64

In [10]:
# Check the cleaned frame's dimensions (rows, columns) and preview the
# first rows to confirm the placeholder fills are in effect.
print(theft_cleaned_df.shape)
theft_cleaned_df.head()  

(151486, 26)


Unnamed: 0,data_year,ori,pub_agency_name,agency_type_name,state_abbr,state_name,division_name,county_name,region_name,population_group_code,...,offender_sex,victim_type_code,victim_type_name,location_code,location_name,prop_desc_code.1,prop_desc_name,stolen_value,recovered_value,recovered_flag
0,2021,TX0030000,Angelina,County,TX,Texas,West South Central,ANGELINA,South,8B,...,U,I,Individual,25,Other/Unknown,20,Money,375.0,0,False
1,2021,AL0010000,Jefferson,County,AL,Alabama,East South Central,JEFFERSON,South,9A,...,U,I,Individual,20,Residence/Home,77,Other,1.0,0,False
2,2021,AL0010000,Jefferson,County,AL,Alabama,East South Central,JEFFERSON,South,9A,...,U,I,Individual,20,Residence/Home,65,Identity Documents,0.0,0,False
3,2021,AL0010000,Jefferson,County,AL,Alabama,East South Central,JEFFERSON,South,9A,...,U,I,Individual,20,Residence/Home,13,Firearms,320.0,0,False
4,2021,AL0010000,Jefferson,County,AL,Alabama,East South Central,JEFFERSON,South,9A,...,U,I,Individual,20,Residence/Home,77,Other,1.0,0,False


In [11]:
# Persist the cleaned data set; index=False keeps the DataFrame's row index
# out of the file so it is not written as an extra column.
theft_cleaned_df.to_csv('Resources/cargo_theft_cleaned.csv', index=False)