# Cargo Theft Data Clustering
## Import and Trim a Little More

In [1]:
# Start with magic
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Import the necessary dependencies.
import pandas as pd
import numpy as np
# NOTE(review): this binds the top-level `matplotlib` package to `plt`, not
# `matplotlib.pyplot`. Calls such as `plt.subplots()` or `plt.scatter()` would
# raise AttributeError with this binding. Presumably
# `import matplotlib.pyplot as plt` was intended -- confirm against the
# plotting cells before changing it.
import matplotlib as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [2]:
# Load the crime report export into a DataFrame and preview its first rows.
crime_report_path = "Datasets/CrimeReport.csv"
df = pd.read_csv(crime_report_path)
df.head()

Unnamed: 0,data_year,pub_agency_name,agency_type_name,state_name,division_name,county_name,region_name,population_group_code,population_group_desc,offense_code,...,location_code,location_name,weapon_code,weapon_name,prop_desc_code,prop_desc_name,stolen_value,recovered_value,recovered_flag,date_recovered
0,2021,Angelina,County,Texas,West South Central,ANGELINA,South,8B,"Non-MSA counties from 25,000 thru 99,999",26B,...,25,Other/Unknown,95,Unknown,20,Money,375.0,0,False,
1,2021,Jefferson,County,Alabama,East South Central,JEFFERSON,South,9A,"MSA counties 100,000 or over",23H,...,20,Residence/Home,95,Unknown,77,Other,1.0,0,False,
2,2021,Jefferson,County,Alabama,East South Central,JEFFERSON,South,9A,"MSA counties 100,000 or over",23H,...,20,Residence/Home,95,Unknown,65,Identity Documents,0.0,0,False,
3,2021,Jefferson,County,Alabama,East South Central,JEFFERSON,South,9A,"MSA counties 100,000 or over",23H,...,20,Residence/Home,95,Unknown,13,Firearms,320.0,0,False,
4,2021,Jefferson,County,Alabama,East South Central,JEFFERSON,South,9A,"MSA counties 100,000 or over",23F,...,20,Residence/Home,95,Unknown,77,Other,1.0,0,False,


In [3]:
# Columns excluded from the clustering features:
#   - the dates, the agency details, the offender attributes, and the weapon;
#   - the county and the region/division, because the state is kept as the
#     middle level of geographic detail between them.
dropped = [
    "data_year",
    "pub_agency_name",
    "agency_type_name",
    "division_name",
    "county_name",
    "region_name",
    "offender_race",
    "offender_ethnicity",
    "offender_age",
    "offender_sex",
    "weapon_code",
    "weapon_name",
    "date_recovered",
]

# `columns=` already targets the column axis, so no separate axis argument is needed.
df0 = df.drop(columns=dropped)
df0.head()

Unnamed: 0,state_name,population_group_code,population_group_desc,offense_code,offense_name,victim_type_code,victim_type_name,location_code,location_name,prop_desc_code,prop_desc_name,stolen_value,recovered_value,recovered_flag
0,Texas,8B,"Non-MSA counties from 25,000 thru 99,999",26B,Credit Card/Automated Teller Machine Fraud,I,Individual,25,Other/Unknown,20,Money,375.0,0,False
1,Alabama,9A,"MSA counties 100,000 or over",23H,All Other Larceny,I,Individual,20,Residence/Home,77,Other,1.0,0,False
2,Alabama,9A,"MSA counties 100,000 or over",23H,All Other Larceny,I,Individual,20,Residence/Home,65,Identity Documents,0.0,0,False
3,Alabama,9A,"MSA counties 100,000 or over",23H,All Other Larceny,I,Individual,20,Residence/Home,13,Firearms,320.0,0,False
4,Alabama,9A,"MSA counties 100,000 or over",23F,Theft From Motor Vehicle,I,Individual,20,Residence/Home,77,Other,1.0,0,False


## I. First Selection - The Stolen Values of Objects as a Whole

In [4]:
# First selection: the categorical descriptors of each record plus its stolen value.
stolen_cols = [
    "state_name",
    "population_group_code",
    "offense_code",
    "victim_type_code",
    "location_code",
    "prop_desc_code",
    "stolen_value",
]
stolen_df = df0[stolen_cols]
stolen_df.head()

Unnamed: 0,state_name,population_group_code,offense_code,victim_type_code,location_code,prop_desc_code,stolen_value
0,Texas,8B,26B,I,25,20,375.0
1,Alabama,9A,23H,I,20,77,1.0
2,Alabama,9A,23H,I,20,65,0.0
3,Alabama,9A,23H,I,20,13,320.0
4,Alabama,9A,23F,I,20,77,1.0


In [5]:
# Nulls should already be handled; count the rows that exactly repeat an
# earlier row across all selected columns.
stolen_df.duplicated(keep="first").sum()

24697

In [6]:
# Deal with the duplicates.
# drop_duplicates() returns a new DataFrame and leaves its input untouched, so
# the result must be bound back to stolen_df for the removal to take effect.
# Re-running this cell is safe: de-duplicating an already de-duplicated frame
# is a no-op. The original row labels are kept (no index reset).
stolen_df = stolen_df.drop_duplicates()
stolen_df

Unnamed: 0,state_name,population_group_code,offense_code,victim_type_code,location_code,prop_desc_code,stolen_value
0,Texas,8B,26B,I,25,20,375.0
1,Alabama,9A,23H,I,20,77,1.0
2,Alabama,9A,23H,I,20,65,0.0
3,Alabama,9A,23H,I,20,13,320.0
4,Alabama,9A,23F,I,20,77,1.0
...,...,...,...,...,...,...,...
122951,Florida,9A,23F,B,25,37,10000.0
122952,Florida,9A,23F,B,25,36,35.0
122953,South Carolina,9A,240,B,18,78,20000.0
122954,Tennessee,1B,23F,B,7,2,32.0


## II. Second Selection - Look at the Recovered Subset

In [7]:
# Set up the DataFrame.
# Second selection: only the rows flagged as recovered, restricted to the
# categorical descriptors plus the stolen and recovered values.
recov_cols = [
    "state_name",
    "population_group_code",
    "offense_code",
    "victim_type_code",
    "location_code",
    "prop_desc_code",
    "stolen_value",
    "recovered_value",
]

# Select rows and columns in a single .loc call and take an explicit copy.
# The previous chained form (df0.loc[mask][[cols]]) produced a frame that
# pandas may treat as a slice of df0, so adding a column to recov_df later
# would raise SettingWithCopyWarning.
is_recovered = df0["recovered_flag"].eq(True)
recov_df = df0.loc[is_recovered, recov_cols].copy()
recov_df.head()

Unnamed: 0,state_name,population_group_code,offense_code,victim_type_code,location_code,prop_desc_code,stolen_value,recovered_value
16,Alabama,9B,23H,I,20,9,0.0,0
42,West Virginia,9A,23H,I,25,77,5000.0,5000
46,Alabama,2,240,I,20,3,12000.0,12000
49,Alabama,2,240,I,20,3,11000.0,11000
66,Alabama,2,240,I,18,3,20000.0,20000


In [8]:
# Recovery rate = recovered value / stolen value.
# Rows whose stolen value is not strictly positive get a rate of 0 instead of
# the result of the division.
has_stolen_value = recov_df["stolen_value"] > 0
value_ratio = recov_df["recovered_value"] / recov_df["stolen_value"]
recov_df["recovery_rate"] = np.where(has_stolen_value, value_ratio, 0)
recov_df.head()

Unnamed: 0,state_name,population_group_code,offense_code,victim_type_code,location_code,prop_desc_code,stolen_value,recovered_value,recovery_rate
16,Alabama,9B,23H,I,20,9,0.0,0,0.0
42,West Virginia,9A,23H,I,25,77,5000.0,5000,1.0
46,Alabama,2,240,I,20,3,12000.0,12000,1.0
49,Alabama,2,240,I,20,3,11000.0,11000,1.0
66,Alabama,2,240,I,18,3,20000.0,20000,1.0


In [9]:
# Nulls should already be handled; count the rows that exactly repeat an
# earlier row across all selected columns.
recov_df.duplicated(keep="first").sum()

4239

In [10]:
# Deal with the duplicates.
# drop_duplicates() returns a new DataFrame and leaves its input untouched, so
# the result must be bound back to recov_df for the removal to take effect.
# Re-running this cell is safe: de-duplicating an already de-duplicated frame
# is a no-op. The original row labels are kept (no index reset).
recov_df = recov_df.drop_duplicates()
recov_df

Unnamed: 0,state_name,population_group_code,offense_code,victim_type_code,location_code,prop_desc_code,stolen_value,recovered_value,recovery_rate
16,Alabama,9B,23H,I,20,9,0.0,0,0.0
42,West Virginia,9A,23H,I,25,77,5000.0,5000,1.0
46,Alabama,2,240,I,20,3,12000.0,12000,1.0
49,Alabama,2,240,I,20,3,11000.0,11000,1.0
66,Alabama,2,240,I,18,3,20000.0,20000,1.0
...,...,...,...,...,...,...,...,...,...
122938,South Carolina,7,23F,B,18,7,3000.0,3000,1.0
122944,Florida,9A,240,B,0,37,10000.0,10000,1.0
122951,Florida,9A,23F,B,25,37,10000.0,10000,1.0
122953,South Carolina,9A,240,B,18,78,20000.0,20000,1.0
