In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the original FBI CSV export into a DataFrame and preview the first rows.

fbi_og = pd.read_csv(filepath_or_buffer="FBIDataSetOriginal.csv")
fbi_og.head(n=5)

Unnamed: 0,data_year,ori,pub_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,county_name,region_name,...,weapon_code,weapon_name,prop_desc_code,data_year.1,prop_desc_code.1,prop_desc_name,stolen_value,recovered_value,recovered_flag,date_recovered
0,2021,TX0030000,Angelina,,County,TX,Texas,West South Central,ANGELINA,South,...,,,20,2021,20,Money,375.0,0,False,
1,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,77,2021,77,Other,1.0,0,False,
2,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,65,2021,65,Identity Documents,0.0,0,False,
3,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,13,2021,13,Firearms,320.0,0,False,
4,2021,AL0010000,Jefferson,,County,AL,Alabama,East South Central,JEFFERSON,South,...,,,77,2021,77,Other,1.0,0,False,


In [5]:
# Show the dtype pandas inferred for every column of the raw frame.
# Used as a reference when drawing the ERD and writing the SQL schema.

fbi_og.dtypes

data_year                  int64
ori                       object
pub_agency_name           object
pub_agency_unit           object
agency_type_name          object
state_abbr                object
state_name                object
division_name             object
county_name               object
region_name               object
population_group_code     object
population_group_desc     object
offense_code              object
offense_name              object
offender_race             object
offender_ethnicity        object
offender_age             float64
offender_sex              object
victim_type_code          object
victim_type_name          object
location_code              int64
location_name             object
weapon_code               object
weapon_name               object
prop_desc_code             int64
data_year.1                int64
prop_desc_code.1           int64
prop_desc_name            object
stolen_value             float64
recovered_value            int64
recovered_

In [12]:
# Count the missing values in each column of the raw frame to see where the nulls are.
fbi_og.isnull().sum()

data_year                     0
ori                           0
pub_agency_name               0
pub_agency_unit          150070
agency_type_name              0
state_abbr                    0
state_name                    0
division_name                 0
county_name                   0
region_name                   0
population_group_code         6
population_group_desc         6
offense_code                  0
offense_name                  0
offender_race             68219
offender_ethnicity        68219
offender_age              72765
offender_sex              68219
victim_type_code              0
victim_type_name              0
location_code                 0
location_name                 0
weapon_code              138505
weapon_name              138505
prop_desc_code                0
data_year.1                   0
prop_desc_code.1              0
prop_desc_name                0
stolen_value                271
recovered_value               0
recovered_flag                0
date_rec

In [19]:
# First pass at trimming the raw frame into a smaller working frame.
# The raw frame (fbi_og) is left untouched; drop() returns a new DataFrame.
# NOTE(review): "prop_desc_code.1" is dropped but "data_year.1" is kept —
# confirm whether that second year column is still wanted.
fbi_trimmed = fbi_og.drop(
    columns=[
        # pub_agency_unit seems redundant, so it and a few others are dropped
        # to clean up the dataframe.
        "ori",
        "pub_agency_unit",
        "state_name",
        "prop_desc_code.1",
        # Weapon name and code are perhaps unneeded; prop_desc_code will most
        # likely include weapons.
        "weapon_code",
        "weapon_name",
        # Mostly blank, because most property is never recovered.
        "date_recovered",
        # Descriptive names are removed to simplify things; the codes are kept
        # and are explained in the attached Word document.
        "prop_desc_name",
        "victim_type_name",
        "location_name",
        "offense_name",
        "population_group_desc",
    ]
)
fbi_trimmed.head(n=5)

Unnamed: 0,data_year,pub_agency_name,agency_type_name,state_abbr,division_name,county_name,region_name,population_group_code,offense_code,offender_race,offender_ethnicity,offender_age,offender_sex,victim_type_code,location_code,prop_desc_code,data_year.1,stolen_value,recovered_value,recovered_flag
0,2021,Angelina,County,TX,West South Central,ANGELINA,South,8B,26B,,,,,I,25,20,2021,375.0,0,False
1,2021,Jefferson,County,AL,East South Central,JEFFERSON,South,9A,23H,Unknown,Unknown,0.0,U,I,20,77,2021,1.0,0,False
2,2021,Jefferson,County,AL,East South Central,JEFFERSON,South,9A,23H,Unknown,Unknown,0.0,U,I,20,65,2021,0.0,0,False
3,2021,Jefferson,County,AL,East South Central,JEFFERSON,South,9A,23H,Unknown,Unknown,0.0,U,I,20,13,2021,320.0,0,False
4,2021,Jefferson,County,AL,East South Central,JEFFERSON,South,9A,23F,Unknown,Unknown,0.0,U,I,20,77,2021,1.0,0,False


In [20]:
# Inspect the column dtypes of the trimmed frame.
# This listing can serve as the baseline SQL table when starting the ERD,
# and smaller datasets can be split off from it later.
fbi_trimmed.dtypes

data_year                  int64
pub_agency_name           object
agency_type_name          object
state_abbr                object
division_name             object
county_name               object
region_name               object
population_group_code     object
offense_code              object
offender_race             object
offender_ethnicity        object
offender_age             float64
offender_sex              object
victim_type_code          object
location_code              int64
prop_desc_code             int64
data_year.1                int64
stolen_value             float64
recovered_value            int64
recovered_flag              bool
dtype: object

In [22]:
# Re-count the missing values per column, now on the trimmed frame.
fbi_trimmed.isnull().sum()

data_year                    0
pub_agency_name              0
agency_type_name             0
state_abbr                   0
division_name                0
county_name                  0
region_name                  0
population_group_code        6
offense_code                 0
offender_race            68219
offender_ethnicity       68219
offender_age             72765
offender_sex             68219
victim_type_code             0
location_code                0
prop_desc_code               0
data_year.1                  0
stolen_value               271
recovered_value              0
recovered_flag               0
dtype: int64

In [28]:
# How many distinct (non-null) values does each null-bearing offender column have?
fbi_trimmed.loc[
    :, ["offender_race", "offender_ethnicity", "offender_age", "offender_sex"]
].nunique()

offender_race          6
offender_ethnicity     4
offender_age          87
offender_sex           3
dtype: int64

In [31]:
# Go through the offender columns one at a time, starting with race.
fbi_trimmed["offender_race"].unique()

array([nan, 'Unknown', 'White', 'Black or African American', 'Asian',
       'American Indian or Alaska Native',
       'Native Hawaiian or Other Pacific Islander'], dtype=object)

In [32]:
# The column already uses an "Unknown" category, so fold the missing values into it.
fbi_trimmed["offender_race"] = fbi_trimmed["offender_race"].fillna(value="Unknown")

In [40]:
# Verify the fill: nan should no longer appear among the race values.
fbi_trimmed["offender_race"].unique()

array(['Unknown', 'White', 'Black or African American', 'Asian',
       'American Indian or Alaska Native',
       'Native Hawaiian or Other Pacific Islander'], dtype=object)

In [41]:
# Same procedure for the next column: list the ethnicity values first.
fbi_trimmed["offender_ethnicity"].unique()

array([nan, 'Unknown', 'Not Hispanic or Latino', 'Not Specified',
       'Hispanic or Latino'], dtype=object)

In [42]:
# Replace missing ethnicity values with the column's "Unknown" label.
fbi_trimmed["offender_ethnicity"] = fbi_trimmed["offender_ethnicity"].fillna(
    value="Unknown"
)

In [43]:
# Confirm that nan is gone from the ethnicity values.
fbi_trimmed["offender_ethnicity"].unique()

array(['Unknown', 'Not Hispanic or Latino', 'Not Specified',
       'Hispanic or Latino'], dtype=object)

In [44]:
# List the distinct offender ages to see how missing/unknown ages are represented.
fbi_trimmed["offender_age"].unique()

array([nan,  0., 99., 20., 28., 37., 29., 35., 32., 36., 24., 16., 52.,
       31., 41., 40., 45., 26., 34., 56., 50., 47., 38., 61., 42., 39.,
       18., 33., 69., 27., 58., 25., 14., 43., 30., 22., 23., 49., 17.,
       21., 53., 15., 60., 46., 51., 62.,  1., 63., 48., 55., 19., 54.,
       98., 13., 65., 64., 10., 44., 12., 71.,  9., 81., 66., 57., 59.,
       70., 68.,  5., 67., 73., 75., 85., 80., 72., 11., 74.,  7., 86.,
       78., 76.,  8.,  6., 77., 83., 79., 87., 82.,  4.])

In [45]:
# Fill missing ages with 0.
# NOTE(review): this assumes 0 is the dataset's code for an unknown age —
# confirm against the code book before relying on it.
fbi_trimmed["offender_age"] = fbi_trimmed["offender_age"].fillna(value=0)

In [50]:
# Next column: list the values used for offender sex.
fbi_trimmed["offender_sex"].unique()

array([nan, 'U', 'M', 'F'], dtype=object)

In [51]:
# "U" is taken to mean unknown, so missing values are filled with it.
fbi_trimmed["offender_sex"] = fbi_trimmed["offender_sex"].fillna(value="U")

In [54]:
# Finally, replace the missing stolen_value entries with 0.0.
# NOTE(review): after this, a missing value cannot be told apart from a
# reported value of 0.0 — confirm that is acceptable for the analysis.
fbi_trimmed["stolen_value"] = fbi_trimmed["stolen_value"].fillna(value=0.0)

In [57]:
# Re-count nulls after the fills above: the offender_* and stolen_value
# columns should now report 0.
# NOTE(review): population_group_code is not filled by any cell above, so the
# nulls it had in the earlier count are still present.
fbi_trimmed.isnull().sum()

data_year                0
pub_agency_name          0
agency_type_name         0
state_abbr               0
division_name            0
county_name              0
region_name              0
population_group_code    6
offense_code             0
offender_race            0
offender_ethnicity       0
offender_age             0
offender_sex             0
victim_type_code         0
location_code            0
prop_desc_code           0
data_year.1              0
stolen_value             0
recovered_value          0
recovered_flag           0
dtype: int64

In [59]:
# Print the column dtypes again, after cleaning, for use in the SQL schema.
fbi_trimmed.dtypes

data_year                  int64
pub_agency_name           object
agency_type_name          object
state_abbr                object
division_name             object
county_name               object
region_name               object
population_group_code     object
offense_code              object
offender_race             object
offender_ethnicity        object
offender_age             float64
offender_sex              object
victim_type_code          object
location_code              int64
prop_desc_code             int64
data_year.1                int64
stolen_value             float64
recovered_value            int64
recovered_flag              bool
dtype: object

In [61]:
# Save the cleaned frame: this CSV is a reasonable jumping-off point, and the
# smaller per-diagram tables can perhaps be split off from it.
# NOTE(review): with to_csv defaults the row index is also written, as a first
# column with an empty header — confirm the SQL import expects that column,
# otherwise pass index=False.

fbi_trimmed.to_csv(path_or_buf="FBI_DATA_CLEANED1.csv")