<a href="https://colab.research.google.com/github/MattIzon/16010269_DataAnalytics/blob/main/1_Crime_Aquisition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Set-up
from google.colab import auth, files
auth.authenticate_user()
print('Authenticated')

from google.cloud import bigquery
# from google.colab import drive
# drive.mount('/gdrive', force_remount=True)
import pandas as pd
pd.set_option('display.max_columns', None)

Authenticated
Mounted at /gdrive


In [48]:
# Use big query to get the crime data
client = bigquery.Client(project='assignment-305921')

crime = []

for year in range(2016, 2021):
  crime.append(client.query('''
  SELECT *
  FROM `bigquery-public-data.chicago_crime.crime`
  WHERE year = {}
  ORDER BY date
  '''.format(year)).to_dataframe())

In [49]:
# inspect the data and decide:
# a) Which columns may be useful
# b) Which columns need prep before they can be used - missing data, wrong format etc

crime[0].head()

# At first glance: unique_key, case_number, block, description, arrest, fbi_code, x_coordinate, y_coordinate, year, updated_on dont appear to have any bearing on crime numbers.

# date requires reformatting

# Types of crime could increase on certain days.
# iucr seems to be a reduction of primary_type and description into a single field.
# This needs investigating and decision made as to which type of categorisation is more useful.

# location_description could show crime increasing in certain locations on given days

# domestic crime could increase on weekends which would impact crime rates on particular days

# beat, district, ward and community_area provide different levels of location boundary to a crime. 
# Crime patterns could emerge in specific locals. A choice of boundary level needs to be made.

# latitude and longitude / location provide a means of connection with weather data.
# location is a reduction of lat / lon. If data is missing one may be used to fill the other.

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,10597322,HZ348532,2016-01-01 01:00:00+00:00,057XX N KIMBALL AVE,0890,THEFT,FROM BUILDING,RESIDENCE,False,False,1711,17,39,13,06,1152676.0,1937936.0,2016,2018-02-10 03:50:01+00:00,41.985557,-87.713835,"(41.985556978, -87.713834875)"
1,10366068,HZ101782,2016-01-01 01:00:00+00:00,037XX N LAKEWOOD AVE,0620,BURGLARY,UNLAWFUL ENTRY,APARTMENT,False,False,1923,19,44,6,05,1166964.0,1924916.0,2016,2018-02-10 03:50:01+00:00,41.949534,-87.66166,"(41.949534028, -87.661660031)"
2,10365294,HZ100944,2016-01-01 01:00:00+00:00,040XX W LAKE ST,0820,THEFT,$500 AND UNDER,CTA TRAIN,False,False,1114,11,28,26,06,1149512.0,1901446.0,2016,2018-02-10 03:50:01+00:00,41.885488,-87.726422,"(41.885487535, -87.726422045)"
3,10364804,HZ100234,2016-01-01 01:00:00+00:00,009XX W 78TH ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,621,6,17,71,08B,1171211.0,1853110.0,2016,2018-02-10 03:50:01+00:00,41.7524,-87.648155,"(41.752400042, -87.648154805)"
4,10417935,HZ100345,2016-01-01 01:00:00+00:00,094XX S PARNELL AVE,031A,ROBBERY,ARMED: HANDGUN,STREET,False,False,2223,22,21,73,03,1174288.0,1842388.0,2016,2018-02-10 03:50:01+00:00,41.72291,-87.637197,"(41.722909672, -87.637196769)"


In [50]:
# Immediately drop the unnecessary fields
no_use = ['unique_key', 'case_number', 'block', 'description', 'arrest', 'fbi_code', 'x_coordinate', 'y_coordinate', 'year', 'updated_on']
for year in crime:
  year.drop(no_use, axis=1, inplace=True)

In [52]:
# Check for missing data
index = 0
for year in crime:
  print('Missing Data ', 2016 + index, ':')
  for column in year:
    print('  ', column, ': ', year[column].isna().any())
  index += 1
  print()
# - location_description, district (2017), ward (2017/2018/2019/2020), community_area (2020), latitude, longitude and location have missing data
# - district, ward and community_area are linked
# - latitude, longitude and location are linked

Missing Data  2016 :
   date :  False
   iucr :  False
   primary_type :  False
   location_description :  True
   domestic :  False
   beat :  False
   district :  False
   ward :  False
   community_area :  False
   latitude :  True
   longitude :  True
   location :  True

Missing Data  2017 :
   date :  False
   iucr :  False
   primary_type :  False
   location_description :  True
   domestic :  False
   beat :  False
   district :  True
   ward :  True
   community_area :  False
   latitude :  True
   longitude :  True
   location :  True

Missing Data  2018 :
   date :  False
   iucr :  False
   primary_type :  False
   location_description :  True
   domestic :  False
   beat :  False
   district :  False
   ward :  True
   community_area :  False
   latitude :  True
   longitude :  True
   location :  True

Missing Data  2019 :
   date :  False
   iucr :  False
   primary_type :  False
   location_description :  True
   domestic :  False
   beat :  False
   district :  False
 

In [55]:
# Inspect missing data
index = 0
for year in crime:
  print(2016 + index, ':')
  total_rows = year.shape[0]
  print('Total rows: ', total_rows)
  missing_columns = ['location_description', 'district', 'ward', 'community_area', 'latitude', 'longitude', 'location']
  for column in missing_columns:
    missing_count = sum(year[column].isna())
    print(column, ' missing count: ', missing_count, ' - Percentage of total: ', (missing_count/total_rows)*100)
  index += 1
  print()

# - latitude, longitude and location have equal numbers of missing data. If the missing items are all in the same rows the location column is not required.
# - Potentially district, ward and community_area can be patched using each other's values. Further inspection to determine their usefulness should be completed first.

2016 :
Total rows:  269706
location_description  missing count:  1276  - Percentage of total:  0.47310775436957275
district  missing count:  0  - Percentage of total:  0.0
ward  missing count:  0  - Percentage of total:  0.0
community_area  missing count:  0  - Percentage of total:  0.0
latitude  missing count:  2451  - Percentage of total:  0.908767324419924
longitude  missing count:  2451  - Percentage of total:  0.908767324419924
location  missing count:  2451  - Percentage of total:  0.908767324419924

2017 :
Total rows:  268951
location_description  missing count:  1276  - Percentage of total:  0.47443586378187846
district  missing count:  1  - Percentage of total:  0.0003718149402679298
ward  missing count:  1  - Percentage of total:  0.0003718149402679298
community_area  missing count:  0  - Percentage of total:  0.0
latitude  missing count:  4005  - Percentage of total:  1.489118835773059
longitude  missing count:  4005  - Percentage of total:  1.489118835773059
location  missi

In [56]:
# Check if latitude, longitude and location's missing data are all in the same rows.
index = 0
for year in crime:
  print(sum(year['latitude'].isna() & year['longitude'].isna() & year['location'].isna()))
  index += 1
  print()

# - It is. Location column can be dropped.

2451

4005

5143

1889

3288



In [57]:
# Drop the location field
no_use = ['location']
for year in crime:
  year.drop(no_use, axis=1, inplace=True)

In [58]:
print(crime[0].head())

                       date  iucr primary_type location_description  domestic  \
0 2016-01-01 01:00:00+00:00  0890        THEFT            RESIDENCE     False   
1 2016-01-01 01:00:00+00:00  0620     BURGLARY            APARTMENT     False   
2 2016-01-01 01:00:00+00:00  0820        THEFT            CTA TRAIN     False   
3 2016-01-01 01:00:00+00:00  0486      BATTERY            APARTMENT      True   
4 2016-01-01 01:00:00+00:00  031A      ROBBERY               STREET     False   

   beat  district  ward  community_area   latitude  longitude  
0  1711        17    39              13  41.985557 -87.713835  
1  1923        19    44               6  41.949534 -87.661660  
2  1114        11    28              26  41.885488 -87.726422  
3   621         6    17              71  41.752400 -87.648155  
4  2223        22    21              73  41.722910 -87.637197  


In [6]:
# Convert the datetime to date column
for year in crime:
  year['date'] = pd.to_datetime(year['date']).dt.date

# Check date for missing data
for year in crime:
  print(year['date'].isna().any())
# - No missing data

False


In [None]:
crime[0]['date'][0] = '2016-01-01'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
crime[0].head()

Unnamed: 0,date,iucr,primary_type,location_description,domestic,beat,district,ward,community_area,latitude,longitude,location
0,2016-01-01,890,THEFT,RESIDENCE,False,1711,17,39,13,41.985557,-87.713835,"(41.985556978, -87.713834875)"
1,2016-01-01,810,THEFT,APARTMENT,False,1925,19,44,6,41.941525,-87.63965,"(41.941524519, -87.639649996)"
2,2016-01-01,486,BATTERY,RESIDENCE,True,2525,25,30,20,41.918042,-87.729027,"(41.918042456, -87.729027375)"
3,2016-01-01,1320,CRIMINAL DAMAGE,STREET,False,2533,25,37,25,41.91047,-87.751597,"(41.910469677, -87.751597381)"
4,2016-01-01,820,THEFT,CTA TRAIN,False,1114,11,28,26,41.885488,-87.726422,"(41.885487535, -87.726422045)"


In [7]:
# Check iucr and primary_type for missing data
index = 0
print('Missing Data:')
for year in crime:
  print(2016 + index, ':')
  print('iucr: ', year['iucr'].isna().any())
  print('primary_type: ', year['primary_type'].isna().any())
  index += 1
  print()
# - No missing data


# Check iucr and primary_type for unique records
index = 0
print('Unique Data:')
for year in crime:
  print(2016 + index, ':')
  print('iucr: ', len(year['iucr'].unique()))
  print('primary_type: ', len(year['primary_type'].unique()))
  index += 1
  print()
# - I believe primary_type would be more useful as it has fewer categories and looks at crime types from a higher level.
# - I think iucr's 300+ categories could dilute the data too much for Linear Regression, however it may prove useful to the DNN?
# - Have two datasets with only one in each?

Missing Data:
2016 :
iucr:  False
primary_type:  False

Unique Data:
2016 :
iucr:  326
primary_type:  34



In [12]:
crime[0]['counts'] = crime[0].location_description.map(crime[0].location_description.value_counts())
print(crime[0])

              date  iucr     primary_type location_description  domestic  \
0       2016-01-01  0820            THEFT            RESIDENCE     False   
1       2016-01-01  0460          BATTERY            CTA TRAIN     False   
2       2016-01-01  0870            THEFT  TAVERN/LIQUOR STORE     False   
3       2016-01-01  1320  CRIMINAL DAMAGE               STREET     False   
4       2016-01-01  1562      SEX OFFENSE            APARTMENT      True   
...            ...   ...              ...                  ...       ...   
269701  2016-12-31  2820    OTHER OFFENSE            RESIDENCE     False   
269702  2016-12-31  0810            THEFT            CTA TRAIN     False   
269703  2016-12-31  0486          BATTERY            APARTMENT      True   
269704  2016-12-31  041A          BATTERY    CONVENIENCE STORE     False   
269705  2016-12-31  0460          BATTERY        BAR OR TAVERN     False   

        beat  district  ward  community_area   latitude  longitude  \
0       1921     

In [None]:
# Save crime data to CSV files.

names = ['bq_2016', 'bq_2017', 'bq_2018', 'bq_2019', 'bq_2020']

# for index in range(len(names)):
#   with open('/gdrive/My Drive/assignment/data/crime/{}.csv'.format(names[index]), 'w') as f:
#     crime[index].to_csv(f)

for index in range(len(names)):
  crime[index].to_csv('{}.csv'.format(names[index])) 
  files.download('{}.csv'.format(names[index]))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>