<a href="https://colab.research.google.com/github/MattIzon/16010269_DataAnalytics/blob/main/1_Crime_Aquisition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# This redirects to get an auth key from your google account. Ctrl + v it into the box and hit enter.
# It sometimes fails on first run then authenticates immediately on second click?

from google.colab import auth, files
auth.authenticate_user()
print('Authenticated')

from google.cloud import bigquery
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Authenticated
Mounted at /gdrive


In [5]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
# This uses big query to get the crime data

client = bigquery.Client(project='assignment-305921')

crime = []

for year in range(2016, 2021):
  crime.append(client.query('''
  SELECT *
  FROM `bigquery-public-data.chicago_crime.crime`
  WHERE year = {}
  ORDER BY date
  '''.format(year)).to_dataframe())

In [7]:
# inspect the data and decide:
# a) Which columns may be useful
# b) Which columns need prep before they can be used - missing data, wrong format etc

crime[0].head()

# At first glance: unique_key, case_number, block, description, arrest, fbi_code, x_coordinate, y_coordinate, year, updated_on dont appear to be useful.

# date requires reformatting

# Types of crime could increase on certain days.
# iucr seems to be a reduction of primary_type and description into a single field.
# This needs investigating and decision made as to which type of categorisation is more useful.

# location_description could show crime increasing in certain locations on given days

# domestic crime could increase on weekends which would impact crime rates on particular days

# beat, district, ward and community_area provide different levels of location boundary to a crime. 
# Crime patterns could emerge in specific locals. A choice of boundary level needs to be made.

# latitude and longitude / location provide a means of connection with weather data.
# location is a reduction of lat / lon. If data is missing one may be used to fill the other.


Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,10597322,HZ348532,2016-01-01 01:00:00+00:00,057XX N KIMBALL AVE,890,THEFT,FROM BUILDING,RESIDENCE,False,False,1711,17,39,13,06,1152676.0,1937936.0,2016,2018-02-10 03:50:01+00:00,41.985557,-87.713835,"(41.985556978, -87.713834875)"
1,10629384,HZ382257,2016-01-01 01:00:00+00:00,032XX N LAKE SHORE DR,810,THEFT,OVER $500,APARTMENT,False,False,1925,19,44,6,06,1172975.0,1922045.0,2016,2018-02-10 03:50:01+00:00,41.941525,-87.63965,"(41.941524519, -87.639649996)"
2,10364686,HZ100059,2016-01-01 01:00:00+00:00,020XX N KARLOV AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,2525,25,30,20,08B,1148719.0,1913304.0,2016,2018-02-10 03:50:01+00:00,41.918042,-87.729027,"(41.918042456, -87.729027375)"
3,10364941,HZ100419,2016-01-01 01:00:00+00:00,050XX W CONCORD PL,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,2533,25,37,25,14,1142595.0,1910502.0,2016,2018-02-10 03:50:01+00:00,41.91047,-87.751597,"(41.910469677, -87.751597381)"
4,10365294,HZ100944,2016-01-01 01:00:00+00:00,040XX W LAKE ST,820,THEFT,$500 AND UNDER,CTA TRAIN,False,False,1114,11,28,26,06,1149512.0,1901446.0,2016,2018-02-10 03:50:01+00:00,41.885488,-87.726422,"(41.885487535, -87.726422045)"


In [10]:
# Immediately drop the unnecessary fields
no_use = ['unique_key', 'case_number', 'block', 'description', 'arrest', 'fbi_code', 'x_coordinate', 'y_coordinate', 'year', 'updated_on']

for year in crime:
  year.drop(no_use, axis=1, inplace=True)

In [23]:
# Convert the datetime to date column

for year in crime:
  year['date'] = pd.to_datetime(year['date']).dt.date

# Check for missing data
for year in crime:
  print(year['date'].isna().any())

False
False
False
False
False


In [22]:
crime[0]['date'][0] = '2016-01-01'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
crime[0].head()

Unnamed: 0,date,iucr,primary_type,location_description,domestic,beat,district,ward,community_area,latitude,longitude,location
0,2016-01-01,890,THEFT,RESIDENCE,False,1711,17,39,13,41.985557,-87.713835,"(41.985556978, -87.713834875)"
1,2016-01-01,810,THEFT,APARTMENT,False,1925,19,44,6,41.941525,-87.63965,"(41.941524519, -87.639649996)"
2,2016-01-01,486,BATTERY,RESIDENCE,True,2525,25,30,20,41.918042,-87.729027,"(41.918042456, -87.729027375)"
3,2016-01-01,1320,CRIMINAL DAMAGE,STREET,False,2533,25,37,25,41.91047,-87.751597,"(41.910469677, -87.751597381)"
4,2016-01-01,820,THEFT,CTA TRAIN,False,1114,11,28,26,41.885488,-87.726422,"(41.885487535, -87.726422045)"


In [None]:
# Save crime data to CSV files.

names = ['bq_2016', 'bq_2017', 'bq_2018', 'bq_2019', 'bq_2020']

# for index in range(len(names)):
#   with open('/gdrive/My Drive/assignment/data/crime/{}.csv'.format(names[index]), 'w') as f:
#     crime[index].to_csv(f)

for index in range(len(names)):
  crime[index].to_csv('{}.csv'.format(names[index])) 
  files.download('{}.csv'.format(names[index]))
