## 1. Import dependencies

In [1]:
import pandas as pd
# !pip install openpyxl
import openpyxl
from sqlalchemy import create_engine




## 2. Retrieve Data from the excel file

In [2]:
excel_file = 'Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2022.xlsx'
all_sheets = pd.read_excel(excel_file, sheet_name=None)
sheets = all_sheets.keys()

for sheet_name in sheets:
    sheet = pd.read_excel(excel_file, sheet_name=sheet_name)
    sheet.to_csv("%s.csv" % sheet_name, index=False)

Contents	
Table 01

* Criminal incidents and rate per 100,000 population by police region and local government area - April 2012 to March 2022

Table 02	

* Criminal incidents and rate per 100,000 population by principal offence, local government area and police service area - April 2012 to March 2022

Table 03	

* Criminal incidents by principal offence, local government area and postcode or suburb/town - April 2012 to March 2022

Table 04	

* Criminal incidents by location type and local government area - April 2012 to March 2022

Table 05	

* Criminal incidents by charge status and local government area - April 2012 to March 2022
	
### Data exclusions	
Table 01 

* excludes criminal incidents where the geographic location is unknown. For further information of these geographic locations please refer to the Explanatory Notes and Glossary on the website.	

Tables 02 - 05 

* excludes criminal incidents at Justice institutions and immigration facilities, Unincorporated Victoria and where the geographic location is unknown. For further information of these geographic locations please refer to the Explanatory Notes and Glossary on the website.	
	
Notes	
Recorded crime statistics are based on data extracted by Victoria police on the 18th day after the reference period, and are subject to movement between releases. For more information about how statistics are compiled, refer to the Explanatory notes on the CSA website.	
In order to maintain confidentiality, sensitive offence counts for subdivisions 'A10 Homicide and related offences' and 'A30 Sexual offences' with a value of 3 or less are given a value of 2 to calculate totals.	
Rates are based on populations provided by the Australian Bureau of Statistics (ABS). The most recent year of data was not available from the ABS in time for it to be used to calculate current year rates. The CSA uses estimates created by the Victorian State Government 'Victoria in Futures' report. 	
This work is licenced under a Creative Commons Attribution 4.0 International License. When reporting this data you must attribute the Crime Statistics Agency (or CSA) as the source.

## Loop thru excel file to convert various worksheets to csv files

In [3]:
#Create list of dataframes from csv files
file_list = []
for i in range(1,7):
    file_list.append(f"Table 0{i}.csv")

df_list = [pd.read_csv(file) for file in file_list]
file_list

['Table 01.csv',
 'Table 02.csv',
 'Table 03.csv',
 'Table 04.csv',
 'Table 05.csv',
 'Table 06.csv']

Create police_region_df from Table1 which is high level data, starts from Police Region
(PR, a police regions have a one to many relationshgip with Police Service Area and local Government Areas )

In [4]:
# Exract rows that are greater than year 2018 (use 4 years data, 2019-2022) 
# and drop column 'Year ending' as the crime tables are created on March (no date details) every year.

police_region_df = df_list[0].loc[df_list[0].Year > 2018].drop('Year ending',axis=1)
police_region_df["Local Government Area"] = police_region_df["Local Government Area"].str.strip()

# Drop Total(summary) rows
print(f"Before Total row drop: {police_region_df.shape}")
police_region_df = police_region_df.drop(police_region_df[(police_region_df['Local Government Area'] == 'Total')].index)
print(f"After Total row drop: {police_region_df.shape}")
police_region_df.head()


Before Total row drop: (348, 5)
After Total row drop: (324, 5)


Unnamed: 0,Year,Police Region,Local Government Area,Incidents Recorded,"Rate per 100,000 population"
0,2022,1 North West Metro,Banyule,5244,4086.865525
1,2022,1 North West Metro,Brimbank,12330,6200.95948
2,2022,1 North West Metro,Darebin,9276,5751.475727
3,2022,1 North West Metro,Hobsons Bay,4599,4804.896128
4,2022,1 North West Metro,Hume,12196,4919.684615


In [11]:
# police_region_df.dtypes

Create police_division_df from Table2 which is in Police Service Area (PSA) and Local Government Area (LGA), 
breaks down to different crime/offence and subtype types

In [5]:
# Due to Heroku free data capacity allowance, we use data row of year 2022.
police_division_df = df_list[1].loc[df_list[1].Year > 2021].drop('Year ending',axis=1)
print(f"table shape: {police_division_df.shape}")
police_division_df.head()

table shape: (5017, 9)


Unnamed: 0,Year,Police Service Area,Local Government Area,Offence Division,Offence Subdivision,Offence Subgroup,Incidents Recorded,"PSA Rate per 100,000 population","LGA Rate per 100,000 population"
0,2022,Ballarat,Ballarat,A Crimes against the person,A10 Homicide and related offences,A10 Homicide and related offences,5,4.059101,4.328089
1,2022,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A211 FV Serious assault,120,97.418429,103.874128
2,2022,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A212 Non-FV Serious assault,147,119.337575,127.245807
3,2022,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,"A22 Assault police, emergency services or othe...",53,43.026473,45.87774
4,2022,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A231 FV Common assault,222,180.224093,192.167137


In [13]:
# police_division_df.dtypes 

Merged Police Region column on Local Government Area Column to create a more complete data
set to enable consisent filtering for data visualisati0n 

In [6]:
police_division_df=pd.merge(police_division_df, police_region_df[["Police Region","Local Government Area"]],
          how="left", on="Local Government Area")
police_division_df=police_division_df[police_division_df["Year"]> 2021]
print(f"table shape: {police_division_df.shape}")
police_division_df.head()     

table shape: (20068, 10)


Unnamed: 0,Year,Police Service Area,Local Government Area,Offence Division,Offence Subdivision,Offence Subgroup,Incidents Recorded,"PSA Rate per 100,000 population","LGA Rate per 100,000 population",Police Region
0,2022,Ballarat,Ballarat,A Crimes against the person,A10 Homicide and related offences,A10 Homicide and related offences,5,4.059101,4.328089,4 Western
1,2022,Ballarat,Ballarat,A Crimes against the person,A10 Homicide and related offences,A10 Homicide and related offences,5,4.059101,4.328089,4 Western
2,2022,Ballarat,Ballarat,A Crimes against the person,A10 Homicide and related offences,A10 Homicide and related offences,5,4.059101,4.328089,4 Western
3,2022,Ballarat,Ballarat,A Crimes against the person,A10 Homicide and related offences,A10 Homicide and related offences,5,4.059101,4.328089,4 Western
4,2022,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A211 FV Serious assault,120,97.418429,103.874128,4 Western


In [7]:
# remove pre 2021 data thst was added when the Polie Region column was added 
police_division_df = df_list[1].loc[df_list[1].Year > 2021]
print(f"table shape: {police_division_df.shape}")
police_division_df.head() 

table shape: (5017, 10)


Unnamed: 0,Year,Year ending,Police Service Area,Local Government Area,Offence Division,Offence Subdivision,Offence Subgroup,Incidents Recorded,"PSA Rate per 100,000 population","LGA Rate per 100,000 population"
0,2022,March,Ballarat,Ballarat,A Crimes against the person,A10 Homicide and related offences,A10 Homicide and related offences,5,4.059101,4.328089
1,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A211 FV Serious assault,120,97.418429,103.874128
2,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A212 Non-FV Serious assault,147,119.337575,127.245807
3,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,"A22 Assault police, emergency services or othe...",53,43.026473,45.87774
4,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A231 FV Common assault,222,180.224093,192.167137


Create investigation_status_df from Table5 which is in Local Government Region with 4 Investigation Status

In [8]:
investigation_status_df = df_list[4].loc[df_list[4].Year > 2018].drop('Year ending',axis=1)
print(f"table shape: {investigation_status_df.shape}")
investigation_status_df.head()

table shape: (948, 4)


Unnamed: 0,Year,Local Government Area,Charge Status,Incidents Recorded
0,2022,Alpine,Charges laid,149
1,2022,Alpine,No charges laid,114
2,2022,Alpine,Unsolved,108
3,2022,Ararat,Charges laid,467
4,2022,Ararat,No charges laid,217


In [9]:
investigation_status_df.dtypes 

Year                      int64
Local Government Area    object
Charge Status            object
Incidents Recorded        int64
dtype: object

Create an Offence_division_summary_df which summarises the offence division category in Table 02 by LGA for 2019-2022 - this will enable trending of incicents over the 4 years by this category. More granualr exploration by other subsets is not possible due to limitations around size and number of rows in Heroku 

In [10]:
offence_division_summary_df = df_list[5]
#     .drop('Year ending',axis=1)
offence_division_summary_df.head()

Unnamed: 0,Row Labels,A Crimes against the person,B Property and deception offences,C Drug offences,D Public order and security offences,E Justice procedures offences,F Other offences
0,Alpine,395,668,87,165,225,68
1,Ararat,873,1498,276,309,805,93
2,Ballarat,4882,19408,743,1260,4832,497
3,Banyule,3740,14155,1078,1043,5265,517
4,Bass Coast,1911,3793,297,378,1282,151


In [11]:
offence_division_summary_df.dtypes 

Row Labels                              object
A Crimes against the person              int64
B Property and deception offences        int64
C Drug offences                          int64
D Public order and security offences     int64
E Justice procedures offences            int64
F Other offences                         int64
dtype: object

In [12]:
#Convert police_region_df to csv
police_region_df.to_csv("police_region.csv")

In [13]:
csv_file = "police_region.csv"
police_region_df = pd.read_csv(csv_file)
police_region_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Police Region,Local Government Area,Incidents Recorded,"Rate per 100,000 population"
0,0,2022,1 North West Metro,Banyule,5244,4086.865525
1,1,2022,1 North West Metro,Brimbank,12330,6200.95948
2,2,2022,1 North West Metro,Darebin,9276,5751.475727
3,3,2022,1 North West Metro,Hobsons Bay,4599,4804.896128
4,4,2022,1 North West Metro,Hume,12196,4919.684615


In [14]:
#Convert police_division_df to csv
police_division_df.to_csv("police_division.csv")

In [15]:
csv_file = "police_division.csv"
police_division_df = pd.read_csv(csv_file)
police_division_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Year ending,Police Service Area,Local Government Area,Offence Division,Offence Subdivision,Offence Subgroup,Incidents Recorded,"PSA Rate per 100,000 population","LGA Rate per 100,000 population"
0,0,2022,March,Ballarat,Ballarat,A Crimes against the person,A10 Homicide and related offences,A10 Homicide and related offences,5,4.059101,4.328089
1,1,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A211 FV Serious assault,120,97.418429,103.874128
2,2,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A212 Non-FV Serious assault,147,119.337575,127.245807
3,3,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,"A22 Assault police, emergency services or othe...",53,43.026473,45.87774
4,4,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A231 FV Common assault,222,180.224093,192.167137


In [33]:
investigation_status_df.to_csv("investigation_status.csv")

In [34]:
csv_file = "investigation_status.csv"
investigation_status_df = pd.read_csv(csv_file)
investigation_status_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Year,Local Government Area,Charge Status,Incidents Recorded
0,0,0,2022,Alpine,Charges laid,149
1,1,1,2022,Alpine,No charges laid,114
2,2,2,2022,Alpine,Unsolved,108
3,3,3,2022,Ararat,Charges laid,467
4,4,4,2022,Ararat,No charges laid,217


In [35]:
offence_division_summary_df.to_csv("offence_division_summary.csv")

In [36]:
csv_file = "offence_division_summary.csv"
offence_division_summary_df = pd.read_csv(csv_file)
offence_division_summary_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Row Labels,A Crimes against the person,B Property and deception offences,C Drug offences,D Public order and security offences,E Justice procedures offences,F Other offences
0,0,0,Alpine,395,668,87,165,225,68
1,1,1,Ararat,873,1498,276,309,805,93
2,2,2,Ballarat,4882,19408,743,1260,4832,497
3,3,3,Banyule,3740,14155,1078,1043,5265,517
4,4,4,Bass Coast,1911,3793,297,378,1282,151


In [37]:
from sqlalchemy import create_engine
from db_conn import DB_conn


In [38]:
#connect to local database
# rds_connection_string = "postgres:Fedelma22!@localhost:5432/Project3"
engine = create_engine (DB_conn)
# DB_conn = 'postgresql://dzvvuadeabjykv:52e20e27a11369714177a77897b26b72f75c29f0320ee9b3114a7b21c5a4edd4@ec2-34-199-68-114.compute-1.amazonaws.com:5432/ddthf1b3u3h6eu'

In [39]:
engine.table_names()

  """Entry point for launching an IPython kernel.


['INVESTGATIONSTATUS', 'CHARGESTATUS', 'DIVISIONSUMMARY', 'REGION', 'DIVISION']

In [40]:
police_region_df.to_sql(name='REGION', con=engine, if_exists='replace', index=False)

In [41]:
pd.read_sql_query('select * from "REGION"', con=engine).head()

Unnamed: 0.1,Unnamed: 0,Year,Police Region,Local Government Area,Incidents Recorded,"Rate per 100,000 population"
0,0,2022,1 North West Metro,Banyule,5244,4086.865525
1,1,2022,1 North West Metro,Brimbank,12330,6200.95948
2,2,2022,1 North West Metro,Darebin,9276,5751.475727
3,3,2022,1 North West Metro,Hobsons Bay,4599,4804.896128
4,4,2022,1 North West Metro,Hume,12196,4919.684615


In [29]:
police_division_df.to_sql(name='DIVISION', con=engine, if_exists='replace', index=False)

In [42]:
pd.read_sql_query('select * from "DIVISION"', con=engine).head()

Unnamed: 0.1,Unnamed: 0,Year,Year ending,Police Service Area,Local Government Area,Offence Division,Offence Subdivision,Offence Subgroup,Incidents Recorded,"PSA Rate per 100,000 population","LGA Rate per 100,000 population"
0,0,2022,March,Ballarat,Ballarat,A Crimes against the person,A10 Homicide and related offences,A10 Homicide and related offences,5,4.059101,4.328089
1,1,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A211 FV Serious assault,120,97.418429,103.874128
2,2,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A212 Non-FV Serious assault,147,119.337575,127.245807
3,3,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,"A22 Assault police, emergency services or othe...",53,43.026473,45.87774
4,4,2022,March,Ballarat,Ballarat,A Crimes against the person,A20 Assault and related offences,A231 FV Common assault,222,180.224093,192.167137


In [23]:
charge_status_df.to_sql(name='CHARGESTATUS', con=engine, if_exists='replace', index=False)

In [43]:
pd.read_sql_query('select * from "CHARGESTATUS"', con=engine).head()

Unnamed: 0.1,Unnamed: 0,Year,Local Government Area,Charge Status,Incidents Recorded
0,0,2022,Alpine,Charges laid,149
1,1,2022,Alpine,No charges laid,114
2,2,2022,Alpine,Unsolved,108
3,3,2022,Ararat,Charges laid,467
4,4,2022,Ararat,No charges laid,217


In [58]:
offence_division_summary_df.to_sql(name='DIVISIONSUMMARY', con=engine, if_exists='replace', index=False)

In [44]:
pd.read_sql_query('select * from "DIVISIONSUMMARY"', con=engine).head()

Unnamed: 0.1,Unnamed: 0,"Year,Local Government Area,A Crimes against the person,B Proper"
0,0,"2019,Alpine,114,161,26.0,48,50,0,"
1,1,"2019,Ararat,223,376,65.0,81,165,2,"
2,2,"2019,Ballarat,1372,5061,204.0,372,1105,15,"
3,3,"2019,Banyule,957,4063,223.0,309,1442,11,"
4,4,"2019,Bass Coast,376,992,78.0,119,216,2,"
