In [83]:
import pandas as pd
import glob
import os

# set the working directory to current one
os.chdir(os.getcwd().strip('notebooks'))


# set the path to load the crime files
# creates a files variable with glob, this will be used to load the files
path = "data/raw/crime/2022/csv/"

files = glob.glob(path + "/*.csv")

print(files)
# this list is to store the files as dataframes in a list
dataframes_list = []


# process of reading and storing
for i in range(len(files)):
    temp_df = pd.read_csv(files[i])
    dataframes_list.append(temp_df)

    
# new list to store cleaned dataframes
cleaned_dataframes = []

# cleaning the columns of the files since they all have similar columns
# since family incidents only records from 2018 we do the same for the rest
# remove the total columns as they seem irrelevant

for df in dataframes_list:
    df.dropna(inplace=True)
    df['Year'] = df["Year"].astype('int')
    df_new = df[df['Year'] >= 2018]
    df_new = df_new.loc[df['Local Government Area'] != 'Total']
    cleaned_dataframes.append(df_new)
    



['data/raw/crime/2022/csv/Data_Tables_LGA_Recorded_Offences_Year_Ending_March_2022.csv', 'data/raw/crime/2022/csv/Data_Tables_LGA_Alleged_Offenders_Year_Ending_March_2022.csv', 'data/raw/crime/2022/csv/Data_Tables_LGA_Criminal_Incidents_Year_Ending_March_2022.csv', 'data/raw/crime/2022/csv/Data_Tables_LGA_Victim_Reports_Year_Ending_March_2022.csv', 'data/raw/crime/2022/csv/Data_Tables_LGA_Family_Incidents_Year_Ending_March_2022.csv']


In [84]:
cleaned_dataframes

[     Year Year ending       Police Region Local Government Area Offence Count  \
 0    2022       March  1 North West Metro               Banyule         7,191   
 1    2022       March  1 North West Metro              Brimbank        16,839   
 2    2022       March  1 North West Metro               Darebin        12,424   
 3    2022       March  1 North West Metro           Hobsons Bay         5,884   
 4    2022       March  1 North West Metro                  Hume        17,870   
 ..    ...         ...                 ...                   ...           ...   
 425  2018       March           4 Western            Surf Coast         1,142   
 426  2018       March           4 Western             Swan Hill         2,466   
 427  2018       March           4 Western           Warrnambool         3,555   
 428  2018       March           4 Western          West Wimmera           161   
 429  2018       March           4 Western          Yarriambiack           538   
 
     Rate per 

In [85]:
# assign variable names to each dataframe then rename every rate per population column


alleged_offenders = cleaned_dataframes[0]

criminal_incidents = cleaned_dataframes[1]

family_incidents = cleaned_dataframes[2]

recorded_offences = cleaned_dataframes[3]

victim_reports = cleaned_dataframes[4]

alleged_offenders.rename(columns = {'Rate per 100,000 population':'per_100,000_alleged_off'}, inplace = True)

criminal_incidents.rename(columns = {'Rate per 100,000 population':'per_100,000_crim_incidents'}, inplace = True)

family_incidents.rename(columns = {'Rate per 100,000 population':'per_100,000_fam_incidents'}, inplace = True)

recorded_offences.rename(columns = {'Rate per 100,000 population':'per_100,000_recorded_off'}, inplace = True)

victim_reports.rename(columns = {'Rate per 100,000 population':'per_100,000_vic_reports'}, inplace = True)

In [92]:
# merge the dataframes into one big one


merged_dataframes_1 = pd.merge(pd.merge(alleged_offenders,criminal_incidents,on=['Year', 'Local Government Area'], 
                                        how = 'outer'), family_incidents, on=['Year', 'Local Government Area'], 
                                        how = 'outer', suffixes=('', '_y', '_x'))
                              

final_df = pd.merge(pd.merge(merged_dataframes_1,recorded_offences,on=['Year', 'Local Government Area'], 
                                        how = 'outer'), victim_reports, on=['Year', 'Local Government Area'], 
                                        how = 'outer', suffixes=('', '_y', '_x'))

final_df.drop(final_df.filter(regex='_y$').columns, axis=1, inplace=True)

final_df.drop(final_df.filter(regex='_x$').columns, axis=1, inplace=True)

final_df.rename(columns = {'Local Government Area':'LGA code'}, inplace = True)

final_df

  final_df = pd.merge(pd.merge(merged_dataframes_1,recorded_offences,on=['Year', 'Local Government Area'],


Unnamed: 0,Year,LGA code,Offence Count,"per_100,000_alleged_off",Alleged Offender Incidents,"per_100,000_crim_incidents",Incidents Recorded,"per_100,000_fam_incidents",Victim Reports,"per_100,000_recorded_off",Year ending,Police Region,Family Incidents,"per_100,000_vic_reports"
0,2022,Banyule,7191,5604.2,2547,1985.0,5244,4086.9,2826,2202.4,March,1 North West Metro,1185,923.5
1,2022,Brimbank,16839,8468.6,5062,2545.8,12330,6201.0,7056,3548.6,March,1 North West Metro,2776,1396.1
2,2022,Darebin,12424,7703.4,4084,2532.2,9276,5751.5,5197,3222.3,March,1 North West Metro,1610,998.3
3,2022,Hobsons Bay,5884,6147.4,2005,2094.8,4599,4804.9,2679,2798.9,March,1 North West Metro,1031,1077.2
4,2022,Hume,17870,7208.5,5915,2386.0,12196,4919.7,7056,2846.3,March,1 North West Metro,3850,1553.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,2018,Surf Coast,1142,3540.9,352,1091.4,952,2951.8,657,2037.1,March,4 Western,177,548.8
391,2018,Swan Hill,2466,11878.6,1078,5192.7,1752,8439.3,924,4450.9,March,4 Western,609,2933.5
392,2018,Warrnambool,3555,10197.1,1597,4580.8,2707,7764.7,1230,3528.1,March,4 Western,627,1798.5
393,2018,West Wimmera,161,4168.8,60,1553.6,114,2951.8,70,1812.5,March,4 Western,41,1061.6


In [93]:
# get the data columns
data_columns = final_df.columns

In [94]:
data_columns = list(data_columns)

In [95]:
# remove string columns from list
to_remove = ['LGA code', 'Police Region', 'Year ending']
for item in to_remove:
    data_columns.remove(item)

In [96]:
# iterate through remaining columns and convert to numerical
for item in data_columns:
    # convert to str first
    final_df[item] = final_df[item].astype(str)
    # remove comma and convert to float
    final_df[item]=final_df[item].str.replace(',','').astype(float)

In [97]:
final_df.iloc[:,[2,4,6,8,12]]

Unnamed: 0,Offence Count,Alleged Offender Incidents,Incidents Recorded,Victim Reports,Family Incidents
0,7191.0,2547.0,5244.0,2826.0,1185.0
1,16839.0,5062.0,12330.0,7056.0,2776.0
2,12424.0,4084.0,9276.0,5197.0,1610.0
3,5884.0,2005.0,4599.0,2679.0,1031.0
4,17870.0,5915.0,12196.0,7056.0,3850.0
...,...,...,...,...,...
390,1142.0,352.0,952.0,657.0,177.0
391,2466.0,1078.0,1752.0,924.0,609.0
392,3555.0,1597.0,2707.0,1230.0,627.0
393,161.0,60.0,114.0,70.0,41.0


In [98]:
# sum all the types of criminal offences into total
final_df['total crime number'] = final_df.iloc[:,[2,4,6,8,12]].sum(axis=1)

In [100]:
# save the dataframe

final_df.to_csv("data/curated/crime/LGA_crime_clean.csv")