In [1]:
# imports
# `os` must be imported before PATH is built below, because PATH is
# assembled with os.path.join / os.getcwd.
import os
import sys

# Location of a virtualenv's site-packages directory, resolved relative to
# the notebook's working directory, so its packages can be imported here.
PATH = os.path.join(os.getcwd(), '..', '..', '.local', 'share', 'virtualenvs', 'cityspire-ds-h-NIlzhGdy', 'lib', 'python3.8', 'site-packages')
# Guarded so that re-running this cell does not keep growing sys.path.
if PATH not in sys.path:
    sys.path.append(PATH)

import pandas as pd
import sqlalchemy

In [22]:
# reading in the data
# Maps each year (as a string) to the number of trailing lines that are
# dropped from that year's CSV; the value is passed to read_csv as
# `skipfooter`.
rows_to_skip_dict = {'2010': 7,
                    '2011': 7,
                    '2012': 7,
                    '2013': 10,
                    '2014': 9,
                    '2015': 10,
                    '2016': 9,
                    '2017': 10,
                    '2018': 10,
                    '2019': 8}

# Column names applied to every yearly frame after loading.
CRIME_COLUMNS = ['state_name', 'city_name', 'violent_crime', 'murder_and_nonnegligent_homicide',
                 'rape', 'robbery', 'aggravated_assault', 'property_crime', 'burglary',
                 'larceny_theft', 'motor_vehicle_theft', 'arson']

# year (str) -> DataFrame holding that year's CSV contents
crime_data_dict = {}
for year in map(str, range(2010, 2020)):
    FILE_PATH = os.path.join(os.getcwd(), '..', 'raw_data', f'city_crime_data_{year}.csv')
    # skipfooter is only supported by the python parser; naming the engine
    # explicitly avoids the ParserWarning and silent fallback that the
    # default C engine produces on every read.
    crime_data_dict[year] = pd.read_csv(
        FILE_PATH,
        skiprows=3,
        skipfooter=rows_to_skip_dict[year],
        engine='python',
    )
    crime_data_dict[year].columns = CRIME_COLUMNS

In [23]:
# reading in data to get city_id in order to merge it in
DATABASE_URL = os.getenv('PRODUCTION_DATABASE_URL')
if not DATABASE_URL:
    # Fail early with a readable message instead of an opaque error from
    # pd.read_sql when the environment variable is missing.
    raise RuntimeError('PRODUCTION_DATABASE_URL environment variable is not set')
query = '''
        SELECT Cities.city_id, RTRIM(Cities.city_name), RTRIM(States.state_name)
        FROM CITIES
        LEFT JOIN STATES ON CITIES.state_id=STATES.state_id
        '''
cities = pd.read_sql(query, DATABASE_URL)
cities.columns = ['city_id', 'city_name', 'state_name']


def _strip_digits(text):
    """Return ``text`` with every digit character removed."""
    return ''.join(letter for letter in text if not letter.isdigit())


def _to_number(cell):
    """Convert one crime-count cell to ``float``.

    The cell is stringified and thousands separators (',') are removed before
    conversion. The placeholder ``'n/a'`` is returned unchanged.
    """
    text = str(cell)
    return float(text.replace(',', '')) if text != 'n/a' else text


def _tidy_year(raw, year, cities):
    """Turn one year's wide crime table into long ``city_id/year/type/value`` rows.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame whose first two columns are ``state_name`` and ``city_name`` and
        whose remaining columns are crime counts. It is not modified.
    year : str
        Value written to the ``year`` column of every output row.
    cities : pandas.DataFrame
        Lookup table with ``city_id``, ``city_name`` and ``state_name``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``city_id``, ``year``, ``type`` and
        ``value``. Rows without a matching ``city_id`` are dropped. An extra
        ``total`` type holds the per-city sum of all crime columns.
    """
    value_columns = list(raw.columns[2:])

    # data cleaning
    # NOTE(review): capitalize() lowercases everything after the first
    # character ('NEW YORK' -> 'New york'); confirm this matches how
    # multi-word state names are stored in the database, otherwise those rows
    # never merge and are dropped below.
    wide = raw.assign(
        state_name=raw['state_name'].ffill().apply(_strip_digits).str.capitalize(),
        city_name=raw['city_name'].apply(lambda name: _strip_digits(str(name))),
    ).fillna('n/a')
    for column in value_columns:
        wide[column] = wide[column].apply(_to_number)

    # merging in city_id; after fillna above, the only NaNs left for dropna()
    # to act on are the city_id values of rows that found no match
    wide = (
        wide.merge(cities, on=['city_name', 'state_name'], how='left')
        .drop(['city_name', 'state_name'], axis=1)
        .dropna()
        .reset_index(drop=True)
    )

    # engineering feature that gives total
    # 'n/a' cells are coerced to NaN and skipped by sum(), i.e. they count as 0.
    # Computed as one vectorised row sum rather than element-wise chained
    # assignment, which does not update the frame under pandas Copy-on-Write.
    # NOTE(review): every crime column is summed, including 'violent_crime'
    # and 'property_crime'; if those are already aggregates of the other
    # columns the total double-counts -- confirm this is intended.
    numeric = wide[value_columns].apply(pd.to_numeric, errors='coerce')
    wide['total'] = numeric.sum(axis=1).astype(float)

    # adding feature for the year
    wide['year'] = str(year)

    # changing format of dataframe
    return wide.melt(id_vars=['city_id', 'year'], var_name='type', value_name='value')


yearly_frames = []
for key, df in list(crime_data_dict.items()):
    tidy = _tidy_year(df, key, cities)
    # The reshaped frame replaces the wide one in crime_data_dict, so the
    # loading cell has to be re-run before this cell can be executed again.
    crime_data_dict[key] = tidy
    yearly_frames.append(tidy)

# combining all of the dataframes into one (single concat instead of growing
# the frame inside the loop)
if yearly_frames:
    crime_data_raw = pd.concat(yearly_frames)
else:
    crime_data_raw = pd.DataFrame({'city_id': [], 'year': [], 'type': [], 'value': []})
    

In [24]:
# Display the combined long-format frame (columns: city_id, year, type, value).
crime_data_raw

Unnamed: 0,city_id,year,type,value
0,0100124,2010,violent_crime,21
1,0100460,2010,violent_crime,10
2,0100484,2010,violent_crime,0
3,0100820,2010,violent_crime,33
4,0100988,2010,violent_crime,81
...,...,...,...,...
61683,5669845,2019,total,759
61684,5676515,2019,total,94
61685,5677530,2019,total,122
61686,5683040,2019,total,158
