In [81]:
# imports
# `os` must be imported before PATH is built: the original imported it only
# after calling os.path.join, which raises NameError on a fresh kernel.
import os
import sys

# Append a virtualenv's site-packages directory (located relative to the
# current working directory) to the import path before loading the
# third-party packages below.
# NOTE(review): the virtualenv name and Python version are hardcoded, so this
# only resolves on a machine with that exact layout - confirm it is still needed.
PATH = os.path.join(os.getcwd(), '..', '..', '.local', 'share', 'virtualenvs', 'cityspire-ds-h-NIlzhGdy', 'lib', 'python3.8', 'site-packages')
if PATH not in sys.path:
    # Guarded so re-running the cell does not pile up duplicate entries.
    sys.path.append(PATH)

import pandas as pd
import sqlalchemy

In [152]:
# reading in the data
# crime_data_dict maps a year string ('2010' ... '2019') to the DataFrame read
# from ../raw_data/city_crime_data_<year>.csv.
crime_data_dict = {}

# Number of trailing lines to drop from each year's file (passed to skipfooter).
rows_to_skip_dict = {'2010': 7,
                    '2011': 7,
                    '2012': 7,
                    '2013': 10,
                    '2014': 9,
                    '2015': 10,
                    '2016': 9,
                    '2017': 10,
                    '2018': 10,
                    '2019': 8}

# Column names applied to every year's frame; assigning them raises if a file
# does not have exactly this many columns.
crime_column_names = ['state_name', 'city_name', 'violent_crime', 'murder_and_nonnegligent_homicide',
                      'rape', 'robbery', 'aggravated_assault', 'property_crime', 'burglary',
                      'larceny_theft', 'motor_vehicle_theft', 'arson']

for year, footer_rows in rows_to_skip_dict.items():
    FILE_PATH = os.path.join(os.getcwd(), '..', 'raw_data', f'city_crime_data_{year}.csv')
    # skipfooter is not supported by the default C parser; requesting the
    # python engine explicitly avoids the ParserWarning fallback on every file.
    crime_data_dict[year] = pd.read_csv(FILE_PATH, skiprows=3, skipfooter=footer_rows, engine='python')
    crime_data_dict[year].columns = crime_column_names

In [153]:
# reading in data to get city_id in order to merge it in
DATABASE_URL = os.getenv('PRODUCTION_DATABASE_URL')
if not DATABASE_URL:
    # Fail early with a clear message instead of handing None to pd.read_sql.
    raise RuntimeError('PRODUCTION_DATABASE_URL is not set')
query = '''
        SELECT Cities.city_id, RTRIM(Cities.city_name), RTRIM(States.state_name)
        FROM CITIES
        LEFT JOIN STATES ON CITIES.state_id=STATES.state_id
        '''
cities = pd.read_sql(query, DATABASE_URL)
cities.columns = ['city_id', 'city_name', 'state_name']


def _strip_digits(value):
    """Return ``str(value)`` with every digit character removed."""
    return ''.join(char for char in str(value) if not char.isdigit())


def _parse_count(value):
    """Convert one raw crime-count cell to a float, or to the 'n/a' sentinel.

    Missing values and the literal string 'n/a' map to 'n/a'; anything else is
    stringified, stripped of thousands separators (',') and parsed as float.
    """
    if pd.isna(value):
        return 'n/a'
    text = str(value)
    return text if text == 'n/a' else float(text.replace(',', ''))


def clean_crime_frame(raw, cities):
    """Clean one year's crime frame, attach city_id and add a 'total' column.

    Parameters
    ----------
    raw : DataFrame whose first two columns are 'state_name' and 'city_name'
        and whose remaining columns hold crime counts.
    cities : DataFrame with columns 'city_id', 'city_name', 'state_name'.

    Returns
    -------
    A new DataFrame (``raw`` is not modified) holding the crime columns
    (floats, or 'n/a' where the count was missing), 'city_id' and 'total'.
    Rows without a matching city_id are dropped.
    """
    frame = raw.copy()

    # data cleaning
    # state_name is forward-filled, so a blank cell inherits the last state seen.
    # NOTE(review): str.capitalize() lowercases everything after the first
    # character, so a multi-word name such as 'NEW YORK' becomes 'New york'.
    # Confirm this matches the casing stored in States.state_name; otherwise
    # those rows get no city_id below and are dropped.
    frame['state_name'] = frame['state_name'].ffill().map(_strip_digits).map(str.capitalize)
    frame['city_name'] = frame['city_name'].map(_strip_digits)

    crime_columns = list(frame.columns[2:])
    for column in crime_columns:
        frame[column] = frame[column].map(_parse_count)

    # merging in city_id
    merged = (
        frame.merge(cities, on=['city_name', 'state_name'], how='left')
        .drop(columns=['city_name', 'state_name'])
        # Crime columns use the 'n/a' sentinel, so city_id is the column that
        # decides whether a row is kept.
        .dropna(subset=['city_id'])
        .reset_index(drop=True)
    )

    # engineering feature that gives total
    # Row-wise sum over every crime column, skipping 'n/a' cells. Done as one
    # vectorised sum instead of `frame['total'][i] += ...`: that chained
    # assignment does not write back to the frame under pandas Copy-on-Write.
    # NOTE(review): the summed columns include violent_crime and property_crime
    # alongside the individual offence columns; if those two are already
    # aggregates of the others, total counts each offence twice - confirm intended.
    numeric_counts = merged[crime_columns].apply(pd.to_numeric, errors='coerce')
    merged['total'] = numeric_counts.sum(axis=1)
    return merged


for key, yearly_frame in crime_data_dict.items():
    if 'city_id' in yearly_frame.columns:
        # Already cleaned by an earlier run of this cell; the name columns are
        # gone, so cleaning again would raise KeyError.
        continue
    crime_data_dict[key] = clean_crime_frame(yearly_frame, cities)

In [155]:
# displaying the 2018 frame from crime_data_dict
# (the name `df` is reused by the display cell further down)
df = crime_data_dict['2018']
df

Unnamed: 0,violent_crime,murder_and_nonnegligent_homicide,rape,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson,city_id,total
0,18,0,2,0,16,49,14,33,2,,0100124,134.0
1,19,0,1,4,14,289,42,230,17,,0100460,616.0
2,92,0,2,10,80,579,56,497,26,,0100820,1342.0
3,24,0,6,10,8,802,194,492,116,,0100988,1652.0
4,314,2,5,15,292,610,92,484,34,,0101132,1848.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6048,17,1,2,0,14,313,39,261,13,1,5669845,661.0
6049,13,0,0,0,13,64,10,48,6,0,5676515,154.0
6050,35,0,0,0,35,73,12,57,4,0,5677530,216.0
6051,3,0,0,0,3,56,19,33,4,0,5683040,118.0


In [145]:
# NOTE(review): stale cell - its execution count (145) is lower than the cells
# above, and the saved output below (total all 0.0, gaps in the index) does not
# match what the current pipeline produces. Re-run after Restart & Run All, or remove.
df

Unnamed: 0,violent_crime,murder_and_nonnegligent_homicide,rape,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson,city_id,total
0,114,4,15,27,68,1922,128,1694,100,2,0135896,0.0
2,130,1,47,3,79,132,20,84,28,12,0206520,0.0
4,0,0,0,0,0,7,1,6,0,0,0217410,0.0
5,7,0,0,0,7,20,5,12,3,0,0217740,0.0
6,49,1,3,1,44,58,11,37,10,0,0218950,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8103,9,0,4,0,5,369,75,278,16,3,5669845,0.0
8104,13,0,0,0,13,34,7,22,5,0,5676515,0.0
8105,13,0,4,1,8,48,8,40,0,0,5677530,0.0
8106,7,0,1,0,6,72,24,45,3,0,5683040,0.0
