In [1]:
import pandas as pd
import numpy as np
from time import time

In [2]:
dfvc_path = '../out/vc.csv'
dfdv_path = '../out/dv.csv'
event_path = '../data/event_data.csv'

In [3]:
df = pd.DataFrame.append(pd.read_csv(dfdv_path), pd.read_csv(dfvc_path), ignore_index=True ,sort=False)

In [4]:
# Event dimension
parse_dates = ['event_begin_time', 'event_end_time']
event_dimension_df = pd.read_csv(event_path, parse_dates=parse_dates)

# Give 0 for unknown size
event_dimension_df['event_location_size'] = event_dimension_df['event_location_size'].apply(lambda x: 0 if np.isnan(x) else x)

In [5]:
date_dimension_df = df[['crime_date', 'day_of_week', 'week_of_year', 'quarter', 'weekend',
                        'holiday', 'holiday_name']].drop_duplicates().reset_index(drop=True)
date_dimension_df['date_key'] = date_dimension_df.index + 1

location_dimension_df = df[['longitude', 'latitude', 'city', 'neighbourhood', 'address',
                        'crime_rate']].drop_duplicates().reset_index(drop=True)
location_dimension_df['location_key'] = location_dimension_df.index + 1

crime_dimension_df = df[['crime_category', 'crime_type', 'first_occurrence_time',
                         'last_occurrence_time', 'reported_time',
                         'crime_severity']].drop_duplicates().reset_index(drop=True)
crime_dimension_df['crime_key'] = crime_dimension_df.index + 1

weather_dimension_df = df[['temperature', 'weather_main'
                           , 'weather_description', 'humidity']].drop_duplicates().reset_index(drop=True)
weather_dimension_df['weather_key'] = weather_dimension_df.index + 1

In [6]:
# Crime
crime_dmsn_dict = {}
for ctgr, tp, fot, lot, rt, cs, ckey in zip(crime_dimension_df['crime_category'].to_list(), crime_dimension_df['crime_type'].to_list(),
                      crime_dimension_df['first_occurrence_time'].to_list(), crime_dimension_df['last_occurrence_time'].to_list(),
                      crime_dimension_df['reported_time'].to_list(), crime_dimension_df['crime_severity'].to_list(),
                                        crime_dimension_df['crime_key'].to_list()):
    crime_dmsn_dict[(ctgr,tp,fot,lot,rt,cs)] = ckey
def get_idx_crime_dmsn(crime_category, crime_type, first_occurrence_time, last_occurrence_time, reported_time, crime_severity):
    return crime_dmsn_dict[(crime_category, crime_type, first_occurrence_time,
                            last_occurrence_time, reported_time, crime_severity)]

# Date
date_dmsn_dict = {}
for cd, dow, woy, q, w, h, hn, dkey in zip(date_dimension_df['crime_date'].to_list(), 
                                            date_dimension_df['day_of_week'].to_list(),
                                            date_dimension_df['week_of_year'].to_list(), 
                                            date_dimension_df['quarter'].to_list(),
                                            date_dimension_df['weekend'].to_list(), 
                                            date_dimension_df['holiday'].to_list(),
                                            date_dimension_df['holiday_name'].to_list(),
                                            date_dimension_df['date_key']):
    date_dmsn_dict[(cd, dow, woy, q, w, h, hn)] = dkey
def get_idx_date_dmsn(crime_date, day_of_week, week_of_year, quarter, weekend, holiday, holiday_name):
    return date_dmsn_dict[(crime_date, day_of_week, week_of_year, quarter, weekend, holiday, holiday_name)]

# Location
location_dmsn_dict = {}
for lo, la, c, n, a, cr, lkey in zip(location_dimension_df['longitude'].to_list(), 
                                            location_dimension_df['latitude'].to_list(),
                                            location_dimension_df['city'].to_list(), 
                                            location_dimension_df['neighbourhood'].to_list(),
                                            location_dimension_df['address'].to_list(), 
                                            location_dimension_df['crime_rate'].to_list(),
                                            location_dimension_df['location_key']):
    location_dmsn_dict[(lo, la, c, n, a, cr)] = lkey
def get_idx_location_dmsn(longitude, latitude, city, neighbourhood, address, crime_rate):
    return location_dmsn_dict[(longitude, latitude, city, neighbourhood, address, crime_rate)]

# Weather
weather_dmsn_dict = {}
for t, wm, wd, h, wkey in zip(weather_dimension_df['temperature'].to_list(), 
                                            weather_dimension_df['weather_main'].to_list(),
                                            weather_dimension_df['weather_description'].to_list(), 
                                            weather_dimension_df['humidity'].to_list(),
                                            weather_dimension_df['weather_key']):
    weather_dmsn_dict[(t, wm, wd, h)] = lkey
def get_idx_weather_dmsn(temperature, weather_main, weather_description, humidity):
    return weather_dmsn_dict[(temperature, weather_main, weather_description, humidity)]


# Event
# TODO: time -> date
def get_idx_event_dmsn(date, city) -> int:
    tmp = event_dimension_df.loc[(event_dimension_df['event_begin_date']<=date)&
                                 (event_dimension_df['event_end_date']>=date)&
                                 (event_dimension_df['city']==city)]['event_key']
    if tmp.size != 0:
        return tmp.iloc[0]
    else:
        return 0

In [7]:
fact_df = pd.DataFrame()
start=time()
fact_df['crime_key'] = df.apply(lambda x: 
               get_idx_crime_dmsn(x.crime_category, x.crime_type, x.first_occurrence_time,
                                  x.last_occurrence_time, x.reported_time, x.crime_severity), axis=1)

fact_df['date_key'] = df.apply(lambda x: 
               get_idx_date_dmsn(x.crime_date, x.day_of_week, x.week_of_year,
                                 x.quarter, x.weekend, x.holiday, x.holiday_name), axis=1)

fact_df['location_key'] = df.apply(lambda x: 
               get_idx_location_dmsn(x.longitude, x.latitude, x.city, x.neighbourhood, x.address, x.crime_rate), axis=1)

fact_df['weather_key'] = df.apply(lambda x: 
               get_idx_weather_dmsn(x.temperature, x.weather_main, x.weather_description, x.humidity), axis=1)

fact_df['event_key'] = df.apply(lambda x: get_idx_event_dmsn(x.crime_date, x.city), axis=1)

fact_df['is_nighttime'] = df.apply(lambda x: x.is_nighttime, axis=1)
fact_df['is_fatal'] = df.apply(lambda x: x.is_fatal, axis=1)
fact_df['is_traffic'] = df.apply(lambda x: x.is_traffic, axis=1)


print('elapsed: %s'%(time()-start))

elapsed: 2652.190036058426


In [8]:
event_dimension_df.drop(columns=['city'], inplace=True)

In [9]:
fact_df.head()

Unnamed: 0,crime_key,date_key,location_key,weather_key,event_key,is_nighttime,is_fatal,is_traffic
0,1,1,1,179726,205,True,False,False
1,2,2,2,179726,0,False,False,False
2,3,3,3,179726,0,False,False,False
3,4,4,4,179726,771,False,False,False
4,5,5,5,179726,567,False,False,False


In [10]:
date_dimension_df.to_csv('../out/date_dimension.csv', index=False)
location_dimension_df.to_csv('../out/location_dimension.csv', index=False)
crime_dimension_df.to_csv('../out/crime_dimension.csv', index=False)
weather_dimension_df.to_csv('../out/weather_dimension.csv', index=False)
event_dimension_df.to_csv('../out/event_dimension.csv', index=False)

fact_df.to_csv('../out/fact.csv', index=False)