In [1]:
import pandas as pd
import numpy as np
from time import time

In [2]:
# Locations of the two crime CSVs that are stacked into `df`, and of the event CSV
# that becomes the event dimension.
dfdv_path, dfvc_path = '../out/dv.csv', '../out/vc.csv'
event_path = '../data/event_data.csv'

In [3]:
# Stack the two crime extracts into one frame (dv rows first, then vc rows) with a
# fresh 0..n-1 index. `DataFrame.append` was removed in pandas 2.0; `pd.concat`
# with the same `ignore_index`/`sort` arguments is the supported equivalent.
df = pd.concat(
    [pd.read_csv(dfdv_path), pd.read_csv(dfvc_path)],
    ignore_index=True,
    sort=False,
)

In [4]:
# Event dimension: read straight from the event CSV, parsing the begin/end
# columns as datetimes.
parse_dates = ['event_begin_time', 'event_end_time']
event_dimension_df = pd.read_csv(event_path, parse_dates=parse_dates)

# Give 0 for unknown size. `fillna` does this in one vectorised step instead of
# calling a Python lambda once per row.
event_dimension_df['event_location_size'] = event_dimension_df['event_location_size'].fillna(0)

In [5]:
def _build_dimension(source, columns, key_name):
    """Project `columns` out of `source`, drop repeated rows, and number the
    remaining rows from 1 in a new trailing `key_name` column."""
    dimension = source[columns].drop_duplicates().reset_index(drop=True)
    dimension[key_name] = dimension.index + 1
    return dimension


# One dimension table per group of descriptive columns found in the crime frame.
date_dimension_df = _build_dimension(
    df,
    ['crime_date', 'day_of_week', 'week_of_year', 'quarter', 'weekend',
     'holiday', 'holiday_name'],
    'date_key',
)

location_dimension_df = _build_dimension(
    df,
    ['longitude', 'latitude', 'city', 'neighbourhood', 'address', 'crime_rate'],
    'location_key',
)

crime_dimension_df = _build_dimension(
    df,
    ['crime_category', 'crime_type', 'first_occurrence_time',
     'last_occurrence_time', 'reported_time', 'crime_severity'],
    'crime_key',
)

weather_dimension_df = _build_dimension(
    df,
    ['temperature', 'weather_main', 'weather_description', 'humidity'],
    'weather_key',
)

In [6]:
def get_idx_crime_dmsn(crime_category, crime_type, first_occurrence_time, last_occurrence_time, reported_time, crime_severity):
    """Return the ``crime_key`` of the row in ``crime_dimension_df`` that matches
    every given attribute.

    Missing values match missing values: a NaN/None argument selects rows whose
    column is null as well. (A plain ``==`` never matches NaN, which would make
    every lookup with a missing attribute fail.)

    Returns:
        The ``crime_key`` of the first matching row.

    Raises:
        IndexError: if no row of ``crime_dimension_df`` matches.
    """
    wanted = {
        'crime_category': crime_category,
        'crime_type': crime_type,
        'first_occurrence_time': first_occurrence_time,
        'last_occurrence_time': last_occurrence_time,
        'reported_time': reported_time,
        'crime_severity': crime_severity,
    }
    matches = pd.Series(True, index=crime_dimension_df.index)
    for column, value in wanted.items():
        column_values = crime_dimension_df[column]
        # Null-aware equality: compare nulls with isna(), everything else with ==.
        matches &= column_values.isna() if pd.isna(value) else column_values == value
    keys = crime_dimension_df.loc[matches, 'crime_key']
    if keys.empty:
        raise IndexError('no crime dimension row matches %r' % (wanted,))
    return keys.iloc[0]
    
def get_idx_date_dmsn(crime_date, day_of_week, week_of_year, quarter, weekend, holiday, holiday_name):
    """Return the ``date_key`` of the row in ``date_dimension_df`` that matches
    every given attribute.

    Missing values match missing values: a NaN/None argument (for example a
    null ``holiday_name``) selects rows whose column is null as well. (A plain
    ``==`` never matches NaN, which would make such lookups fail.)

    Returns:
        The ``date_key`` of the first matching row.

    Raises:
        IndexError: if no row of ``date_dimension_df`` matches.
    """
    wanted = {
        'crime_date': crime_date,
        'day_of_week': day_of_week,
        'week_of_year': week_of_year,
        'quarter': quarter,
        'weekend': weekend,
        'holiday': holiday,
        'holiday_name': holiday_name,
    }
    matches = pd.Series(True, index=date_dimension_df.index)
    for column, value in wanted.items():
        column_values = date_dimension_df[column]
        # Null-aware equality: compare nulls with isna(), everything else with ==.
        matches &= column_values.isna() if pd.isna(value) else column_values == value
    keys = date_dimension_df.loc[matches, 'date_key']
    if keys.empty:
        raise IndexError('no date dimension row matches %r' % (wanted,))
    return keys.iloc[0]
    
def get_idx_location_dmsn(longitude, latitude, city, neighbourhood, address, crime_rate):
    """Return the ``location_key`` of the row in ``location_dimension_df`` that
    matches every given attribute.

    Missing values match missing values: a NaN/None argument selects rows whose
    column is null as well. (A plain ``==`` never matches NaN, which would make
    every lookup with a missing attribute fail.)

    Returns:
        The ``location_key`` of the first matching row.

    Raises:
        IndexError: if no row of ``location_dimension_df`` matches.
    """
    wanted = {
        'longitude': longitude,
        'latitude': latitude,
        'city': city,
        'neighbourhood': neighbourhood,
        'address': address,
        'crime_rate': crime_rate,
    }
    matches = pd.Series(True, index=location_dimension_df.index)
    for column, value in wanted.items():
        column_values = location_dimension_df[column]
        # Null-aware equality: compare nulls with isna(), everything else with ==.
        matches &= column_values.isna() if pd.isna(value) else column_values == value
    keys = location_dimension_df.loc[matches, 'location_key']
    if keys.empty:
        raise IndexError('no location dimension row matches %r' % (wanted,))
    return keys.iloc[0]
    
def get_idx_weather_dmsn(temperature, weather_main, weather_description, humidity):
    """Return the ``weather_key`` of the row in ``weather_dimension_df`` that
    matches every given attribute.

    Missing values match missing values: a NaN/None argument selects rows whose
    column is null as well. (A plain ``==`` never matches NaN, which would make
    every lookup with a missing attribute fail.)

    Returns:
        The ``weather_key`` of the first matching row.

    Raises:
        IndexError: if no row of ``weather_dimension_df`` matches.
    """
    wanted = {
        'temperature': temperature,
        'weather_main': weather_main,
        'weather_description': weather_description,
        'humidity': humidity,
    }
    matches = pd.Series(True, index=weather_dimension_df.index)
    for column, value in wanted.items():
        column_values = weather_dimension_df[column]
        # Null-aware equality: compare nulls with isna(), everything else with ==.
        matches &= column_values.isna() if pd.isna(value) else column_values == value
    keys = weather_dimension_df.loc[matches, 'weather_key']
    if keys.empty:
        raise IndexError('no weather dimension row matches %r' % (wanted,))
    return keys.iloc[0]

def get_idx_event_dmsn(date, city) -> int:
    """Return the ``event_key`` of an event in ``city`` whose span covers ``date``.

    The comparison is made on calendar days: the event begin/end timestamps and
    ``date`` are all truncated to midnight first. Without that, a date-only
    value (midnight) is never "after" an event that begins later the same day,
    so same-day events would be missed.

    Args:
        date: the day to look up; anything ``pd.to_datetime`` accepts.
        city: value compared for equality against the ``city`` column.

    Returns:
        The key of the first matching event, or 0 when there is none
        (including when ``date`` is missing).
    """
    day = pd.to_datetime(date)
    if pd.isna(day):
        # A missing date can never fall inside an event span.
        return 0
    day = day.normalize()

    first_day = pd.to_datetime(event_dimension_df['event_begin_time']).dt.normalize()
    last_day = pd.to_datetime(event_dimension_df['event_end_time']).dt.normalize()
    covering = event_dimension_df.loc[
        (first_day <= day) & (last_day >= day) & (event_dimension_df['city'] == city),
        'event_key',
    ]
    if covering.size != 0:
        return covering.iloc[0]
    return 0

In [None]:
def _lookup_dimension_keys(dimension_df, key_column):
    """Return, for every row of ``df`` in order, its surrogate key in ``dimension_df``.

    The natural key is every column of ``dimension_df`` except ``key_column``.
    A single left merge replaces one full scan of the dimension per fact row,
    and pandas matches null join keys to each other, so rows with missing
    attributes resolve as well.

    Raises:
        IndexError: if some row of ``df`` has no matching dimension row.
    """
    natural_columns = [name for name in dimension_df.columns if name != key_column]
    # Keep the first row per natural key so the merge yields exactly one row per fact.
    unique_rows = dimension_df.drop_duplicates(subset=natural_columns)
    joined = df[natural_columns].merge(unique_rows, on=natural_columns, how='left')
    if joined[key_column].isna().any():
        raise IndexError('some rows have no match in the %s dimension' % key_column)
    return joined[key_column].to_numpy()


start = time()
fact_df = pd.DataFrame(index=df.index)

fact_df['crime_key'] = _lookup_dimension_keys(crime_dimension_df, 'crime_key')
fact_df['date_key'] = _lookup_dimension_keys(date_dimension_df, 'date_key')
fact_df['location_key'] = _lookup_dimension_keys(location_dimension_df, 'location_key')
fact_df['weather_key'] = _lookup_dimension_keys(weather_dimension_df, 'weather_key')

# Events are matched on a date range plus city, not on equality, so this stays a
# per-row lookup.
fact_df['event_key'] = df.apply(lambda x: get_idx_event_dmsn(x.crime_date, x.city), axis=1)

print('elapsed: %s'%(time()-start))

In [None]:
# Preview the first rows of the assembled fact table.
fact_df.head(n=5)

In [None]:
# Persist the dimension tables. With index=True the 0-based positional index is
# written as an unnamed leading column next to the 1-based *_key column.
# NOTE(review): confirm downstream loaders expect that extra column.
date_dimension_df.to_csv('../out/date_dimension.csv', index=True)
location_dimension_df.to_csv('../out/location_dimension.csv', index=True)
crime_dimension_df.to_csv('../out/crime_dimension.csv', index=True)
weather_dimension_df.to_csv('../out/weather_dimension.csv', index=True)
event_dimension_df.to_csv('../out/event_dimension.csv', index=False)

# Number the facts 1..n. Assigning the range outright (rather than `index += 1`)
# keeps this cell idempotent: re-running it no longer shifts fact_key by one
# more each time.
fact_df.index = pd.RangeIndex(1, len(fact_df) + 1)
fact_df.to_csv('../out/fact.csv', index=True, index_label='fact_key')