# Gold Stage – Step 1: Star Schema + Calendar Table Range

In [1]:
import pandas as pd
import numpy as np

# Calendar Table DIM DF

## Range of Calendar Table

In [None]:
# Get min and max for our Calendar Table.
# The probe below is disabled: it sits inside a bare string literal, so none of
# the print calls run. data_frame_payments and data_frame_trips are not defined
# in the visible part of this notebook.
"""
print (data_frame_payments.date.min())
print (data_frame_payments.date.max())
print (data_frame_trips.start_at.min())
print (data_frame_trips.ended_at.max())

"""

In [2]:
# Function that produces the calendar (date/time) dimension table
def dimension_datetime_frame(start='2023-01-01', end='2023-12-31'):
    """Return a calendar dimension frame with one row per second.

    Parameters
    ----------
    start, end
        Range bounds, forwarded unchanged to ``pandas.date_range``. Both
        bounds are inclusive. A date-only ``end`` such as ``'2023-12-31'``
        resolves to 00:00:00 of that day, so the remainder of that day is
        NOT covered; pass ``'2023-12-31 23:59:59'`` to include it.

    Returns
    -------
    pandas.DataFrame
        Columns ``DateTime``, ``second``, ``minute``, ``hour``, ``day``,
        ``dayofweek`` (Monday=0 ... Sunday=6), ``is_weekend``, ``month``,
        ``Quarter`` and ``Year``.
    """
    # A Timedelta step is used instead of the "S" string alias, which recent
    # pandas releases deprecate in favour of "s".
    one_second = pd.Timedelta(seconds=1)
    df = pd.DataFrame({"DateTime": pd.date_range(start=start, end=end, freq=one_second)})

    parts = df["DateTime"].dt
    df["second"] = parts.second
    df["minute"] = parts.minute
    df["hour"] = parts.hour
    df["day"] = parts.day
    df["dayofweek"] = parts.dayofweek
    # dayofweek 5 and 6 are Saturday and Sunday.
    df["is_weekend"] = df["dayofweek"] > 4
    df["month"] = parts.month
    df["Quarter"] = parts.quarter
    df["Year"] = parts.year
    return df

In [3]:
# Build the calendar dimension with the function's default 2023 range.
# At one row per second this is 31,449,601 rows and about 2.1 GB (see the
# info() output below).
# NOTE(review): 31,449,601 = 364 * 86,400 + 1, i.e. the range stops at
# 2023-12-31 00:00:00 and the rest of 31 December is not covered.
calendar_df = dimension_datetime_frame()

In [5]:
# Print the row count, column dtypes and memory usage of the calendar dimension.
calendar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31449601 entries, 0 to 31449600
Data columns (total 10 columns):
 #   Column      Dtype         
---  ------      -----         
 0   DateTime    datetime64[ns]
 1   second      int64         
 2   minute      int64         
 3   hour        int64         
 4   day         int64         
 5   dayofweek   int64         
 6   is_weekend  bool          
 7   month       int64         
 8   Quarter     int64         
 9   Year        int64         
dtypes: bool(1), datetime64[ns](1), int64(8)
memory usage: 2.1 GB


## Riders dim DF

In [None]:
# Empty, typed schema for the rider dimension. Every column starts as a
# zero-length array so the frame carries the intended dtypes before any silver
# data is appended.
data_dim_riders = {
    'rider_key': np.array([], dtype='int32'),
    'first_name': np.array([], dtype='str'),
    'last_name': np.array([], dtype='str'),
    'address': np.array([], dtype='str'),
    # The unit is spelled out: a unit-less 'datetime64' leaves the resolution
    # for pandas to guess, whereas [ns] matches calendar_df's DateTime column.
    'birthday': np.array([], dtype='datetime64[ns]'),
    'account_start_date': np.array([], dtype='datetime64[ns]'),
    'account_end_date': np.array([], dtype='datetime64[ns]'),
    'is_member': np.array([], dtype='bool'),
}

# rider_key is the key of the dimension, so it becomes the index.
df_dim_riders = pd.DataFrame(data_dim_riders).set_index('rider_key')



### Load Silver data from Source

In [None]:
# Read the silver rider Delta table and convert it to pandas.
# The chain is parenthesised because a line starting with ".load(...)" is a
# SyntaxError on its own, and .toPandas() is required because the following
# cells pass this frame to pd.concat, which only accepts pandas objects.
# `spark` is the session object supplied by the notebook runtime.
df_silver_rider = (
    spark.read.format("delta")
    .load("delta/silver_rider")
    .toPandas()
)

### Concat

In [None]:
# Append the silver riders to the dimension.
# rider_key is moved from the index back to a column first: ignore_index=True
# discards the existing index, which would otherwise drop the keys of any rows
# already in df_dim_riders.
# NOTE(review): assumes df_silver_rider exposes a rider_key column - confirm.
df_dim_riders = pd.concat([df_dim_riders.reset_index(), df_silver_rider], ignore_index=True)

### Set index back

In [None]:
# Re-key the rider dimension on rider_key after the concat.
df_dim_riders = df_dim_riders.set_index(keys='rider_key')

## Station dim DF

In [None]:

# Zero-length, typed columns describing the station dimension.
data_dim_station = {
    'station_key': np.array([], dtype=str),
    'station_name': np.array([], dtype=str),
    'latitude': np.array([], dtype=np.float64),
    'longitude': np.array([], dtype=np.float64),
}

# Empty station dimension, keyed by station_key.
df_dim_station = pd.DataFrame(data=data_dim_station).set_index(keys='station_key')


### Load Silver Data Source


In [None]:
# Read the silver station Delta table and convert it to pandas.
# The chain is parenthesised because a line starting with ".load(...)" is a
# SyntaxError on its own, and .toPandas() is required because the following
# cells pass this frame to pd.concat, which only accepts pandas objects.
# `spark` is the session object supplied by the notebook runtime.
df_silver_station = (
    spark.read.format("delta")
    .load("delta/silver_station")
    .toPandas()
)

### Concatenate df_silver_station with df_dim_station

In [None]:
# Append the silver stations to the station dimension.
# pd.concat only accepts pandas objects, so the empty dimension frame
# (df_dim_station) is concatenated here, not the data_dim_station dict it was
# built from. station_key is moved from the index back to a column first,
# because ignore_index=True would discard it.
# The result is still bound to data_dim_station, the name the next cell reads.
data_dim_station = pd.concat([df_dim_station.reset_index(), df_silver_station], ignore_index=True)

### Set the index back

In [None]:
# Key the combined station rows on station_key and publish the result as
# df_dim_station, following the df_dim_riders naming. data_dim_station is
# bound to the same frame so code that still reads the old name keeps working.
df_dim_station = data_dim_station = data_dim_station.set_index('station_key')

## Trip Fact DF

In [None]:
# Empty, typed schema for the trip fact table.
data_fact_trip = {
    'trip_key': np.array([], dtype='str'),
    'rideable_type': np.array([], dtype='str'),
    # Explicit [ns] unit, matching calendar_df's DateTime column.
    'start_date_id': np.array([], dtype='datetime64[ns]'),
    'ended_date_id': np.array([], dtype='datetime64[ns]'),
    'start_station_id': np.array([], dtype='str'),
    'end_station_id': np.array([], dtype='str'),
    'rider_id': np.array([], dtype='int64'),
    # Age is a whole number of years, not a timestamp.
    'age': np.array([], dtype='int64'),
    # Duration is the difference of two timestamps, i.e. a timedelta.
    'trip_duration': np.array([], dtype='timedelta64[ns]'),
}

# trip_key is the key of the fact table, so it becomes the index.
df_fact_trip = pd.DataFrame(data_fact_trip).set_index('trip_key')

### Load Silver Data

In [None]:
# Read the silver trip Delta table and convert it to pandas.
# The chain is parenthesised because a line starting with ".load(...)" is a
# SyntaxError on its own, and .toPandas() is required because the following
# cells pass this frame to pd.merge, which only accepts pandas objects.
# `spark` is the session object supplied by the notebook runtime.
df_silver_trip = (
    spark.read.format("delta")
    .load("delta/silver_trip")
    .toPandas()
)

### Merge the trip, dimDate (twice), and dimRiders data frames based on the required columns


In [None]:
# Join each trip to the calendar dimension twice (start and end timestamp) and
# then to the rider dimension. All three are inner joins, so a trip is dropped
# when either timestamp has no exact match in calendar_df or its rider is not
# in df_dim_riders.
df_fact_trip_batch = pd.merge(
    df_silver_trip[['trip_id', 'rideable_type', 'start_at', 'ended_at', 'start_station_id', 'end_station_id', 'rider_id']],
    calendar_df[['DateTime']],
    left_on='start_at', right_on='DateTime', how='inner',
)
# The second calendar join collides on the 'DateTime' name; pandas' default
# suffixes turn the two copies into 'DateTime_x' (start) and 'DateTime_y' (end).
df_fact_trip_batch = pd.merge(
    df_fact_trip_batch,
    calendar_df[['DateTime']],
    left_on='ended_at', right_on='DateTime', how='inner',
)
# rider_key is the index of df_dim_riders at this point, so it is turned back
# into a column before being selected; selecting it directly raises KeyError.
df_fact_trip_batch = pd.merge(
    df_fact_trip_batch,
    df_dim_riders.reset_index()[['rider_key', 'birthday']],
    left_on='rider_id', right_on='rider_key', how='inner',
)

### Create Age Column

In [None]:
# Method 1 (abandoned, was flagged FIX ME): casting the timedelta to whole
# years produced wrong values. Kept here for reference only:
#   df_fact_trip_batch['age'] = (df_fact_trip_batch['start_at'] - df_fact_trip_batch['birthday']).astype('<m8[Y]')

# Method 2: compare calendar dates directly.
def calculate_age(birthday, start_at):
    """Return the rider's age in completed years at the moment a trip starts.

    Parameters
    ----------
    birthday, start_at
        Any objects exposing ``year``, ``month`` and ``day`` attributes
        (``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).

    Returns
    -------
    int
        ``start_at.year - birthday.year``, minus one when the birthday has not
        yet been reached in the start year.
    """
    # Comparing (month, day) tuples avoids building the anniversary date for
    # the start year, which fails for a 29 February birthday in a non-leap
    # year. Such riders turn a year older on 1 March in non-leap years.
    birthday_pending = (start_at.month, start_at.day) < (birthday.month, birthday.day)
    return start_at.year - birthday.year - int(birthday_pending)

# Compute the age for every trip and store it in a new column called age.
# df_fact_trip_batch is a pandas frame (it is the result of pd.merge), so the
# Spark udf / withColumn API does not apply to it; calculate_age is applied to
# the paired birthday and start_at values instead.
df_fact_trip_batch['age'] = [
    calculate_age(birthday, start_at)
    for birthday, start_at in zip(df_fact_trip_batch['birthday'], df_fact_trip_batch['start_at'])
]

### Create Trip Duration Column

In [None]:
# Trip duration is the end timestamp minus the start timestamp, per row.
df_fact_trip_batch['trip_duration'] = df_fact_trip_batch['ended_at'].sub(df_fact_trip_batch['start_at'])

### Rename the 'DateTime' columns to 'start_date_id' and 'ended_date_id', and select the required columns:


In [None]:
# Give the batch the column names of the trip fact schema.
# 'DateTime_x' / 'DateTime_y' are the start and end calendar keys produced by
# the two calendar joins. 'trip_id' becomes 'trip_key', the name the fact table
# is indexed on; without this rename set_index('trip_key') later fails.
df_fact_trip_batch = df_fact_trip_batch.rename(columns={
    'trip_id': 'trip_key',
    'DateTime_x': 'start_date_id',
    'DateTime_y': 'ended_date_id',
})
# Keep only the fact columns; this drops the helper columns start_at,
# ended_at, rider_key and birthday.
df_fact_trip_batch = df_fact_trip_batch[['trip_key', 'rideable_type', 'start_date_id', 'ended_date_id', 'start_station_id', 'end_station_id', 'rider_id', 'age', 'trip_duration']]

### Concat

In [None]:
# Append the batch to the trip fact table.
# trip_key is moved from the index back to a column first: ignore_index=True
# discards the existing index, which would otherwise drop the keys of any rows
# already in df_fact_trip.
df_fact_trip = pd.concat([df_fact_trip.reset_index(), df_fact_trip_batch], ignore_index=True)

### Set index back

In [None]:
# Re-key the trip fact table on trip_key after the concat.
df_fact_trip = df_fact_trip.set_index(keys='trip_key')

## Payments Fact DF

In [None]:
# Empty, typed schema for the payments fact table.
data_fact_payments = {
    'payment_id': np.array([], dtype='int64'),
    # The unit is spelled out: a unit-less 'datetime64' leaves the resolution
    # for pandas to guess, whereas [ns] matches calendar_df's DateTime column.
    'date_id': np.array([], dtype='datetime64[ns]'),
    'rider_id': np.array([], dtype='int64'),
    # NOTE(review): amount is declared as an integer - confirm the silver
    # amounts are whole numbers.
    'amount': np.array([], dtype='int64'),
}

# payment_id is the key of the fact table, so it becomes the index.
df_fact_payments = pd.DataFrame(data_fact_payments).set_index('payment_id')

### Load Silver Data Source

In [None]:
# Read the silver payments Delta table and convert it to pandas.
# The chain is parenthesised because a line starting with ".load(...)" is a
# SyntaxError on its own, and .toPandas() is required because the following
# cells pass this frame to pd.merge, which only accepts pandas objects.
# `spark` is the session object supplied by the notebook runtime.
df_silver_payments = (
    spark.read.format("delta")
    .load("delta/silver_payments")
    .toPandas()
)

### Merge df_silver_payments with calendar_df

In [None]:
# Join each payment to the calendar dimension on its timestamp. This is an
# inner join, so payments whose date has no exact match in calendar_df are
# dropped.
df_fact_payments_batch = pd.merge(
    df_silver_payments[['payment_id', 'date', 'rider_id', 'amount']],
    calendar_df[['DateTime']],
    left_on='date', right_on='DateTime', how='inner',
)
# The matched calendar key is the fact table's date_id. Renaming it and
# dropping the raw 'date' column makes the batch line up with the
# payment_id / date_id / rider_id / amount schema of df_fact_payments, instead
# of leaving date_id empty next to extra 'date' and 'DateTime' columns.
df_fact_payments_batch = df_fact_payments_batch.rename(columns={'DateTime': 'date_id'})
df_fact_payments_batch = df_fact_payments_batch[['payment_id', 'date_id', 'rider_id', 'amount']]

### Concatenate df_fact_payments_batch with df_fact_payments

In [None]:
# Append the batch to the payments fact table.
# payment_id is moved from the index back to a column first: ignore_index=True
# discards the existing index, which would otherwise drop the keys of any rows
# already in df_fact_payments.
df_fact_payments = pd.concat([df_fact_payments.reset_index(), df_fact_payments_batch], ignore_index=True)

### Set the index back

In [None]:
# Re-key the payments fact table on payment_id after the concat.
df_fact_payments = df_fact_payments.set_index(keys='payment_id')