In [1]:
import pandas as pd
pd.set_option('display.max_columns', 30)
import requests
%run "00_common_res.ipynb"

In [None]:
"""
implementation plan

1. get data from S3
2. weather data transformations
3. taxi data transformations *** done
4. update payment types table *** done
5. update companies table *** done
6. update taxi data with company and payment types ids (replace strings with id from latest tables)  *** done
7-8. upload taxi and weather data to s3
9. upload new company and payment types table to s3
"""

### get data

In [3]:
# get 1 day of trip data from the Chicago data portal API
# (formatted_date is not defined in this notebook; presumably it comes from 00_common_res.ipynb)
# NOTE: $limit=30000 caps the response, a day with more trips would be truncated
url_chportal_taxi_api = (f"https://data.cityofchicago.org/resource/ajtu-isnz.json?"
    f"$where=trip_start_timestamp >= '{formatted_date}T00:00:00' "
    f"AND trip_start_timestamp <= '{formatted_date}T23:59:59'&$limit=30000")
# timeout: without it the request can block forever on a stalled connection
res_chportal_api = requests.get(url_chportal_taxi_api, timeout=60)
# stop here on an HTTP error instead of building a DataFrame from an error payload
res_chportal_api.raise_for_status()
# parse the JSON body
js_taxi_data = res_chportal_api.json()
# convert the JSON records to a DataFrame
df_taxi_data = pd.DataFrame(js_taxi_data)

### Transformation part

#### taxi data transformation function

In [None]:
"""
# typing help for dict creation:-)
df_col_types = df_taxi_data.dtypes
for s_colname in df_col_types.keys():
    print(f'"{s_colname}": "{df_col_types[s_colname]}",')
"""

In [5]:
def taxi_data_transformations(df_taxi_data):
    """ Clean the raw daily taxi trips and cast the columns to their final types.
    ver: v1.2
    Params:
        df_taxi_data (df): original daily taxi data.
            NOTE: this frame is modified in place (columns dropped and renamed,
            rows with missing values removed, helper column added).
    Returns:
        df: cleaned & transformed taxi data. The dtype casts exist only on this
            returned frame, so callers have to use the return value.
    Raises:
        KeyError: if a column required by the dtype mapping is missing
        ValueError: if a value cannot be converted to its target dtype
    requirements:
        module(s): pandas
    """
    # drop unnecessary columns
    # errors="ignore": a listed column may already be absent (e.g. the same frame was
    # processed before), which must not abort the run
    ls_unused_cols = [
        "pickup_census_tract", "dropoff_census_tract",
        "pickup_centroid_location", "dropoff_centroid_location"]
    df_taxi_data.drop(columns=ls_unused_cols, errors="ignore", inplace=True)

    # drop rows that have missing values in any remaining column
    df_taxi_data.dropna(inplace=True)

    # renaming cols
    di_col_old_new = {
        "pickup_community_area": "pickup_community_area_id",
        "dropoff_community_area": "dropoff_community_area_id"}
    df_taxi_data.rename(columns=di_col_old_new, inplace=True)

    # helper column: trip start floored to the full hour
    # (going by its name it is meant as the join key to the hourly weather data)
    df_taxi_data["datetime_for_weather"] = (
        pd.to_datetime(df_taxi_data["trip_start_timestamp"]).dt.floor("h"))

    # target dtype of every column that is kept
    di_df_col_type = {
        "trip_id": "object",
        "taxi_id": "object",
        "trip_start_timestamp": "datetime64[ns]",
        "trip_end_timestamp": "datetime64[ns]",
        "trip_seconds": "int32",
        "trip_miles": "float64",
        "pickup_community_area_id": "int8",
        "dropoff_community_area_id": "int8",
        "fare": "float64",
        "tips": "float64",
        "tolls": "float64",
        "extras": "float64",
        "trip_total": "float64",
        "payment_type": "object",
        "company": "object",
        "pickup_centroid_latitude": "object",
        "pickup_centroid_longitude": "object",
        "dropoff_centroid_latitude": "object",
        "dropoff_centroid_longitude": "object",
        "datetime_for_weather": "datetime64[ns]"
    }
    # astype returns a new frame; the input keeps its original dtypes
    return df_taxi_data.astype(di_df_col_type)

# keep the returned frame: the dtype casts are applied only to the function's
# return value, not to the frame that was passed in
df_taxi_data = taxi_data_transformations(df_taxi_data)

#### update master table function develop

In [6]:
def master_table_update(df_master, df_dim, s_id_name, s_column_label):
    """ compare master and dim table. Expand master with new items if necessary.
    ver: 1.1
    Params:
        df_master (df): master data table
        df_dim (df): Actual dim (extract) table from taxi data
        s_id_name (str): master table id column name (integer ids expected)
        s_column_label (str): master table label column name
    Returns:
        df: extended master table with new item(s); the unchanged df_master object
            when df_dim holds no label that is missing from the master table
    requirements:
        module(s): pandas
    """
    s_my_name = 'master_table_update'
    # labels of the dim extract that the master table does not know yet, kept in
    # order of first appearance so the id assignment is the same on every run
    # (iterating a set difference gives no stable order)
    sr_dim_labels = df_dim[s_column_label].drop_duplicates()
    ls_dim = sr_dim_labels[~sr_dim_labels.isin(df_master[s_column_label])].to_list()
    # if there are no new element(s), return the original dataFrame
    if not ls_dim:
        print(f'Function {s_my_name}: No new element was added to the master table')
        return df_master
    # continue after the highest existing id; counting rows instead would reuse
    # existing ids whenever the master ids are not a gapless 1..N sequence
    n_last_id = int(df_master[s_id_name].max()) if len(df_master) else 0
    ls_master_id = list(range(n_last_id + 1, n_last_id + len(ls_dim) + 1))
    # new rows as a dataFrame with the same two columns as the master table
    df_add = pd.DataFrame({s_id_name: ls_master_id, s_column_label: ls_dim})
    # append the new rows to the master table
    df_master = pd.concat([df_master, df_add], ignore_index=True)
    print(f'Function {s_my_name}: master table was updated!')
    return df_master

# alternative comparison
# pd.merge(df_companies_new, df_companies['company'], on= 'company', how='left', indicator=True).query('_merge == "left_only"').drop('_merge', axis=1).reset_index(drop=True)


#### create simulation data

In [7]:
# each entry: [label column in the taxi data, name of the dim table variable to create]
ls_dim_names = [
    ['payment_type', 'df_payment_types_new'],
    ['company', 'df_companies_new'],
]

# create one dim table per entry
for ls_dim_names_row in ls_dim_names:
    s_label_col, s_df_name = ls_dim_names_row

    # distinct labels in ascending order with a fresh 0..n-1 index
    sr_labels = df_taxi_data[s_label_col].drop_duplicates().sort_values(ignore_index=True)

    # dim table: running id starting at 1 next to the label
    df_dim_tmp = pd.DataFrame({
        f'{s_label_col}_id': range(1, len(sr_labels) + 1),
        s_label_col: sr_labels,
    })

    # publish the table under its configured variable name
    globals()[s_df_name] = df_dim_tmp

    ## save (disabled)
    # s_path_dim = s_path_sep.join([s_base_path, s_dir_data, ls_dim_names_row[1]+'.csv'])
    # df_export_to_csv(df_dim_tmp, s_path_dim, s_dir_data)


In [8]:
# simulated (outdated) master table: only the first 24 companies, the tail is cut off
df_companies = df_companies_new.iloc[:24]
# simulated current dim extract: without the first three rows and without the id column
df_companies_new = (
    df_companies_new
    .drop(index=[0, 1, 2])
    .reset_index(drop=True)
    .drop(columns='company_id')
)

In [9]:
# simulated (outdated) master table: only the first 3 payment types, the tail is cut off
df_payment_types = df_payment_types_new.iloc[:3]
# simulated current dim extract: without the first two rows and without the id column
df_payment_types_new = (
    df_payment_types_new
    .drop(index=[0, 1])
    .reset_index(drop=True)
    .drop(columns='payment_type_id')
)

#### function testings

In [None]:
# Test companies: extend the simulated master table with the labels of the simulated extract
df_companies = master_table_update(
    df_master=df_companies,
    df_dim=df_companies_new,
    s_id_name='company_id',
    s_column_label='company',
)
df_companies

In [None]:
# Test payment_types: extend the simulated master table with the labels of the simulated extract
df_payment_types = master_table_update(
    df_master=df_payment_types,
    df_dim=df_payment_types_new,
    s_id_name='payment_type_id',
    s_column_label='payment_type',
)
df_payment_types

#### update taxi data with actual master tables, replace labels to id

In [12]:
def replace_label_to_master_id(df_data, df_master, s_join_colname):
    """ join master table id to df_data and remove original label column
    ver: 1.1
    Params:
        df_data (df): main dataFrame
        df_master (df): master table to join (labels must be unique)
        s_join_colname (str): join on column name
    Returns:
        df: main dataFrame with the master table column(s) added and the label
            column removed. Every row of df_data is kept; rows whose label is not
            in the master table get a missing id.
    Raises:
        pandas.errors.MergeError: if a label occurs more than once in df_master
    requirements:
        module(s): pandas
    """
    s_my_name = 'replace_label_to_master_id'
    # report labels the master table does not know, so the gap is visible
    n_unmatched = int((~df_data[s_join_colname].isin(df_master[s_join_colname])).sum())
    if n_unmatched:
        print(f'Function {s_my_name}: {n_unmatched} row(s) have a {s_join_colname} '
              f'that is missing from the master table')
    # how='left': an inner join would silently drop the unmatched rows
    # validate: duplicate master labels would silently multiply the data rows
    df_data = df_data.merge(df_master, on=s_join_colname, how='left', validate='many_to_one')
    # drop label column (returns a new frame, the merge result is not reused)
    return df_data.drop(columns=[s_join_colname])

In [None]:
# swap the label columns for the ids of the current master tables, one table at a time
# (plain equivalent of a single step: df_taxi_data.merge(df_payment_types, on='payment_type'))
for df_master_tbl, s_label_col in ((df_companies, 'company'), (df_payment_types, 'payment_type')):
    df_taxi_data = replace_label_to_master_id(
        df_data=df_taxi_data,
        df_master=df_master_tbl,
        s_join_colname=s_label_col,
    )

df_taxi_data.head()

#### weather transformation function

In [14]:
# get meteo data: hourly values of one day from the Open-Meteo archive API
# (formatted_date is not defined in this notebook; presumably it comes from 00_common_res.ipynb)
url = "https://archive-api.open-meteo.com/v1/era5"
di_params = {
    "latitude": 41.85,
    "longitude": -87.65,
    "start_date": formatted_date,
    "end_date": formatted_date,
    "hourly": "temperature_2m,wind_speed_10m,rain,precipitation"
}
# timeout: without it the request can block forever on a stalled connection
response = requests.get(url, params=di_params, timeout=60)
# stop here on an HTTP error instead of handing an error payload to the transformation
response.raise_for_status()
js_weather_data = response.json()

In [15]:
def proc_daily_wheater_data(js_weather_data):
    """ select the hourly weather series and return them as a dataFrame
    ver: 1.0
    Params:
        js_weather_data (dict): parsed weather response; its "hourly" entry must hold
            the keys time, temperature_2m, wind_speed_10m, rain and precipitation
    Returns:
        df: columns datetime (converted to datetime), tempretaure, wind_speed,
            rain, precipitation
    requirements:
        module(s): pandas
    """
    js_hourly = js_weather_data["hourly"]
    # source key in the response -> column name in the result
    # NOTE: "tempretaure" is misspelled but kept, consumers may rely on the column name
    di_src_to_col = {
        "time": "datetime",
        "temperature_2m": "tempretaure",
        "wind_speed_10m": "wind_speed",
        "rain": "rain",
        "precipitation": "precipitation",
    }
    df_weather = pd.DataFrame(
        {s_col: js_hourly[s_src] for s_src, s_col in di_src_to_col.items()})
    # the timestamps arrive as strings
    df_weather["datetime"] = pd.to_datetime(df_weather["datetime"])
    return df_weather


In [None]:
# run the weather transformation on the fetched payload; as the last expression of
# the cell the resulting dataFrame is displayed, it is not stored in a variable
proc_daily_wheater_data(js_weather_data)