## Chicago portal taxi data EDA & transformation parts
### Exploratory Data Analysis (Felfedező Adatelemzés) 

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 30)
import requests
%run "00_common_res.ipynb"
import os
%run func/df_to_csv.ipynb

# from datetime import datetime
# from  dateutil.relativedelta import relativedelta
# formatted_date = datetime.now().date() - relativedelta(months=2)

In [None]:
# Get one day of trip data from the Chicago Data Portal API.
# `formatted_date` is not defined in this notebook — presumably it is provided
# by 00_common_res.ipynb (TODO confirm).
# NOTE: `$limit=30000` caps the result; a day with more trips than that would
# be returned truncated.

url_chportal_taxi_api = (f"https://data.cityofchicago.org/resource/ajtu-isnz.json?"
    f"$where=trip_start_timestamp >= '{formatted_date}T00:00:00' "
    f"AND trip_start_timestamp <= '{formatted_date}T23:59:59'&$limit=30000")

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res_chportal_api = requests.get(url_chportal_taxi_api, timeout=60)
# Fail here on an HTTP error instead of parsing an error payload as trip data.
res_chportal_api.raise_for_status()
res_chportal_api


In [None]:
# Parse the response body as JSON.
js_taxi_data = res_chportal_api.json()

# A bare `len(...)` in the middle of a cell is evaluated and thrown away;
# print it so the record count is actually shown.
print(len(js_taxi_data))
js_taxi_data

In [None]:
# last run size: 23017 records
# print (len(js_taxi_data))

In [None]:
# Build a DataFrame from the parsed JSON payload.
df_taxi_data = pd.DataFrame(data=js_taxi_data)

# Remember the initial deep memory footprint (bytes / 1024) so it can be
# compared with the footprint after cleaning and dtype conversion.
ser_mem_bytes = df_taxi_data.memory_usage(deep=True)
i_df_taxi_data = int(ser_mem_bytes.sum() / 1024)

df_taxi_data.head(n=5)

In [None]:
# Show up to 8 random trips that have no fare. Asking sample() for more rows
# than exist raises ValueError, so the sample size is capped at the number of
# matching rows.
df_fare_missing = df_taxi_data[df_taxi_data["fare"].isna()]
df_fare_missing.sample(min(8, len(df_fare_missing)))

### Transformation part

In [None]:
# Remove the census-tract and centroid-location columns in place.
ls_cols_to_drop = [
    "pickup_census_tract",
    "dropoff_census_tract",
    "pickup_centroid_location",
    "dropoff_centroid_location",
]
for s_col_to_drop in ls_cols_to_drop:
    df_taxi_data.drop(columns=[s_col_to_drop], inplace=True)

df_taxi_data.info()

In [None]:
# Discard, in place, every row that has a missing value in any column.
df_taxi_data.dropna(axis=0, how="any", inplace=True)

In [None]:
# Append an "_id" suffix to the two community-area columns.
di_col_old_new = {
    s_col_old: f"{s_col_old}_id"
    for s_col_old in ("pickup_community_area", "dropoff_community_area")
}
df_taxi_data.rename(columns=di_col_old_new, inplace=True)

df_taxi_data.columns

In [None]:
# Helper column: the trip start parsed as a datetime.
ser_trip_start = df_taxi_data["trip_start_timestamp"]
df_taxi_data["datetime_for_weather"] = pd.to_datetime(ser_trip_start)

#### Dataframe column types definitions/conversions

In [None]:
# Typing aid: print every column with its current dtype as `"name": "dtype",`
# lines that can be pasted into a dtype mapping.
df_col_types = df_taxi_data.dtypes
for s_colname, o_dtype in df_col_types.items():
    print(f'"{s_colname}": "{o_dtype}",')

In [None]:
# Target dtype for every column, grouped by dtype so the typing decisions are
# easy to scan. The mapping is flattened into a {column: dtype} dict below.
tu_dtype_groups = (
    ("object", (
        "trip_id",
        "taxi_id",
        "payment_type",
        "company",
        "pickup_centroid_latitude",
        "pickup_centroid_longitude",
        "dropoff_centroid_latitude",
        "dropoff_centroid_longitude",
    )),
    ("datetime64[ns]", (
        "trip_start_timestamp",
        "trip_end_timestamp",
        "datetime_for_weather",
    )),
    ("int32", ("trip_seconds",)),
    ("int8", ("pickup_community_area_id", "dropoff_community_area_id")),
    ("float64", ("trip_miles", "fare", "tips", "tolls", "extras", "trip_total")),
)
di_df_col_type = {
    s_col: s_dtype
    for s_dtype, tu_cols in tu_dtype_groups
    for s_col in tu_cols
}

# Apply the conversions to the DataFrame.
df_taxi_data = df_taxi_data.astype(di_df_col_type)

df_taxi_data.info()


In [None]:
# Truncate the helper timestamp down to the start of its hour; it is later
# used as the join key against the hourly weather rows.
ser_weather_hour = df_taxi_data["datetime_for_weather"].dt.floor("h")
df_taxi_data["datetime_for_weather"] = ser_weather_hour

df_taxi_data.sample(n=5)

#### Testing the join of trips and weather data

In [None]:
# Get hourly weather data for the same day from the Open-Meteo archive API.
url = "https://archive-api.open-meteo.com/v1/era5"

# NOTE(review): no "timezone" parameter is sent — confirm that the hours the
# API returns are in the same timezone as trip_start_timestamp, otherwise the
# hourly join below pairs trips with the wrong weather rows.
di_params = {
    "latitude": 41.85,
    "longitude": -87.65,
    "start_date": formatted_date,
    "end_date": formatted_date,
    "hourly": "temperature_2m,wind_speed_10m,rain,precipitation"
}

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, params=di_params, timeout=60)
# Fail here on an HTTP error; otherwise an error payload without the "hourly"
# key would only surface as a confusing KeyError further down.
response.raise_for_status()
js_weather_data = response.json()

# Keep only the selected hourly series and put them into a DataFrame.
js_hourly = js_weather_data["hourly"]
di_meteo_data_filt = {
    "datetime": js_hourly["time"],
    # NOTE(review): "tempretaure" is misspelled; it is kept unchanged because
    # renaming it would change the column name seen by any consumer of this
    # frame — fix together with those consumers.
    "tempretaure": js_hourly["temperature_2m"],
    "wind_speed": js_hourly["wind_speed_10m"],
    "rain": js_hourly["rain"],
    "precipitation": js_hourly["precipitation"]
}
df_meteo_data_filt = pd.DataFrame(di_meteo_data_filt)

# Convert the time strings to datetime so they can be matched against
# datetime_for_weather.
df_meteo_data_filt['datetime'] = pd.to_datetime(df_meteo_data_filt['datetime'])

df_meteo_data_filt.head(2)

In [None]:
# Attach the weather row of the matching hour to every trip (inner join:
# trips without a matching weather hour are not kept).
df_taxi_meteo_data = pd.merge(
    df_taxi_data,
    df_meteo_data_filt,
    how="inner",
    left_on="datetime_for_weather",
    right_on="datetime",
)

df_taxi_meteo_data.head(n=3)

#### Sanity checks

In [None]:
# Compare the memory footprint recorded at creation with the current one
# (after cleaning and dtype conversion); both are bytes / 1024.
i_df_taxi_data_now = int(df_taxi_data.memory_usage(deep=True).sum()/1024)
print ("df_taxi_data creation memory usage:", i_df_taxi_data, "Kbyte")
print ("df_taxi_data cleaned, optimised memory usage:", i_df_taxi_data_now, "Kbyte")

In [None]:
# Show the trip(s) with the latest end timestamp.
ser_trip_end = df_taxi_data['trip_end_timestamp']
df_taxi_data[ser_trip_end == ser_trip_end.max()]

In [None]:
# Show the trip(s) with the largest trip_seconds value.
ser_trip_seconds = df_taxi_data['trip_seconds']
df_taxi_data[ser_trip_seconds == ser_trip_seconds.max()]

In [None]:
# Show the trip(s) with the highest fare.
ser_fare = df_taxi_data['fare']
df_taxi_data[ser_fare == ser_fare.max()]

In [None]:
# The five trips with the largest fares, highest first.
df_taxi_data.nlargest(n=5, columns="fare")

In [None]:
# Examine the payment_type column.
# Series.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df_taxi_data['payment_type'].info()
df_taxi_data['payment_type'].unique()

In [None]:
# First test
# Create a sorted extract of the unique payment types.
df_payment_types = df_taxi_data['payment_type'].drop_duplicates()
# sort_values() returns a new Series; without assigning the result the extract
# stayed unsorted and kept the original, non-contiguous row index.
df_payment_types = df_payment_types.sort_values(ignore_index=True)

# Create the dim table: a running 1-based ID next to each payment type.
df_payment_types = pd.DataFrame(
    {
        'payment_type_id': range(1, len(df_payment_types)+1), # create range for ID
        'payment_type': df_payment_types

    }
)

df_payment_types

In [None]:
# Each entry pairs a source column with the name of the dim table built from it.
ls_dim_names = [
        ['payment_type', 'df_payment_types'],
        ['company', 'df_companys']
    ]

# Build one dim table per entry.
for s_col_name, s_df_name in ls_dim_names:
    # Sorted unique values of the source column, re-indexed from 0.
    ser_dim_values = (
        df_taxi_data[s_col_name]
        .drop_duplicates()
        .sort_values(ignore_index=True)
    )

    # Dim table: a running 1-based ID next to each unique value.
    df_dim_tmp = pd.DataFrame(
        {
            f'{s_col_name}_id': range(1, len(ser_dim_values) + 1),
            s_col_name: ser_dim_values,
        }
    )

    # Publish the table under its configured name as a notebook-level variable.
    globals()[s_df_name] = df_dim_tmp

    # Export the table; s_path_sep, s_base_path, s_dir_data and
    # df_export_to_csv are not defined in this notebook — presumably they come
    # from the %run includes at the top (TODO confirm).
    s_path_dim = s_path_sep.join([s_base_path, s_dir_data, s_df_name + '.csv'])
    df_export_to_csv(df_dim_tmp, s_path_dim, s_dir_data)
