In [63]:
from __future__ import division
import os
from datetime import datetime

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta as rd
from sqlalchemy import create_engine
from analytics_tools.analytics_tools3 import freq_discrete

In [2]:
# Base SQLAlchemy URL (driver, credentials, host, port) of the MySQL server.
# It is read from the TAXI_DB_URL environment variable so that real
# credentials do not have to live in the notebook; the fallback is the local
# development value this notebook was written against.
db_url = os.environ.get('TAXI_DB_URL', 'mysql://root:123@localhost:3306')

# Source (transactional) schema and target (analytical) schema.
conn_oltp = create_engine(db_url + '/taxi_oltp').connect()
conn_olap = create_engine(db_url + '/taxi_olap').connect()

## TD_company 

In [21]:
# One row per trip together with the company of the taxi that made it:
# trips are matched to taxis on taxi_id and taxis to companies on company.
# Both joins are INNER, so trips without a matching taxi/company are left out.
query = """
SELECT 
    C.company, C.comp_name, A.id_trip
FROM
    tbl_trip A
        INNER JOIN
    tbl_taxi B ON A.taxi_id = B.taxi_id
        INNER JOIN
    tbl_company C ON B.company = C.company
"""

In [22]:
%%time
df = pd.read_sql(sql=query,con=conn_oltp)

CPU times: user 438 ms, sys: 22.8 ms, total: 461 ms
Wall time: 871 ms


In [23]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,company,comp_name,id_trip
0,107.0,Taxi Affiliation Services,1302
1,107.0,Taxi Affiliation Services,3737
2,107.0,Taxi Affiliation Services,5047
3,107.0,Taxi Affiliation Services,7321
4,107.0,Taxi Affiliation Services,9363


In [24]:
# Number of trips per (company, comp_name), busiest company first.
aux = (
    df[['company', 'comp_name', 'id_trip']]
    .groupby(['company', 'comp_name'])
    .count()
    .sort_values(by='id_trip', ascending=False)
)

In [25]:
# Move the group keys back into ordinary columns; the new positional index
# then follows the sort order of the frame.
aux = aux.reset_index()
# Rows at positions 0-9 keep their company name, every other row is labelled 'otro'.
aux['comp_name_top10'] = np.where(aux.index < 10, aux['comp_name'], 'otro')

In [27]:
# Write the company dimension (company id and its top-10 label) to the OLAP
# connection, replacing the td_company table if it already exists.
aux[['company', 'comp_name_top10']].to_sql(
    name='td_company',
    con=conn_olap,
    if_exists='replace',
    index=False,
    chunksize=1000,
)

## TD_mop 

In [33]:
# One row per trip: its payment type plus a constant 1 in column n, which
# gives the later groupby a column to count and sort by.
query = """
SELECT 
    payment_type,
    1 as n
FROM
    tbl_trip
"""

In [34]:
%%time
df = pd.read_sql(sql=query,con=conn_oltp)

CPU times: user 392 ms, sys: 1.89 ms, total: 394 ms
Wall time: 556 ms


In [35]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,payment_type,n
0,Credit Card,1
1,Credit Card,1
2,Credit Card,1
3,Credit Card,1
4,Cash,1


In [39]:
# Number of trips per payment type, most frequent first.
aux = (
    df.groupby('payment_type')
    .count()
    .sort_values(by='n', ascending=False)
)

In [42]:
# Move the payment_type group key back into a column. The guard makes the cell
# safe to re-run: a second reset would otherwise push the positional index
# into a stray 'index' column.
if 'payment_type' not in aux.columns:
    aux = aux.reset_index()

# Surrogate key of the dimension: position in the frequency-sorted frame
# (0 = most used payment type). It is created explicitly because resetting a
# named index yields a 'payment_type' column and no 'index' column, so the
# td_mop export, which selects 'id_mop', would otherwise fail on a fresh
# Restart & Run All.
aux['id_mop'] = np.arange(len(aux))

# The two most used payment types keep their name, the rest become 'otro'.
aux['mop'] = np.where(aux['id_mop'] < 2, aux['payment_type'], 'otro')

In [44]:
# Expose a column named 'index', when there is one, under the name 'id_mop';
# rename leaves the frame unchanged for labels that are not present.
aux = aux.rename(columns={'index': 'id_mop'})

In [47]:
# Write the payment-method dimension to the OLAP connection, replacing the
# td_mop table if it already exists.
aux[['id_mop', 'payment_type', 'mop']].to_sql(
    name='td_mop',
    con=conn_olap,
    if_exists='replace',
    index=False,
    chunksize=1000,
)

## TD_time

In [59]:
# Start and end timestamp of every row in tbl_trip2.
query = """
SELECT 
    trip_start_timestamp,
    trip_end_timestamp
FROM
    tbl_trip2
"""

In [60]:
%%time
df = pd.read_sql(sql=query,con=conn_oltp)

CPU times: user 2.7 s, sys: 1.12 s, total: 3.83 s
Wall time: 9.51 s


In [62]:
# Earliest trip start found in the loaded data.
pd.to_datetime(df['trip_start_timestamp']).min()

Timestamp('2016-01-01 00:00:00')

In [103]:
# Bounds of the hourly calendar generated in the next cell:
# fhi is the first moment to cover, fhf the last.
fhi = datetime(year=2016, month=1, day=1, hour=0, minute=0)
fhf = datetime(year=2020, month=12, day=31, hour=23, minute=30)

In [104]:
# Build one timestamp per hour, starting at fhi and stopping at the last step
# that is not after fhf.
# - Each value is appended before stepping, so fhi itself (the first hour) is
#   part of the list.
# - The loop condition is checked on every hour, so nothing past fhf is added.
# - A separate cursor is advanced instead of fhi, so fhi keeps its value and
#   the cell gives the same result when it is run again.
lst_fh = []
cursor = fhi
while cursor <= fhf:
    lst_fh.append(cursor)
    cursor += rd(hours=1)

In [105]:
# Single-column frame holding the generated timestamps; note that this
# rebinds df, replacing the frame loaded from tbl_trip2.
df = pd.DataFrame(data=lst_fh, columns=['id_fh'])

In [106]:
# Quick look at the weekday numbering for today's date:
# weekday() counts Monday as 0 through Sunday as 6.
datetime.today().weekday()

2

In [107]:
# Derive the calendar attributes from each timestamp in id_fh.
stamps = df['id_fh']
for part in ('year', 'month', 'day', 'hour'):
    # part is bound as a default argument so each lambda reads its own attribute.
    df[part] = stamps.map(lambda ts, part=part: getattr(ts, part))
# weekday(): Monday is 0 through Sunday is 6.
df['weekday'] = stamps.map(lambda ts: ts.weekday())
# Last step: replace the timestamp itself with its YYYYMMDDHH string form.
df['id_fh'] = stamps.map(lambda ts: ts.strftime('%Y%m%d%H'))


In [109]:
# Write the time dimension to the OLAP connection, replacing the td_time
# table if it already exists.
df.to_sql(
    name='td_time',
    con=conn_olap,
    if_exists='replace',
    index=False,
    chunksize=1000,
)

## TH_trip 

In [114]:
# Trip-level input for the fact table: start timestamp, the four amount
# columns (aliased with an h_ prefix, which later cells use to select them),
# payment type and the company reached through the trip's taxi.
# Both joins are INNER, so trips without a matching taxi/company are left out.
query = """
    SELECT 
        A.trip_start_timestamp,
        A.fare AS h_fare,
        A.tips AS h_tips,
        A.tolls AS h_tolls,
        A.extras AS h_extras,
        A.payment_type,
        C.company
    FROM
        tbl_trip2 A
            INNER JOIN
        tbl_taxi B ON A.taxi_id = B.taxi_id
            INNER JOIN
        tbl_company C ON B.company = C.company
"""

In [115]:
# Run the query against the OLTP connection and load the result as a DataFrame.
df = pd.read_sql(query, conn_oltp)

In [117]:
# Hour-level key of each trip: the start timestamp formatted as YYYYMMDDHH.
trip_start = pd.to_datetime(df['trip_start_timestamp'])
df['id_fh'] = trip_start.map(lambda ts: ts.strftime('%Y%m%d%H'))

In [120]:
# Per-trip total: row-wise sum of the h_-prefixed amount columns.
# The derived columns h_total and h_n carry the same prefix, so they are
# excluded by name; without that, running this cell a second time (or after
# h_n has been added) would fold them into the total.
amount_cols = [c for c in df.columns if c[:2] == 'h_' and c not in ('h_total', 'h_n')]
df['h_total'] = df[amount_cols].sum(axis=1)

In [122]:
# Constant 1 on every row, so that summing h_n counts the rows in a group.
df = df.assign(h_n=1)

In [123]:
# Preview the first five rows, including the derived columns.
df.head(5)

Unnamed: 0,trip_start_timestamp,h_fare,h_tips,h_tolls,h_extras,payment_type,company,id_fh,h_total,h_n
0,2016-1-13 06:15:00,4.5,0.0,0.0,0.0,Cash,107.0,2016011306,4.5,1
1,2016-1-29 06:45:00,9.25,0.0,0.0,1.0,Cash,107.0,2016012906,10.25,1
2,2016-1-12 20:30:00,8.25,0.0,0.0,0.0,Cash,107.0,2016011220,8.25,1
3,2016-1-18 20:30:00,6.0,0.0,0.0,0.0,Cash,107.0,2016011820,6.0,1
4,2016-1-20 16:30:00,11.75,0.0,0.0,0.0,Cash,107.0,2016012016,11.75,1


In [125]:
# Sum every h_-prefixed column per (hour key, payment type, company).
keys = ['id_fh', 'payment_type', 'company']
measures = [col for col in df.columns if col.startswith('h_')]
aux = df[keys + measures].groupby(keys).sum()

In [127]:
# Turn the three group keys back into ordinary columns so they are exported.
aux = aux.reset_index()

In [129]:
# Write the aggregated trip facts to the OLAP connection, replacing the
# th_trip table if it already exists.
aux.to_sql(
    name='th_trip',
    con=conn_olap,
    if_exists='replace',
    index=False,
    chunksize=1000,
)