## Dependencias 

In [9]:
import json
from urllib.parse import quote

import numpy as np
import pandas as pd

from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR,CHAR,INTEGER,BIGINT,BOOLEAN,DATE,DATETIME,FLOAT

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

## Lectura de datos 

In [2]:
# Absolute location of the Taxi Trips CSV read by the loading cell below.
# NOTE(review): machine-specific path; must be adapted when run elsewhere.
ruta = (
    '/media/jose/090f6b94-de30-4aaf-9f8a-4e18b120d7f6/bd/'
    '02.  Para Ingeniería/chicago/Taxi_Trips.csv'
)

In [10]:
# Read at most the first 20 million rows of the CSV into memory,
# then display the (rows, columns) shape as a quick sanity check.
data = pd.read_csv(filepath_or_buffer=ruta, nrows=20_000_000)
data.shape

(20000000, 23)

In [11]:
# Preview the first two raw rows to inspect the available columns.
data.head(n=2)

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,Fare,Tips,Tolls,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location
0,1244583476e5bcbb73f25159e73412afb0b450a0,381b18e55254dd5635fefcf1b2a7956671bd47be71a38e...,12/29/2015 09:30:00 AM,12/29/2015 09:45:00 AM,660.0,2.4,,,8.0,7.0,8.85,2.0,0.0,0.0,10.85,Credit Card,,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.922686,-87.649489,POINT (-87.6494887289 41.9226862843)
1,e91fb843e01ba4e9d69589ce3000b5093d1d3b7f,0d9882e354e702dc4f41ab186abdbda41a960062222672...,01/02/2016 03:00:00 PM,01/02/2016 03:30:00 PM,2400.0,17.42,17031980000.0,17031080000.0,76.0,8.0,44.5,0.0,0.0,2.0,46.5,Cash,,41.979071,-87.90304,POINT (-87.9030396611 41.9790708201),41.897984,-87.641492,POINT (-87.6414915334 41.897983898)


## Conexión a BD

In [23]:
# Load the MySQL credentials from a JSON file kept outside the notebook.
# The connection cell reads the keys 'u', 'p', 'h' and 'd' from this dict.
# A context manager guarantees the file handle is closed, and the explicit
# encoding keeps non-ASCII credentials independent of the platform default.
with open('../../creds/mysql.json', 'r', encoding='utf-8') as creds_file:
    creds = json.load(creds_file)

In [25]:
# Open the MySQL connection used by every to_sql call in this notebook.
# The user name and password are percent-encoded before being embedded in the
# URL: a raw '@', ':' or '/' inside a credential would otherwise be parsed as a
# URL delimiter and the engine would be built with the wrong user/host.
cnx = create_engine(
    f"mysql+pymysql://{quote(str(creds['u']), safe='')}:{quote(str(creds['p']), safe='')}"
    f"@{creds['h']}/{creds['d']}"
).connect()

## Entidades

### Taxi

In [19]:
# Taxi entity: one row per distinct, non-null 'Taxi ID', in order of first appearance.
entTaxi = (
    data[['Taxi ID']]
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [20]:
# Surrogate integer key: 1-based position of each taxi in the de-duplicated list.
entTaxi = entTaxi.assign(id_taxi=entTaxi.index + 1)

In [21]:
# Rename the source column by name and put the surrogate key first.
# A name-based rename (instead of assigning labels positionally) keeps this
# cell idempotent: re-running it after the reorder no longer swaps the
# 'id_taxi' and 'hex_id' labels.
entTaxi = entTaxi.rename(columns={'Taxi ID': 'hex_id'})[['id_taxi', 'hex_id']]
entTaxi.head()

Unnamed: 0,id_taxi,hex_id
0,1,381b18e55254dd5635fefcf1b2a7956671bd47be71a38e...
1,2,0d9882e354e702dc4f41ab186abdbda41a960062222672...
2,3,0efcdf71809e8156defa99d541c76837db2ab944a62e41...
3,4,e1aff5afebc38449dafb6f180ceb9f5dc86cf771aeffec...
4,5,b7ac477e614f1f222f42c698e3f2841fb020060e629172...


In [22]:
# Longest taxi hash, used to size the CHAR column of tbl_taxi.
hex_lengths = entTaxi['hex_id'].apply(len)
hex_lengths.max()

128

In [27]:
# SQL types in column order: id_taxi -> INTEGER, hex_id -> CHAR(128).
dtypes = [INTEGER, CHAR(128)]
# (Re)create tbl_taxi and load the taxi entity in batches of 5000 rows.
entTaxi.to_sql(
    name='tbl_taxi',
    con=cnx,
    if_exists='replace',
    index=False,
    chunksize=5000,
    dtype={column: sql_type for column, sql_type in zip(entTaxi.columns, dtypes)},
)

5937

### Compañía

In [28]:
# Company entity: one row per distinct, non-null company name.
entCompañia = (
    data[['Company']]
    .dropna(subset=['Company'])
    .drop_duplicates(subset=['Company'], keep='first')
    .reset_index(drop=True)
)
entCompañia.shape

(47, 1)

In [30]:
# Add a 1-based surrogate key as the first column and rename the source column.
# The guard makes the cell safe to re-run: DataFrame.insert raises if the
# column already exists, and rename is a no-op once 'Company' is gone.
if 'id_company' not in entCompañia.columns:
    entCompañia.insert(0, 'id_company', entCompañia.index + 1)
entCompañia = entCompañia.rename(columns={'Company': 'name'})
entCompañia.head()

Unnamed: 0,id_company,name
0,1,303 Taxi
1,2,Globe Taxi
2,3,Flash Cab
3,4,Taxi Affiliation Services
4,5,Taxicab Insurance Agency Llc


In [32]:
# Longest company name, used to size the VARCHAR column of tbl_company.
name_lengths = entCompañia['name'].apply(len)
name_lengths.max()

36

In [33]:
# SQL types in column order: id_company -> INTEGER, name -> VARCHAR(40).
dtypes = [INTEGER, VARCHAR(40)]
# (Re)create tbl_company and load the company entity in batches of 5000 rows.
entCompañia.to_sql(
    name='tbl_company',
    con=cnx,
    if_exists='replace',
    index=False,
    chunksize=5000,
    dtype={column: sql_type for column, sql_type in zip(entCompañia.columns, dtypes)},
)

47

### Medio de Pago

In [35]:
# Payment-method entity: distinct, non-null payment types with the source
# column renamed to 'type'.
entMP = (
    data[['Payment Type']]
    .dropna()
    .drop_duplicates()
    .rename(columns={'Payment Type': 'type'})
    .reset_index(drop=True)
)
# 1-based surrogate key placed as the first column.
entMP.insert(0, 'id_pmnt', entMP.index + 1)

In [36]:
# Preview the first five payment-method rows.
entMP.head(n=5)

Unnamed: 0,id_pmnt,type
0,1,Credit Card
1,2,Cash
2,3,Pcard
3,4,Prcard
4,5,Mobile


In [37]:
# Longest payment-type label, used to size the VARCHAR column of tbl_pmnt.
type_lengths = entMP['type'].apply(len)
type_lengths.max()

11

In [38]:
# SQL types in column order: id_pmnt -> INTEGER, type -> VARCHAR(15).
dtypes = [INTEGER, VARCHAR(15)]
# (Re)create tbl_pmnt and load the payment-method entity in batches of 5000 rows.
entMP.to_sql(
    name='tbl_pmnt',
    con=cnx,
    if_exists='replace',
    index=False,
    chunksize=5000,
    dtype={column: sql_type for column, sql_type in zip(entMP.columns, dtypes)},
)

10

### Viaje

In [52]:
# Columns of the raw data that describe a trip: identifiers, timestamps,
# measures, the two lookup keys (company / payment type) and centroid coordinates.
trip_columns = [
    'Trip ID', 'Taxi ID',
    'Trip Start Timestamp', 'Trip End Timestamp',
    'Trip Seconds', 'Trip Miles',
    'Fare', 'Tips', 'Tolls', 'Extras',
    'Company', 'Payment Type',
    'Pickup Centroid Latitude', 'Pickup Centroid Longitude',
    'Dropoff Centroid Latitude', 'Dropoff Centroid Longitude',
]
# Work on an independent copy so later transformations never touch `data`.
entViaje = data.loc[:, trip_columns].copy()
entViaje.shape

(20000000, 16)

In [53]:
# Parse both trip timestamps from their 12-hour text form
# (e.g. '12/29/2015 09:30:00 AM'); unparseable values become NaT.
timestamp_format = '%m/%d/%Y %I:%M:%S %p'
for ts_column in ('Trip Start Timestamp', 'Trip End Timestamp'):
    entViaje[ts_column] = pd.to_datetime(
        entViaje[ts_column],
        format=timestamp_format,
        errors='coerce',
    )

In [54]:
# Replace the taxi hash with its integer surrogate key (id_taxi).
# Left join keeps every trip; trips with no matching taxi get a missing id_taxi.
# validate='many_to_one' makes pandas raise if entTaxi ever holds a duplicate
# hex_id, instead of silently duplicating trip rows.
entViaje = (
    entViaje
    .merge(entTaxi, left_on=['Taxi ID'], right_on=['hex_id'], how='left', validate='many_to_one')
    .drop(['Taxi ID', 'hex_id'], axis=1)
)

In [55]:
# Replace the company name with its integer surrogate key (id_company).
# Left join keeps every trip; trips with no matching company get a missing id_company.
# validate='many_to_one' makes pandas raise if entCompañia ever holds a
# duplicate name, instead of silently duplicating trip rows.
entViaje = (
    entViaje
    .merge(entCompañia, left_on=['Company'], right_on=['name'], how='left', validate='many_to_one')
    .drop(['Company', 'name'], axis=1)
)

In [57]:
# Replace the payment-type label with its integer surrogate key (id_pmnt).
# Left join keeps every trip; trips with no matching type get a missing id_pmnt.
# validate='many_to_one' makes pandas raise if entMP ever holds a duplicate
# type, instead of silently duplicating trip rows.
entViaje = (
    entViaje
    .merge(entMP, left_on=['Payment Type'], right_on=['type'], how='left', validate='many_to_one')
    .drop(['Payment Type', 'type'], axis=1)
)

In [59]:
# Map each source column to its database name BY NAME rather than by position,
# so an upstream change in column order cannot silently mislabel the data.
# Columns absent from the mapping (the id_* keys, or names already converted
# on a re-run) are kept unchanged.
trip_column_names = {
    'Trip ID': 'id_trip',
    'Trip Start Timestamp': 'ts_start',
    'Trip End Timestamp': 'ts_end',
    'Trip Seconds': 'duration',
    'Trip Miles': 'distance',
    'Fare': 'fare',
    'Tips': 'tips',
    'Tolls': 'tolls',
    'Extras': 'extras',
    'Pickup Centroid Latitude': 'pu_lat',
    'Pickup Centroid Longitude': 'pu_lon',
    'Dropoff Centroid Latitude': 'do_lat',
    'Dropoff Centroid Longitude': 'do_lon',
}
expected_trip_columns = ['id_trip','ts_start','ts_end','duration','distance','fare','tips','tolls','extras',
                         'pu_lat','pu_lon','do_lat','do_lon','id_taxi','id_company','id_pmnt']
renamed_trip_columns = [trip_column_names.get(col, col) for col in entViaje.columns]
# The SQL types of the load cell are assigned positionally, so the final
# layout must match exactly; fail loudly instead of writing a wrong schema.
if renamed_trip_columns != expected_trip_columns:
    raise ValueError(f"Unexpected entViaje columns: {renamed_trip_columns}")
entViaje.columns = renamed_trip_columns

In [74]:
# Trips with no match in a lookup table have a missing key after the left
# joins; replace it with the sentinel 999999 so the column can be cast to int.
# NOTE(review): the lookup tables built in this notebook have no row with key
# 999999 — confirm that consumers of tbl_viaje expect this.
foreign_keys = ['id_taxi', 'id_company', 'id_pmnt']
for key_column in foreign_keys:
    print(key_column)
    filled = entViaje[key_column].fillna(999999)
    entViaje[key_column] = filled.astype(int)

id_taxi
id_company
id_pmnt


In [77]:
# Preview the first two rows of the finished trip entity.
entViaje.head(n=2)

Unnamed: 0,id_trip,ts_start,ts_end,duration,distance,fare,tips,tolls,extras,pu_lat,pu_lon,do_lat,do_lon,id_taxi,id_company,id_pmnt
0,1244583476e5bcbb73f25159e73412afb0b450a0,2015-12-29 09:30:00,2015-12-29 09:45:00,660.0,2.4,8.85,2.0,0.0,0.0,41.899602,-87.633308,41.922686,-87.649489,1,999999,1
1,e91fb843e01ba4e9d69589ce3000b5093d1d3b7f,2016-01-02 15:00:00,2016-01-02 15:30:00,2400.0,17.42,44.5,0.0,0.0,2.0,41.979071,-87.90304,41.897984,-87.641492,2,999999,2


In [78]:
# Longest trip identifier, used to size the CHAR column of tbl_viaje.
trip_id_lengths = entViaje['id_trip'].apply(len)
trip_id_lengths.max()

40

In [80]:
# SQL types, listed in the same order as entViaje's columns:
#   id_trip            -> CHAR(40)
#   ts_start, ts_end   -> DATETIME
#   10 measure columns -> FLOAT(53)
#   3 id_* key columns -> INTEGER
# FLOAT(53) instead of a bare FLOAT: MySQL stores a plain FLOAT in single
# precision, which truncates the centroid coordinates and monetary amounts;
# a precision of 25-53 makes MySQL create a double-precision column.
dtypes = [CHAR(40)] + [DATETIME]*2 + [FLOAT(53)]*10 + [INTEGER]*3
# zip() truncates silently on a length mismatch, which would leave some
# columns with inferred types; fail loudly instead.
if len(dtypes) != len(entViaje.columns):
    raise ValueError(
        f"Expected {len(dtypes)} columns in entViaje, found {len(entViaje.columns)}"
    )
# (Re)create tbl_viaje and load the trips in batches of 200000 rows.
entViaje.to_sql(name='tbl_viaje',
                index=False,
                con=cnx,
                if_exists='replace',
                chunksize=200000,
                dtype=dict(zip(entViaje.columns,dtypes)))

20000000