# Práctica Calificada 2
## Predicción de tarifas de taxis
El objetivo de esta evaluación es construir un modelo de aprendizaje que sea capaz de predecir la tarifa que cobra un taxi de acuerdo a cierta información de entrada.


In [2]:
import pandas as pd
import numpy as np
print("Pandas = ", pd.__version__)
print("Numpy = ", np.__version__)

Pandas =  1.1.5
Numpy =  1.19.5


# Obteniendo del conjunto de datos

In [3]:
pd.read_csv("./train.csv",nrows=5).head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


Tenemos las siguientes columnas

*   key: cadena que identifica de manera única a cada registro.
*   fare_amount: número real indicando el costo del taxi. Esta es la variable a predecir.
*   pickup_datetime: timestamp indicando cuando el viaje a empezado.
*   pickup_longitude: número real indicando la ubicación en longitud en donde el viaje empezó.
*   pickup_latitude: número real indicando la ubicación en latitud en donde el viaje empezó.
*   dropoff_longitude: número real indicando la ubicación en longitud en donde el viaje terminó.
*   dropoff_latitude: número real indicando la ubicación en latitud en donde el viaje terminó.
*   passenger_count: número entero indicando el número de pasajeros en el servicio de taxi.

*La columna key no es un caracterísitica que nos interese.*

**Cargamos solo las columnas que nos interesan**

In [4]:
columns = ["fare_amount","pickup_datetime","pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude","passenger_count"]

In [5]:
df_train = pd.read_csv("./train.csv",nrows=10000000,usecols=columns)

**Uso total de la memoria al inicio**

In [6]:
memoria_inicial = df_train.memory_usage(deep=True).sum()
print(f"{memoria_inicial/1e9:0.2f}GB")

1.28GB


**Analizando el tipo de las variables**

In [7]:
df_train.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

Uso de memoria de cada columna en bytes.

In [8]:
df_train.memory_usage(deep=True)

Index                      128
fare_amount           80000000
pickup_datetime      800000000
pickup_longitude      80000000
pickup_latitude       80000000
dropoff_longitude     80000000
dropoff_latitude      80000000
passenger_count       80000000
dtype: int64

**Utilizando tipos de datos eficientes.**

In [9]:
df_train["passenger_count"] = pd.to_numeric(df_train["passenger_count"], downcast="unsigned")
df_train["fare_amount"] =  pd.to_numeric(df_train["fare_amount"], downcast="float")
df_train[["pickup_longitude","pickup_latitude"]] = df_train[["pickup_longitude","pickup_latitude"]].apply(pd.to_numeric, downcast="float")
df_train[["dropoff_longitude","dropoff_latitude"]] = df_train[["dropoff_longitude","dropoff_latitude"]].apply(pd.to_numeric, downcast="float")

In [10]:
df_train.dtypes

fare_amount          float32
pickup_datetime       object
pickup_longitude     float32
pickup_latitude      float32
dropoff_longitude    float32
dropoff_latitude     float32
passenger_count        uint8
dtype: object

Uso de memoria de cada columna en bytes luego de utilizar tipos de datos más eficientes.

In [11]:
df_train.memory_usage(deep=True)

Index                      128
fare_amount           40000000
pickup_datetime      800000000
pickup_longitude      40000000
pickup_latitude       40000000
dropoff_longitude     40000000
dropoff_latitude      40000000
passenger_count       10000000
dtype: int64

Uso de memoria total luego de utilizar tipos de datos más eficientes.

In [12]:
memoria_final = df_train.memory_usage(deep=True).sum()
print(f"{memoria_final/1e9:0.2f}GB")

1.01GB


**Reducción de Memoria**

In [13]:
reduccion = (memoria_inicial-memoria_final)/memoria_inicial*100
print(f"Reducimos el uso de memoria en {reduccion:0.2f}%")

Reducimos el uso de memoria en 21.09%


In [14]:
df_train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,10000000.0,10000000.0,10000000.0,9999931.0,9999931.0,10000000.0
mean,11.52297,-67.00719,38.53317,-67.0085,38.53296,1.684793
std,9.731675,13.48837,9.221583,13.36629,9.108986,1.323423
min,-107.75,-3439.245,-3492.264,-3426.601,-3488.08,0.0
25%,6.0,-73.99207,40.73491,-73.99139,40.73403,1.0
50%,8.5,-73.98181,40.75263,-73.98016,40.75316,1.0
75%,12.5,-73.96709,40.76712,-73.96368,40.7681,2.0
max,1273.31,3457.626,3344.459,3457.622,3351.403,208.0


In [15]:
df_train.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 7 columns):
 #   Column             Non-Null Count     Dtype  
---  ------             --------------     -----  
 0   fare_amount        10000000 non-null  float32
 1   pickup_datetime    10000000 non-null  object 
 2   pickup_longitude   10000000 non-null  float32
 3   pickup_latitude    10000000 non-null  float32
 4   dropoff_longitude  9999931 non-null   float32
 5   dropoff_latitude   9999931 non-null   float32
 6   passenger_count    10000000 non-null  uint8  
dtypes: float32(5), object(1), uint8(1)
memory usage: 276.6+ MB


De lo anterior se puede observar que tenemos valores nulos, entonces verificamos cuando valores nulos hay por cada columna:

In [16]:
print(df_train.isnull().sum())

fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    69
dropoff_latitude     69
passenger_count       0
dtype: int64


In [17]:
# Eliminando
df_train = df_train.dropna(how='any', axis=0)

# Analizando Fare_amount

In [18]:
df_train[['fare_amount']].describe()

Unnamed: 0,fare_amount
count,9999931.0
mean,11.5229
std,9.731592
min,-107.75
25%,6.0
50%,8.5
75%,12.5
max,1273.31


Obervamos que el el minimo del monto de tarifa es negativo, veamos cuantos montos de tarifa menores o iguales a cero, tenemos: 

In [19]:
len(df_train[df_train['fare_amount'] <= 0].index)

689

In [20]:
df_train[df_train['fare_amount'] <= 0]

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2039,-2.9,2010-03-09 23:37:10 UTC,-73.789452,40.643497,-73.788666,40.641953,1
2486,-2.5,2015-03-22 05:14:27 UTC,-74.000031,40.720631,-73.999809,40.720539,1
10002,0.0,2010-02-15 14:26:01 UTC,-73.987114,40.738808,-74.005913,40.713959,1
13032,-3.0,2013-08-30 08:57:10 UTC,-73.995064,40.740753,-73.995888,40.741356,4
27891,0.0,2015-05-15 21:40:28 UTC,-74.077927,40.805714,-74.077919,40.805721,1
...,...,...,...,...,...,...,...
9891251,-5.7,2010-03-26 22:26:10 UTC,-73.989861,40.738998,-73.995941,40.744331,1
9895476,-2.5,2015-05-10 22:07:39 UTC,-73.789360,40.646481,-73.791451,40.645355,1
9914973,0.0,2010-03-10 15:45:34 UTC,-73.977402,40.763756,-74.185760,40.693432,1
9951612,-2.5,2010-03-24 22:22:10 UTC,-74.010216,40.720047,-74.010185,40.719818,5


Solo queremos los datos donde el monto de la tarifa sea mayor que cero:


In [21]:
df_train = df_train[df_train['fare_amount'] > 0]

In [22]:
df_train[['fare_amount']].describe()

Unnamed: 0,fare_amount
count,9999242.0
mean,11.52409
std,9.730171
min,0.01
25%,6.0
50%,8.5
75%,12.5
max,1273.31


# Analizando la longitud y latidud

El rango de la latidud es de -90 hasta 90 grados, mientras que el grado de la longitud es de -180 hasta los 180 grados.



In [23]:
df_train = df_train[(df_train['pickup_longitude'] >= -180) & (df_train['pickup_longitude'] <= 180)]

In [24]:
df_train = df_train[(df_train['pickup_latitude'] >= -90) & (df_train['pickup_latitude'] <= 90)]

In [25]:
df_train = df_train[(df_train['dropoff_longitude'] >= -180) & (df_train['dropoff_longitude'] <= 180)]

In [26]:
df_train = df_train[(df_train['dropoff_latitude'] >= -90) & (df_train['dropoff_latitude'] <= 90)]

In [27]:
def distancia(df):
  # Radio medio de la tierra en Km
  R = 6371.0
  # Conversion a radianes
  lt1 = np.radians(df.pickup_latitude)
  lg1 = np.radians(df.pickup_longitude)
  lt2 = np.radians(df.dropoff_latitude)
  lg2 = np.radians(df.dropoff_longitude)
  # Defirencia entre latitudes y longitudes
  dlt = lt2 - lt1
  dlg = lg2 - lg1
  # Haversine
  hav = np.sin(dlt / 2)**2 + np.cos(lt1) * np.cos(lt2) * np.sin(dlg / 2)**2
  c = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1-hav))
  d = R * c
  return d

In [28]:
df_train['distancia'] = distancia(df_train)

In [29]:
df_train = df_train.drop(columns=['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'])

Limpiando data para distancia igual a cero kilometros o demasiado extensas

In [30]:
df_train[['distancia']].describe()

Unnamed: 0,distancia
count,9998766.0
mean,19.6993
std,369.5763
min,0.0
25%,1.212574
50%,2.115934
75%,3.87373
max,12851.76


In [31]:
df_train = df_train[(0 < df_train['distancia']) & (df_train['distancia'] < 70)]

# Analizando pickup_datetime

In [32]:
df_train['pickup_datetime']

0          2009-06-15 17:26:21 UTC
1          2010-01-05 16:52:16 UTC
2          2011-08-18 00:35:00 UTC
3          2012-04-21 04:30:42 UTC
4          2010-03-09 07:51:00 UTC
                    ...           
9999995    2012-08-12 01:18:00 UTC
9999996    2013-08-07 10:28:00 UTC
9999997    2013-10-29 08:29:00 UTC
9999998    2012-04-07 16:41:33 UTC
9999999    2010-03-30 19:27:00 UTC
Name: pickup_datetime, Length: 9687740, dtype: object

In [33]:
df_train['pickup_datetime'] = df_train['pickup_datetime'].str.replace(" UTC", "")

In [34]:
df_train['pickup_datetime'] = pd.to_datetime(df_train['pickup_datetime'])

In [35]:
df_train['año'] = df_train.pickup_datetime.dt.year
df_train['mes'] = df_train.pickup_datetime.dt.month
df_train['dia'] = df_train.pickup_datetime.dt.day
df_train['hora'] = df_train.pickup_datetime.dt.hour

In [36]:
df_train = df_train.drop(columns=['pickup_datetime'])

In [37]:
df_train.head()

Unnamed: 0,fare_amount,passenger_count,distancia,año,mes,dia,hora
0,4.5,1,1.031069,2009,6,15,17
1,16.9,1,8.449763,2010,1,5,16
2,5.7,2,1.389644,2011,8,18,0
3,7.7,1,2.799485,2012,4,21,4
4,5.3,1,1.998886,2010,3,9,7


# Analizando Cantidad de pasajeros

In [38]:
df_train[['passenger_count']].describe()

Unnamed: 0,passenger_count
count,9687740.0
mean,1.685256
std,1.313988
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,208.0


Eliminamos la data que contenga cantidad de pasajeros mayor o igual a 6

In [39]:
df_train = df_train[(0 < df_train['passenger_count']) & (df_train['passenger_count'] < 6)]

# Entrenamieto

In [40]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

predictors = ['passenger_count', 'distancia', 'año', 'mes', 'dia', 'hora']
salida = 'fare_amount'

X = df_train[predictors]
y = df_train[salida]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=22)

rf = RandomForestRegressor(n_estimators=500,
                            oob_score = True,
                            random_state=1,
                            max_depth=8)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=8, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=True,
                      random_state=1, verbose=0, warm_start=False)

In [41]:
from joblib import dump, load
dump(rf, 'Modelo.joblib')

['Modelo.joblib']

In [42]:
clf = load('Modelo.joblib')
from sklearn.metrics import mean_squared_error

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print("Conjunto de entrenamiento")
print(mean_squared_error(y_train, y_pred_train))
print("Conjunto de pruebsa")
print(mean_squared_error(y_test, y_pred_test))

Conjunto de entrenamiento
17.678897608580552
Conjunto de pruebsa
18.212402904496876
