In [None]:
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import pandas as pd
import mlflow

import pickle
import os

# Environment variables for Hadoop, Arrow's libhdfs and Java.
# The paths are absolute and specific to the machine this notebook was written on.
os.environ.update({
    'HADOOP_HOME': '/home/hdoop/hadoop',
    'ARROW_LIBHDFS_DIR': '/home/hdoop/hadoop/lib/native',
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',
})

# Every MLflow call in this notebook goes to this tracking server and experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("nyc-taxi-experiment")

In [None]:
## Get runs
def getRuns():
    """Return the MLflow runs that have a logged model.

    Calls ``mlflow.search_runs()`` and keeps only the rows whose
    ``tags.mlflow.log-model.history`` value is not null.

    Returns:
        pandas.DataFrame: the matching runs. The frame is empty (same columns
        as the search result) when no run carries the tag at all.
    """
    history_tag = "tags.mlflow.log-model.history"
    all_runs = mlflow.search_runs()

    # The tag column is missing from the result when none of the returned runs
    # has logged a model (e.g. a fresh experiment); indexing it would raise
    # KeyError, so report "no runs" instead.
    if history_tag not in all_runs.columns:
        return all_runs.iloc[0:0]

    return all_runs[all_runs[history_tag].notna()]

In [None]:
## Save Model
def saveBestModel(runs):
    """Download the model artifacts of the lowest-RMSE run into ``./models/``.

    Args:
        runs: DataFrame of MLflow runs with ``run_id`` and ``metrics.rmse``
            columns (the shape ``getRuns()`` returns).

    Raises:
        ValueError: if no row of ``runs`` has an RMSE recorded.
    """
    scored = runs.dropna(subset=["metrics.rmse"])
    if scored.empty:
        raise ValueError("no run with a recorded 'metrics.rmse' to choose from")

    # RMSE is an error, so the best run is the one with the smallest value.
    # .iloc[0] takes the first row *after* sorting; a plain [0] would look up
    # the index label 0 and ignore the sort order entirely.
    best_run_id = scored.sort_values("metrics.rmse", ascending=True)["run_id"].iloc[0]

    # Make sure the download target exists before handing it to MLflow.
    destination = "./models/"
    os.makedirs(destination, exist_ok=True)

    client = mlflow.tracking.MlflowClient()
    client.download_artifacts(best_run_id, path="models_mlflow", dst_path=destination)

In [None]:
## Load Model
def loadModel(runs):
    """Load the XGBoost model logged by the lowest-RMSE run.

    Args:
        runs: DataFrame of MLflow runs with ``run_id`` and ``metrics.rmse``
            columns (the shape ``getRuns()`` returns).

    Returns:
        Whatever ``mlflow.xgboost.load_model`` returns for the URI
        ``runs:/<run_id>/models_mlflow`` of the selected run.

    Raises:
        ValueError: if no row of ``runs`` has an RMSE recorded.
    """
    scored = runs.dropna(subset=["metrics.rmse"])
    if scored.empty:
        raise ValueError("no run with a recorded 'metrics.rmse' to choose from")

    # RMSE is an error, so the best run is the one with the smallest value.
    # .iloc[0] takes the first row *after* sorting; a plain [0] would look up
    # the index label 0 and ignore the sort order entirely.
    best_run_id = scored.sort_values("metrics.rmse", ascending=True)["run_id"].iloc[0]

    logged_model = f"runs:/{best_run_id}/models_mlflow"
    return mlflow.xgboost.load_model(logged_model)

In [None]:
def prepareData(df_test, preprocessor_path='./models/preprocessor.pkl'):
    """Turn a frame of trips into model inputs without modifying the frame.

    Trip duration is the drop-off time minus the pickup time, in minutes.
    Only trips lasting between 1 and 60 minutes (inclusive) are kept.

    Args:
        df_test: DataFrame with ``lpep_pickup_datetime``,
            ``lpep_dropoff_datetime``, ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns. It is left untouched.
        preprocessor_path: pickle file holding the object whose ``transform``
            encodes the feature records.

    Returns:
        tuple: ``(xgb.DMatrix of the encoded features, array of durations in
        minutes, list of feature dicts with 'PU_DO' and 'trip_distance')``.
    """
    print(len(df_test))

    pickup = pd.to_datetime(df_test['lpep_pickup_datetime'])
    dropoff = pd.to_datetime(df_test['lpep_dropoff_datetime'])
    duration = (dropoff - pickup).dt.total_seconds() / 60

    # Work on an explicit copy of just the needed columns: the caller's frame
    # is not mutated and the assignments below never hit a filtered view.
    in_range = duration.between(1, 60)
    trips = df_test.loc[in_range, ['PULocationID', 'DOLocationID', 'trip_distance']].copy()
    trips['duration'] = duration[in_range]

    # A single categorical feature describing the pickup/drop-off pair.
    trips['PU_DO'] = (
        trips['PULocationID'].astype(str) + '_' + trips['DOLocationID'].astype(str)
    )

    categorical = ['PU_DO']
    numerical = ['trip_distance']

    # NOTE: pickle.load can execute arbitrary code; only point this at a
    # preprocessor file you produced yourself.
    with open(preprocessor_path, 'rb') as f:
        dv = pickle.load(f)

    test_dicts = trips[categorical + numerical].to_dict(orient='records')
    X_test = dv.transform(test_dicts)
    y_test = trips['duration'].values

    return xgb.DMatrix(X_test), y_test, test_dicts

# Read the trip records used as the evaluation set.
df = pd.read_parquet("./data/green_tripdata_2022-02.parquet")

# This must call the DataFrame version of prepareData. A later cell redefines
# prepareData with a (pu, do, trip_distance) signature, so running this cell
# after that one fails.
# Results: X_test is the xgb.DMatrix of encoded features, y_test the trip
# durations in minutes, test_dicts the raw feature records.
X_test, y_test, test_dicts = prepareData(df)

In [None]:
# Candidate runs: the MLflow runs that getRuns() keeps (those with a logged model).
runs = getRuns()

# Model chosen by loadModel from those runs; used for scoring in the next cell.
model = loadModel(runs)

In [None]:
# Score the loaded model on the prepared evaluation set.
y_pred = model.predict(X_test)

# RMSE as the square root of the MSE. The `squared=False` flag of
# mean_squared_error is deprecated since scikit-learn 1.4 and slated for
# removal, so taking the root explicitly works on old and new versions alike.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
rmse

In [None]:
def prepareData(pu, do, trip_distance):
    """Encode a single trip as an ``xgb.DMatrix``.

    NOTE: this shares its name with the DataFrame-based ``prepareData(df_test)``
    defined in an earlier cell; once this cell has run, the earlier function is
    shadowed.

    Args:
        pu: pickup location id.
        do: drop-off location id.
        trip_distance: trip distance, passed to the preprocessor unchanged.

    Returns:
        xgb.DMatrix built from the preprocessor's ``transform`` of the record
        ``{'PU_DO': '<pu>_<do>', 'trip_distance': trip_distance}``.
    """
    # NOTE: pickle.load can execute arbitrary code; the file must be trusted.
    with open('./models/preprocessor.pkl', 'rb') as preprocessor_file:
        vectorizer = pickle.load(preprocessor_file)

    trip_features = {
        'PU_DO': f'{pu}_{do}',
        'trip_distance': trip_distance,
    }
    return xgb.DMatrix(vectorizer.transform(trip_features))

In [None]:
# Load the pickled preprocessor into the notebook-level name `dv`, which the
# next cell uses to encode a sample trip.
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
with open('./models/preprocessor.pkl', 'rb') as f:
    dv = pickle.load(f)

In [None]:
# Encode one hand-written trip (zone 43 -> zone 238, distance 1.16) and wrap it
# in the matrix type the model consumes.
test_data = xgb.DMatrix(
    dv.transform({'PU_DO': '43_238', 'trip_distance': 1.16})
)

In [34]:
import pandas as pd

# Re-read the raw trips (this rebinds `df`, replacing any frame an earlier cell
# stored under that name) and read the zone lookup table.
df = pd.read_parquet("./data/green_tripdata_2022-02.parquet")
df_zone = pd.read_csv("./data/taxi_zone_lookup.csv")

In [41]:
def calculateTripDistance(df, pu, do):
    """Mean ``trip_distance`` over the trips from zone ``pu`` to zone ``do``.

    Args:
        df: DataFrame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.
        pu: pickup location id to match.
        do: drop-off location id to match.

    Returns:
        The mean distance of the matching rows; NaN when no row matches.
    """
    same_route = df['PULocationID'].eq(pu) & df['DOLocationID'].eq(do)
    return df.loc[same_route, 'trip_distance'].mean()

In [42]:
# Mean recorded distance for trips from zone 43 to zone 238. The trips frame
# must be passed explicitly: calculateTripDistance takes (df, pu, do).
calculateTripDistance(df, 43, 238)

1.2027319587628864

In [9]:
# Give every zone a "Borough_Zone" label and expose label and id under both
# pickup (PU) and drop-off (DO) column names, so the same lookup can be joined
# on either side of a trip.
df_zone['PULocationName'] = df_zone['DOLocationName'] = df_zone['Borough'] + '_' + df_zone['Zone']
df_zone['PULocationID'] = df_zone['DOLocationID'] = df_zone['LocationID']

# Two-column lookup tables: one keyed for pickups, one for drop-offs.
df_PU = df_zone.loc[:, ['PULocationName', 'PULocationID']]
df_DO = df_zone.loc[:, ['DOLocationName', 'DOLocationID']]

In [10]:
# Attach pickup and drop-off zone names to every trip.
# - Name columns left over from a previous run of this cell are dropped first,
#   so re-running does not produce PULocationName_x / _y duplicates.
# - validate='many_to_one' makes the merge fail loudly if a lookup table holds
#   the same location id more than once, instead of silently duplicating trips.
df = (
    df.drop(columns=['PULocationName', 'DOLocationName'], errors='ignore')
      .merge(df_PU, how='left', on=['PULocationID'], validate='many_to_one')
      .merge(df_DO, how='left', on=['DOLocationID'], validate='many_to_one')
)

In [12]:
# Export id -> name lookup tables for the zones that occur in the trips.
# `df` has one row per trip, so the pairs are de-duplicated first; otherwise
# each file would repeat the same zone once for every trip that used it.
df_PU = df[['PULocationID', 'PULocationName']].drop_duplicates()
df_DO = df[['DOLocationID', 'DOLocationName']].drop_duplicates()

df_PU.to_csv("./data/PU.csv", index=False)
df_DO.to_csv("./data/DO.csv", index=False)

In [43]:
# Read back the pickup / drop-off lookup files written by the export cell,
# rebinding df_PU and df_DO to the CSV contents.
df_PU = pd.read_csv("./data/PU.csv")
df_DO = pd.read_csv("./data/DO.csv")

In [None]:
## TODO: Match the PULocationID and DOLocationID values with their zone names.
## TODO: Build a user interface with Streamlit.
## TODO: Write the results to PostgreSQL.
## TODO: Possibly visualise these results in a chart.