In [3]:
import pandas as pd
import numpy as np
import requests
import os

import mlflow

from datetime import datetime, time

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, RegressionPreset, DataQualityPreset
from sklearn.model_selection import train_test_split

from dotenv import load_dotenv

import datetime
import time
import random

```RegressionPreset```: **Prediction** and **target** are required. **Input features** are optional  
```DataDriftPreset```: **Input features** are required + **refernce dataset** is needed  
```DataQualityPreset```: **Input features** are required. **Prediction** and **target** are optional

In [4]:
load_dotenv()

True

In [3]:
df = pd.read_csv('s3://mlops-personal-project/Traning-Data/202301-capitalbikeshare-tripdata.csv')

In [2]:
df.head()

NameError: name 'df' is not defined

# Preprocessing

In [5]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    
    All args must be of equal length.    
    
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6378.137 * c
    return km

def preprocessing(df):
    df['started_at'] = pd.to_datetime(df['started_at'])
    df['ended_at'] = pd.to_datetime(df['ended_at'])
    
    df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60
    df = df[df['duration']>0]
        

    df['started_day'] = df['started_at'].dt.day
    df['started_hour'] = df['started_at'].dt.hour
    df['ended_day'] = df['ended_at'].dt.day
    df['ended_hour'] = df['ended_at'].dt.hour

    
    Q1 = df['duration'].quantile(0.25)
    Q3 = df['duration'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Remove outliers
    df = df[(df['duration'] >= lower_bound) & (df['duration'] <= upper_bound)]

    df['distance'] = haversine_np(df['start_lng'],df['start_lat'], df['end_lng'],df['end_lat'])
    
    categ_clumns = ['rideable_type', 'member_casual']
    df = pd.get_dummies(df, columns=categ_clumns, drop_first=True)

    dummy_columns = ['rideable_type_docked_bike', 'rideable_type_electric_bike','member_casual_member']
    features = ['duration', 'started_day', 'started_hour', 'ended_day', 'ended_hour','distance', 'rideable_type_docked_bike', 'rideable_type_electric_bike','member_casual_member']

    # Add missing dummy columns with 0s
    for col in dummy_columns:
        if col not in df.columns:
            df[col] = False

    # Remove extra columns
    extra_columns = [col for col in df.columns if col not in features]
    df.drop(columns=extra_columns, inplace=True)
    
    df.dropna(inplace=True)

    dummy_columns = df.columns
    # print(dummy_columns)

    return df

# Preparing Reference Dataset

In [6]:
reference_df = pd.read_csv('s3://mlops-personal-project/Traning-Data/202401-capitalbikeshare-tripdata.csv')
reference_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,748A93D7DE8A41CD,classic_bike,2024-01-25 15:49:59,2024-01-25 15:52:35,1st & O St NW,31519.0,1st & L St NW,31677.0,38.908643,-77.012365,38.903819,-77.011987,member
1,75CBFD136F06305B,classic_bike,2024-01-02 16:44:58,2024-01-02 16:53:25,1st & O St NW,31519.0,4th & College St NW,31138.0,38.908643,-77.012365,38.921233,-77.018135,member
2,0536C9720F87E04C,classic_bike,2024-01-24 15:40:15,2024-01-24 15:43:55,1st & O St NW,31519.0,1st & L St NW,31677.0,38.908643,-77.012365,38.903819,-77.011987,member
3,9E17390C218783B5,classic_bike,2024-01-04 15:35:00,2024-01-04 15:37:35,1st & O St NW,31519.0,1st & L St NW,31677.0,38.908643,-77.012365,38.903819,-77.011987,member
4,00727D0E773CDFF7,electric_bike,2024-01-05 12:27:58,2024-01-05 12:35:40,1st & O St NW,31519.0,10th & G St NW,31274.0,38.90869,-77.012317,38.898243,-77.026235,casual


In [7]:
processed_reference_df = reference_df.copy()
processed_reference_df = preprocessing(processed_reference_df)
processed_reference_df.head()

Unnamed: 0,duration,started_day,started_hour,ended_day,ended_hour,distance,rideable_type_electric_bike,member_casual_member,rideable_type_docked_bike
0,2.6,25,15,25,15,0.538003,False,True,False
1,8.45,2,16,2,16,1.487956,False,True,False
2,3.666667,24,15,24,15,0.538003,False,True,False
3,2.583333,4,15,4,15,0.538003,False,True,False
4,7.7,5,12,5,12,1.675201,True,False,False


In [8]:
processed_reference_df.to_parquet('./processed_reference_df.parquet')

In [None]:
processed_reference_df = pd.read_parquet('s3://mlops-personal-project/reference_dataset/processed_reference_df.parquet')
processed_reference_df.head()

# Train Dataset

In [6]:
processed_df = df.copy()
processed_df = preprocessing(processed_df)
processed_df.head()

Unnamed: 0,duration,started_day,started_hour,ended_day,ended_hour,distance,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_member
0,5.366667,4,19,4,19,0.909569,False,False,True
2,6.666667,5,20,5,20,0.326598,False,False,True
3,12.15,3,17,3,17,1.132374,False,False,True
4,7.066667,3,5,3,5,1.132374,False,False,True
5,26.633333,11,16,11,17,2.008963,False,False,False


In [7]:
X = processed_df.drop('duration', axis=1)
y = processed_df['duration']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Preparing the data for Evidently

In [8]:
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

TEST_RUN_ID = os.getenv("TEST_RUN_ID")
model = mlflow.pyfunc.load_model(f"runs:/{TEST_RUN_ID}/model")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

 - mlflow (current: 2.11.2, required: mlflow==2.11.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [21]:
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(numerical_features)
# Identifying categorical features
categorical_features = X_train.select_dtypes(include=['object','bool','category']).columns.tolist()
print(categorical_features)

['distance']
['rideable_type_docked_bike', 'rideable_type_electric_bike', 'member_casual_member']


In [22]:
predictions = model.predict(X_test)

# Preparing the data for Evidently Profile
df_test = X_test.copy()
df_test['target'] = y_test
df_test['prediction'] = predictions

In [23]:
column_mapping = ColumnMapping()

column_mapping.target = 'target'
column_mapping.prediction = 'prediction'
column_mapping.numerical_features = numerical_features
column_mapping.categorical_features = categorical_features

In [24]:
regression_performance = Report(metrics=[RegressionPreset(), DataQualityPreset(), DataDriftPreset()])

In [25]:
regression_performance.run(current_data=df_test, reference_data=None, column_mapping=column_mapping)



In [26]:
regression_performance.show()

ValueError: Reference dataset should be present

In [28]:
regression_performance.as_dict()

{'metrics': [{'metric': 'RegressionQualityMetric',
   'result': {'columns': {'utility_columns': {'date': None,
      'id': None,
      'target': 'target',
      'prediction': 'prediction'},
     'num_feature_names': ['distance'],
     'cat_feature_names': ['rideable_type_docked_bike',
      'rideable_type_electric_bike',
      'member_casual_member'],
     'text_feature_names': [],
     'datetime_feature_names': [],
     'target_names': None},
    'current': {'r2_score': -0.09732581250033445,
     'rmse': 7.347858959642842,
     'mean_error': 2.9117215218095787,
     'mean_abs_error': 4.68002821819212,
     'mean_abs_perc_error': 208.92889175674694,
     'abs_error_max': 194.71315381871062,
     'underperformance': {'majority': {'mean_error': 2.59311952318593,
       'std_error': 3.513319333986758},
      'underestimation': {'mean_error': -10.14547451783948,
       'std_error': 4.646825175755383},
      'overestimation': {'mean_error': 21.701887390792475,
       'std_error': 11.8741481

In [30]:
regression_performance.as_dict()['metrics'][0]['result']['current']

{'r2_score': -0.08126437674875464,
 'rmse': 7.289429932148033,
 'mean_error': 2.89001454488873,
 'mean_abs_error': 4.660604106728848,
 'mean_abs_perc_error': 208.37127353739427,
 'abs_error_max': 199.81315381871062,
 'underperformance': {'majority': {'mean_error': 2.5797475766335283,
   'std_error': 3.5092247041261335},
  'underestimation': {'mean_error': -10.110700230980159,
   'std_error': 4.511286788058289},
  'overestimation': {'mean_error': 21.473717424191058,
   'std_error': 11.723998078811915}},
 'error_std': 6.692144452178647,
 'abs_error_std': 5.604943777298241,
 'abs_perc_error_std': 24.940222603813606}

# Exploring

In [31]:
# data for only a specific day


## What the report would look like if i don't have the actual values:

In [14]:
column_mapping = ColumnMapping()

column_mapping.target = None
column_mapping.prediction = 'prediction'
column_mapping.numerical_features = numerical_features
column_mapping.categorical_features = categorical_features


In [None]:
regression_performance = Report(metrics=[RegressionPreset()])
regression_performance.run(current_data=df_test, reference_data=None, column_mapping=column_mapping)