In [18]:
import pandas as pd
import os
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import MinMaxScaler

import mlflow

In [2]:
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

In [3]:
df = pd.read_csv('s3://mlops-personal-project/Traning-Data/2023/202301-capitalbikeshare-tripdata.csv')
df.head()

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,65F0ACD101BF0D49,classic_bike,2023-01-04 19:34:07,2023-01-04 19:39:29,East Falls Church Metro / Sycamore St & 19th St N,31904.0,W Columbia St & N Washington St,32609.0,38.885321,-77.156427,38.885621,-77.166917,member
1,D75158CE73DC43F0,classic_bike,2023-01-27 15:26:38,2023-01-27 19:21:36,Carroll & Westmoreland Ave,32025.0,Fenton St & Ellsworth Dr,32036.0,38.975,-77.01121,38.997033,-77.025608,member
2,33E85889625FF7CA,classic_bike,2023-01-05 20:44:38,2023-01-05 20:51:18,15th & L St NW,31276.0,Thomas Circle,31241.0,38.903649,-77.034918,38.9059,-77.0325,member
3,E1F055A1651F47A1,classic_bike,2023-01-03 17:45:14,2023-01-03 17:57:23,Hartland Rd & Harte Pl,32255.0,Merrifield Cinema & Merrifield Town Center,32235.0,38.878601,-77.222808,38.870093,-77.22997,member
4,88CC90CEEC298BAF,classic_bike,2023-01-03 05:18:46,2023-01-03 05:25:50,Merrifield Cinema & Merrifield Town Center,32235.0,Hartland Rd & Harte Pl,32255.0,38.870093,-77.22997,38.878601,-77.222808,member


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204077 entries, 0 to 204076
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             204077 non-null  object 
 1   rideable_type       204077 non-null  object 
 2   started_at          204077 non-null  object 
 3   ended_at            204077 non-null  object 
 4   start_station_name  195428 non-null  object 
 5   start_station_id    195428 non-null  float64
 6   end_station_name    194680 non-null  object 
 7   end_station_id      194680 non-null  float64
 8   start_lat           204077 non-null  float64
 9   start_lng           204077 non-null  float64
 10  end_lat             203856 non-null  float64
 11  end_lng             203856 non-null  float64
 12  member_casual       204077 non-null  object 
dtypes: float64(6), object(7)
memory usage: 20.2+ MB


In [8]:
df.describe()

Unnamed: 0,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng
count,195428.0,194680.0,204077.0,204077.0,203856.0,203856.0
mean,31371.335464,31372.851808,38.903504,-77.030524,38.902397,-77.030256
std,252.082794,250.491407,0.024856,0.031781,0.024574,0.031553
min,31000.0,31000.0,38.782633,-77.38,38.78,-77.38
25%,31211.0,31214.0,38.89386,-77.0429,38.893028,-77.043074
50%,31278.0,31277.0,38.903165,-77.03,38.902674,-77.03
75%,31608.0,31609.0,38.913761,-77.01221,38.912644,-77.012108
max,32901.0,32901.0,39.13,-76.82,39.13,-76.82


In [17]:
print(df.value_counts(df['member_casual']),"\n")
print(df.value_counts(df['rideable_type']), '\n')

member_casual
member    141515
casual     62562
Name: count, dtype: int64 

rideable_type
classic_bike     171419
electric_bike     25814
docked_bike        6844
Name: count, dtype: int64 



In [27]:
categ_clumns = ['rideable_type', 'start_station_name', 'start_station_id', 'end_station_name','end_station_id', 'member_casual']

for i in categ_clumns:
    print(f"{i}: {df[i].nunique()}")

rideable_type: 3
start_station_name: 714
start_station_id: 712
end_station_name: 716
end_station_id: 714
member_casual: 2


# Preprocessing

In [6]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    
    All args must be of equal length.    
    
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6378.137 * c
    return km


In [20]:
def preprocessing(df):
    df['started_at'] = pd.to_datetime(df['started_at'])
    df['ended_at'] = pd.to_datetime(df['ended_at'])
    
    df['duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60
    df = df[df['duration']>0]
        

    df['started_day'] = df['started_at'].dt.day
    df['started_hour'] = df['started_at'].dt.hour
    df['ended_day'] = df['ended_at'].dt.day
    df['ended_hour'] = df['ended_at'].dt.hour

    
    Q1 = df['duration'].quantile(0.25)
    Q3 = df['duration'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Remove outliers
    df_filtered = df[(df['duration'] >= lower_bound) & (df['duration'] <= upper_bound)]

    df['distance'] = haversine_np(df['start_lng'],df['start_lat'], df['end_lng'],df['end_lat'])
    
    categ_clumns = ['rideable_type', 'member_casual']
    df = pd.get_dummies(df, columns=categ_clumns, drop_first=True)

    deleted_columns=['started_at','ended_at','ride_id', 'start_lng', 'start_lat', 'end_lng', 'end_lat', 'start_station_name', 'start_station_id', 'end_station_name','end_station_id']
    df.drop(columns=deleted_columns, inplace=True)
    df.dropna(inplace=True)

    return df

In [21]:
processed_df = df.copy()
processed_df = preprocessing(processed_df)
processed_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['started_day'] = df['started_at'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['started_hour'] = df['started_at'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ended_day'] = df['ended_at'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin

Unnamed: 0,duration,started_day,started_hour,ended_day,ended_hour,distance,rideable_type_docked_bike,rideable_type_electric_bike,member_casual_member
0,5.366667,4,19,4,19,0.909569,False,False,True
1,234.966667,27,15,27,19,2.750975,False,False,True
2,6.666667,5,20,5,20,0.326598,False,False,True
3,12.150000,3,17,3,17,1.132374,False,False,True
4,7.066667,3,5,3,5,1.132374,False,False,True
...,...,...,...,...,...,...,...,...,...
204072,0.450000,18,14,18,14,0.000000,False,False,False
204073,16.283333,18,18,18,19,2.627968,False,False,False
204074,2.416667,4,10,4,10,0.000000,False,False,False
204075,19.550000,11,17,11,17,3.083348,False,True,False


In [22]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 203840 entries, 0 to 204076
Data columns (total 9 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     203840 non-null  float64
 1   started_day                  203840 non-null  int32  
 2   started_hour                 203840 non-null  int32  
 3   ended_day                    203840 non-null  int32  
 4   ended_hour                   203840 non-null  int32  
 5   distance                     203840 non-null  float64
 6   rideable_type_docked_bike    203840 non-null  bool   
 7   rideable_type_electric_bike  203840 non-null  bool   
 8   member_casual_member         203840 non-null  bool   
dtypes: bool(3), float64(2), int32(4)
memory usage: 8.4 MB


In [23]:
processed_df.describe()

Unnamed: 0,duration,started_day,started_hour,ended_day,ended_hour,distance
count,203840.0,203840.0,203840.0,203840.0,203840.0,203840.0
mean,17.101671,15.610601,13.89245,15.612034,14.079636,1.793853
std,90.080579,9.059831,4.773341,9.060186,4.812777,1.375488
min,0.016667,1.0,0.0,1.0,0.0,0.0
25%,6.1,7.0,10.0,7.0,10.0,0.887611
50%,10.233333,16.0,15.0,16.0,15.0,1.486619
75%,17.05,24.0,17.0,24.0,18.0,2.363547
max,23935.983333,31.0,23.0,31.0,23.0,22.653637


In [24]:
X = processed_df.drop('duration', axis=1)
y = processed_df['duration']

# Setup MLflow

In [10]:
mlflow.set_tracking_uri(uri="http://ec2-16-171-38-100.eu-north-1.compute.amazonaws.com:5000")

In [11]:
mlflow.set_experiment(experiment_name="Bike-Rides")

<Experiment: artifact_location='mlflow-artifacts:/398162422483971143', creation_time=1709724198408, experiment_id='398162422483971143', last_update_time=1709724198408, lifecycle_stage='active', name='Bike-Rides', tags={}>

## Models experiment & Log

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [28]:
models = {
    "Gradient Boosting": GradientBoostingRegressor(),
}

In [29]:
for name, model in models.items():

    with mlflow.start_run(run_name='norm & remove outliers'):

        mlflow.set_tag("Model_name", name)

        model.fit(X_train, y_train)

        predictions = model.predict(X_test)

        mse = mean_squared_error(y_test, predictions)
        
        
        if hasattr(model, 'alpha'):  # For Ridge, Lasso, ElasticNet
            mlflow.log_param("alpha", model.alpha)
        if hasattr(model, 'n_estimators'):  # For RandomForest and GradientBoosting
            mlflow.log_param("n_estimators", model.n_estimators)
        if hasattr(model, 'C'):  # For SVR
            mlflow.log_param("C", model.C)
        
        # Log the model
        mlflow.sklearn.log_model(model, name)
        
        # Log metrics
        mlflow.log_metric("mse", mse)
        
        print(f"{name}: Model trained and logged with MSE: {mse}")




Gradient Boosting: Model trained and logged with MSE: 15429.83875258497




SVR: Model trained and logged with MSE: 1846.6443418120907
