In [None]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
!unzip bike+sharing+dataset.zip

--2024-09-30 08:59:15--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip’

bike+sharing+datase     [ <=>                ] 273.43K  --.-KB/s    in 0.1s    

2024-09-30 08:59:16 (2.03 MB/s) - ‘bike+sharing+dataset.zip’ saved [279992]

Archive:  bike+sharing+dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                


In [None]:
import pandas as pd

# Load the hourly records extracted from the archive.
df = pd.read_csv('hour.csv')

# Leave the preview as the last expression so the notebook renders it as a
# rich table instead of wrapped plain text.
df.head()

   instant      dteday  season  yr  mnth  hr  holiday  weekday  workingday  \
0        1  2011-01-01       1   0     1   0        0        6           0   
1        2  2011-01-01       1   0     1   1        0        6           0   
2        3  2011-01-01       1   0     1   2        0        6           0   
3        4  2011-01-01       1   0     1   3        0        6           0   
4        5  2011-01-01       1   0     1   4        0        6           0   

   weathersit  temp   atemp   hum  windspeed  casual  registered  cnt  
0           1  0.24  0.2879  0.81        0.0       3          13   16  
1           1  0.22  0.2727  0.80        0.0       8          32   40  
2           1  0.22  0.2727  0.80        0.0       5          27   32  
3           1  0.24  0.2879  0.75        0.0       3          10   13  
4           1  0.24  0.2879  0.75        0.0       0           1    1  


In [None]:
# Flag each record as 'day' (hours 6 through 18 inclusive) or 'night'.
# The explicit int cast keeps this cell re-runnable after 'hr' has become a category.
is_daytime = df['hr'].astype(int).between(6, 18)
df['day_night'] = is_daytime.map({True: 'day', False: 'night'})

# Remove columns that are not used as features:
#   - 'instant' is a running record id (see the preview above),
#   - 'casual' + 'registered' add up to the target 'cnt' in the previewed rows,
#   - 'dteday' is dropped as well, so it is not parsed to datetime first.
# errors='ignore' lets the cell be executed again without a KeyError.
df = df.drop(columns=['instant', 'casual', 'registered', 'dteday'], errors='ignore')

# Store the integer-coded calendar/weather columns as categoricals.
category_columns = ['season', 'holiday', 'weekday', 'weathersit',
                    'workingday', 'mnth', 'yr', 'hr']
df = df.astype({column: 'category' for column in category_columns})

In [None]:
# Split the frame into the model inputs and the value to predict.
target_column = 'cnt'
y = df[target_column]             # target: the 'cnt' column
X = df.drop(target_column, axis=1)  # features: every remaining column

In [None]:
# Preview the first five rows of the feature frame and of the target series.
(X.head(n=5), y.head(n=5))

(  season yr mnth hr holiday weekday workingday weathersit  temp   atemp   hum  \
 0      1  0    1  0       0       6          0          1  0.24  0.2879  0.81   
 1      1  0    1  1       0       6          0          1  0.22  0.2727  0.80   
 2      1  0    1  2       0       6          0          1  0.22  0.2727  0.80   
 3      1  0    1  3       0       6          0          1  0.24  0.2879  0.75   
 4      1  0    1  4       0       6          0          1  0.24  0.2879  0.75   
 
    windspeed day_night  
 0        0.0     night  
 1        0.0     night  
 2        0.0     night  
 3        0.0     night  
 4        0.0     night  ,
 0    16
 1    40
 2    32
 3    13
 4     1
 Name: cnt, dtype: int64)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

In [None]:
# Continuous columns that get imputed and rescaled.
numerical_features = ['temp', 'hum', 'windspeed']

# Step 1 fills missing values with the column mean; step 2 min-max scales each column.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# NOTE(review): the pipeline is fit on all rows of X, i.e. before any
# train/test split, so the scaling statistics also see the future test rows.
scaled_values = numerical_pipeline.fit_transform(X[numerical_features])
X[numerical_features] = scaled_values

In [None]:
# Categorical columns that get imputed and one-hot encoded.
categorical_features = ['season', 'weathersit', 'day_night']
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # fill gaps with the most frequent level
    ('onehot', OneHotEncoder(sparse_output=False, drop='first')),  # dense output, first level dropped
])

# Fit the pipeline and encode the categorical columns (returns a plain array).
encoded_values = categorical_pipeline.fit_transform(X[categorical_features])

# Wrap the array in a DataFrame with the generated dummy-column names.
# index=X.index keeps the rows aligned with X: pd.concat(axis=1) joins on the
# index, so a default RangeIndex would misalign rows whenever X is not 0..n-1.
X_encoded = pd.DataFrame(
    encoded_values,
    columns=categorical_pipeline.named_steps['onehot'].get_feature_names_out(categorical_features),
    index=X.index,
)

# Replace the raw categorical columns with their encoded counterparts.
# NOTE: X is overwritten here, so this cell cannot be re-run without first
# re-creating X from df.
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random forest with 100 trees and a fixed seed, trained on the training split.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Score the fitted random forest on the held-out rows.
y_pred = model.predict(X_test)

# Mean squared error and coefficient of determination on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Random Forest :- Mean Squared Error: {mse}')
print(f'Random Forest :- R-squared: {r2}')

Random Forest :- Mean Squared Error: 1808.4074990292243
Random Forest :- R-squared: 0.9428901308176855


In [None]:
# ML pipeline: preprocessing + Random Forest combined into one estimator.
#
# A Pipeline feeds the full output of each step into the next one, so listing
# numerical_pipeline and categorical_pipeline as consecutive steps would run
# the mean-imputer/scaler on every column and then one-hot encode the result.
# A ColumnTransformer applies each sub-pipeline only to its own column group.
from sklearn import set_config
from sklearn.compose import ColumnTransformer

rf_preprocessor = ColumnTransformer(
    transformers=[
        ('num_preprocess', numerical_pipeline, numerical_features),
        ('cat_preprocess', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',  # keep all other columns, as the manual preprocessing cells do
)

final_pipeline = Pipeline([
    ('preprocess', rf_preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# NOTE(review): this pipeline expects the un-encoded feature frame (it selects
# the raw categorical columns by name); by this point X has already been
# encoded in place, so fit it on features rebuilt from df instead.
set_config(display='diagram')  # render estimators as an HTML diagram
final_pipeline

In [None]:
# Linear regression baseline (same preprocessed features and split as the Random Forest)

In [None]:
# Inputs for the linear-regression experiment.
# Independent copies are taken so that later in-place changes to X / y cannot
# silently alter the data used by this model (plain assignment only aliases).
X1 = X.copy()
y1 = y.copy()

# Preview the copies themselves rather than a mix of the original and the copy.
X1.head(), y1.head()


(  yr mnth hr holiday weekday workingday      temp   atemp   hum  windspeed  \
 0  0    1  0       0       6          0  0.224490  0.2879  0.81        0.0   
 1  0    1  1       0       6          0  0.204082  0.2727  0.80        0.0   
 2  0    1  2       0       6          0  0.204082  0.2727  0.80        0.0   
 3  0    1  3       0       6          0  0.224490  0.2879  0.75        0.0   
 4  0    1  4       0       6          0  0.224490  0.2879  0.75        0.0   
 
    season_2  season_3  season_4  weathersit_2  weathersit_3  weathersit_4  \
 0       0.0       0.0       0.0           0.0           0.0           0.0   
 1       0.0       0.0       0.0           0.0           0.0           0.0   
 2       0.0       0.0       0.0           0.0           0.0           0.0   
 3       0.0       0.0       0.0           0.0           0.0           0.0   
 4       0.0       0.0       0.0           0.0           0.0           0.0   
 
    day_night_night  
 0              1.0  
 1        

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the linear-regression inputs, using the same seed as the
# Random Forest split.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least-squares model with default settings, trained on the training split.
model1 = LinearRegression()
model1.fit(X=X1_train, y=y1_train)

In [None]:
# Score the fitted linear model on the held-out rows.
y1_pred = model1.predict(X1_test)

# Mean squared error and coefficient of determination on the test split.
mse1 = mean_squared_error(y_true=y1_test, y_pred=y1_pred)
r21 = r2_score(y_true=y1_test, y_pred=y1_pred)

print(f'Linear  regression :- Mean Squared Error: {mse1}')
print(f'Linear regression :- R-squared: {r21}')

Linear  regression :- Mean Squared Error: 14896.15062084329
Linear regression :- R-squared: 0.5295765950245783


In [None]:
# ML pipeline: preprocessing + Linear Regression combined into one estimator.
#
# A Pipeline feeds the full output of each step into the next one, so listing
# numerical_pipeline and categorical_pipeline as consecutive steps would run
# the mean-imputer/scaler on every column and then one-hot encode the result.
# A ColumnTransformer applies each sub-pipeline only to its own column group.
from sklearn import set_config
from sklearn.compose import ColumnTransformer

lr_preprocessor = ColumnTransformer(
    transformers=[
        ('num_preprocess', numerical_pipeline, numerical_features),
        ('cat_preprocess', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',  # keep all other columns, as the manual preprocessing cells do
)

# NOTE: this rebinds final_pipeline, replacing any pipeline stored under that name earlier.
final_pipeline = Pipeline([
    ('preprocess', lr_preprocessor),
    ('model', LinearRegression()),
])

# NOTE(review): this pipeline expects the un-encoded feature frame (it selects
# the raw categorical columns by name); by this point X has already been
# encoded in place, so fit it on features rebuilt from df instead.
set_config(display='diagram')  # render estimators as an HTML diagram
final_pipeline

In [None]:
# MLflow Setup and Experimentation
!pip install mlflow

# Import MLflow and log the experiments
import mlflow
import mlflow.sklearn

Collecting mlflow
  Downloading mlflow-2.16.2-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.16.2 (from mlflow)
  Downloading mlflow_skinny-2.16.2-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.16.2->mlflow)
  Downloading databricks_sdk-0.33.0-py3-none-any.whl.metadata (37 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow-skinny==2.16.2->mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==2.16.2->mlflow)
  Downloading opentelemetry_api-1.2

In [None]:
# Record the Random Forest experiment as one MLflow run: the model family as
# a parameter, the test-set scores computed earlier as metrics, and the
# fitted estimator as a model artifact.
with mlflow.start_run(run_name="RandomForest_Model"):
    mlflow.log_param(key="model_type", value="RandomForest")
    mlflow.log_metric(key="mse", value=mse)
    mlflow.log_metric(key="r2", value=r2)
    mlflow.sklearn.log_model(model, "random_forest_model")



In [None]:
# Record the Linear Regression experiment as one MLflow run: the model family
# as a parameter, the test-set scores computed earlier as metrics, and the
# fitted estimator as a model artifact.
with mlflow.start_run(run_name="LinearRegression_Model"):
    mlflow.log_param(key="model_type", value="LinearRegression")
    mlflow.log_metric(key="mse", value=mse1)
    mlflow.log_metric(key="r2", value=r21)
    mlflow.sklearn.log_model(model1, "linear_regression_model")

