In [1]:
!python -V

Python 3.9.19


In [2]:
!pipenv --venv

/home/ubuntu/.local/share/virtualenvs/mlops_temperature_prediction-M3ZLPW1f
[0m

In [3]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import mlflow
import numpy as np
import lightgbm as lgb

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

In [5]:
input_file = "../data/london_weather.csv"
df = pd.read_csv(input_file, parse_dates=True)

In [6]:
df['date'] = pd.to_datetime(df["date"],format='%Y%m%d')
df['year'] = df['date'].dt.year
df['month'] = df["date"].dt.month.map("{:02}".format)

In [7]:
df.head()

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth,year,month
0,1979-01-01,2.0,7.0,52.0,2.3,-4.1,-7.5,0.4,101900.0,9.0,1979,1
1,1979-01-02,6.0,1.7,27.0,1.6,-2.6,-7.5,0.0,102530.0,8.0,1979,1
2,1979-01-03,5.0,0.0,13.0,1.3,-2.8,-7.2,0.0,102050.0,4.0,1979,1
3,1979-01-04,8.0,0.0,13.0,-0.3,-2.6,-6.5,0.0,100840.0,2.0,1979,1
4,1979-01-05,6.0,2.0,29.0,5.6,-0.8,-1.4,0.0,102250.0,1.0,1979,1


In [8]:
weather_all_features = df.drop(["mean_temp", "date"], axis=1).columns.to_list()
numeric_features =  ['max_temp', 'min_temp', 'mean_temp', 'global_radiation', 'sunshine', 'cloud_cover', 'snow_depth', 'precipitation', 'pressure']
weather_features = ['month', 'max_temp', 'min_temp', 'global_radiation', 'sunshine', 'cloud_cover', 'snow_depth']
# weather_features = ['month', 'cloud_cover', 'sunshine', 'precipitation', 'pressure', 'global_radiation']
weather_target = "mean_temp"
df_weather = df.copy()
# Fill the records with "null" mean_temp with the mean value
# df_weather['mean_temp'] = df_weather.fillna(df_weather['mean_temp'].mean())['mean_temp']
# df_weather[weather_features].fillna(df_weather[weather_features].mean(), inplace=True)
[df_weather[col].fillna(df_weather[col].mean(), inplace=True) for col in numeric_features]

[None, None, None, None, None, None, None, None, None]

In [9]:
print(df_weather.isnull().sum())

date                0
cloud_cover         0
sunshine            0
global_radiation    0
max_temp            0
mean_temp           0
min_temp            0
precipitation       0
pressure            0
snow_depth          0
year                0
month               0
dtype: int64


In [10]:
# Subset feature and target sets
# X = df_weather[weather_all_features]    
X = df_weather[weather_features]  
y = df_weather[weather_target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X[0:2])
print(type(X))
print("The selected features are : ", X_train.columns)
feature_index = X_train.columns
print("X_train : ", X_train[0:2])
print("type(X_train) : ", type(X_train))
print("Original train DF : ", X_train[0:2].to_dict(orient='records'))
    
dv = DictVectorizer()
X_train = dv.fit_transform(X_train.to_dict(orient='records'))
# Transform on the test data
X_test  = dv.transform(X_test.to_dict(orient='records'))
    
# Scale the data
scaler = StandardScaler(with_mean=False)
# Fit on the training data
X_train = scaler.fit_transform(X_train)
# Transform on the test data
X_test = scaler.transform(X_test)  

  month  max_temp  min_temp  global_radiation  sunshine  cloud_cover  \
0    01       2.3      -7.5              52.0       7.0          2.0   
1    01       1.6      -7.5              27.0       1.7          6.0   

   snow_depth  
0         9.0  
1         8.0  
<class 'pandas.core.frame.DataFrame'>
The selected features are :  Index(['month', 'max_temp', 'min_temp', 'global_radiation', 'sunshine',
       'cloud_cover', 'snow_depth'],
      dtype='object')
X_train :        month  max_temp  min_temp  global_radiation  sunshine  cloud_cover  \
256      09      17.4       4.7             157.0       7.1          4.0   
12926    05      16.2      11.2             177.0       3.9          6.0   

       snow_depth  
256           0.0  
12926         0.0  
type(X_train) :  <class 'pandas.core.frame.DataFrame'>
Original train DF :  [{'month': '09', 'max_temp': 17.4, 'min_temp': 4.7, 'global_radiation': 157.0, 'sunshine': 7.1, 'cloud_cover': 4.0, 'snow_depth': 0.0}, {'month': '05', 'max_temp

In [11]:
y_test[0:2]

9261    16.0
5376    14.6
Name: mean_temp, dtype: float64

In [12]:
Path("models").mkdir(parents=True, exist_ok=True)

In [13]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from sklearn.model_selection import cross_val_score

In [14]:
# lgb_reg_params = {
#     'learning_rate':    hp.uniform('learning_rate',0.1,1),
#     'max_depth':        hp.choice('max_depth',        np.arange(2, 100, 1, dtype=int)),
#     'min_child_weight': hp.choice('min_child_weight', np.arange(1, 50, 1, dtype=int)),
#     'colsample_bytree': hp.uniform('colsample_bytree',0.4,1),
#     'subsample':        hp.uniform('subsample', 0.6, 1),
#     'num_leaves':       hp.choice('num_leaves',       np.arange(1, 200, 1, dtype=int)),
#     'min_split_gain':   hp.uniform('min_split_gain', 0, 1),
#     'reg_alpha':        hp.uniform('reg_alpha',0,1),
#     'reg_lambda':       hp.uniform('reg_lambda',0,1),
#     'n_estimators':     5
# }

#     # params = {
#     #     "objective": "regression",
#     #     "metric": "rmse",
#     #     "n_estimators": 1000,
#     #     "verbosity": -1,
#     #     "bagging_freq": 1,
#     #     "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
#     #     "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
#     #     "subsample": trial.suggest_float("subsample", 0.05, 1.0),
#     #     "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
#     #     "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
#     # }


# def objective(params):
#   lgbm = lgb.LGBMRegressor(n_jobs=-1,early_stopping_rounds=None,**params)
#   score = cross_val_score(lgbm, X_train, y_train, cv=2,scoring='neg_mean_squared_error',n_jobs=-1).mean()
#   return score

# trials = Trials()
# result = fmin(
#     fn=objective,           # objective function
#     space=lgb_reg_params,   # parameter space
#     algo=tpe.suggest,       # surrogate algorithm
#     max_evals=50,           # no. of evaluations
#     trials=trials           # trials object that keeps track of the sample results (optional)
# )
# print(result)

In [15]:
lgbm_train = lgb.Dataset(X_train, label=y_train)
lgbm_valid = lgb.Dataset(X_test, label=y_test, reference=lgbm_train)

In [16]:
lgbm_train

<lightgbm.basic.Dataset at 0x7fc3fd506160>

In [17]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("compare-models")

2024/08/09 00:38:12 INFO mlflow.tracking.fluent: Experiment with name 'compare-models' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/ubuntu/mlops_temperature_prediction/experiment-tracking/mlruns/1', creation_time=1723144092599, experiment_id='1', last_update_time=1723144092599, lifecycle_stage='active', name='compare-models', tags={}>

In [18]:
with mlflow.start_run():

    mlflow.set_tag("developer", "hema")

    mlflow.log_param("train-data-path", input_file)
    mlflow.log_param("test-data-path",input_file)


    # Liner Regression
    lin_reg = LinearRegression().fit(X_train, y_train)
    y_pred_lin_reg = lin_reg.predict(X_test)
    lin_reg_rmse = mean_squared_error(y_test, y_pred_lin_reg, squared=False)
    mlflow.log_metric("rmse_lr", lin_reg_rmse)
    # mlflow.sklearn.log_model(lin_reg, "lin_reg")
    # with open('models/lin_reg.bin', 'wb') as f_out:
    #     pickle.dump((dv, lin_reg), f_out)
    # mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_mlflow")

    # XG Boost Regressor
    xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', random_state=42)
    xgb_model.fit(X_train, y_train)
    pred_labels_xgb = xgb_model.predict(X_test)
    xgb_rmse = mean_squared_error(y_test, pred_labels_xgb, squared=False)
    mlflow.log_metric("rmse_xgb", xgb_rmse)
    # mlflow.xgboost.log_model(xgb_model, "xgb_reg")
    # with open('models/xgb_reg.bin', 'wb') as f_out:
    #     pickle.dump((dv, xgb_model), f_out)
    # mlflow.log_artifact(local_path="models/xgb_reg.bin", artifact_path="models_mlflow")

    # Gradient Boost Regressor
    gbm_reg = GradientBoostingRegressor(random_state=42)
    gbm_reg.fit(X_train, y_train)
    y_pred_gb_reg = gbm_reg.predict(X_test)
    gbm_reg_rmse = mean_squared_error(y_test, y_pred_gb_reg, squared=False)
    mlflow.log_metric("rmse_gbm", gbm_reg_rmse)
    # mlflow.sklearn.log_model(gbm_reg, "gbm_reg")
    # with open('models/gbm_reg.bin', 'wb') as f_out:
    #     pickle.dump((dv, gbm_reg), f_out)
    # mlflow.log_artifact(local_path="models/gbm_reg.bin", artifact_path="models_mlflow")

    # Light Gradient Boost Regressor
    lgbm_reg =  lgb.LGBMRegressor(random_state=42)
    lgbm_reg.fit(X_train, y_train)
    y_pred_lgb_reg = lgbm_reg.predict(X_test)
    lgbm_reg_rmse = mean_squared_error(y_test, y_pred_lgb_reg, squared=False)
    mlflow.log_metric("rmse_lgbm", lgbm_reg_rmse)
    # mlflow.sklearn.log_model(lgbm_reg, "lgbm_reg")
    # with open('models/lgbm_reg.bin', 'wb') as f_out:
    #     pickle.dump((dv, lgbm_reg), f_out)
    # mlflow.log_artifact(local_path="models/lgbm_reg.bin", artifact_path="models_mlflow")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 933
[LightGBM] [Info] Number of data points in the train set: 10278, number of used features: 18
[LightGBM] [Info] Start training from score 11.487968


In [63]:
def objective(params):

    mlflow.set_experiment("lgbm-best-model")
    with mlflow.start_run():
 
        mlflow.set_tag("developer", "hema")

        mlflow.log_param("train-data-path", input_file)
        mlflow.log_param("test-data-path",input_file)
        mlflow.log_param("model", "lgbm")
    
        lgbm = lgb.LGBMRegressor(n_jobs=-1,**params, metric='rmse')

        lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], 
                 callbacks=[
        lgb.early_stopping(stopping_rounds=50)]
                )
        y_pred_lgbm_reg = lgbm.predict(X_test)
        lgbm_rmse = mean_squared_error(y_test, y_pred_lgbm_reg, squared=False)
        mlflow.log_metric("rmse_lgbm", lgbm_rmse)
        # mlflow.sklearn.log_model(lgbm, "lgbm_reg")
        # with open('models/preprocessor.b', 'wb') as f_out:
        #     pickle.dump((dv, lgbm), f_out)
        # mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="models_mlflow")


        return {'loss': lgbm_rmse, 'status': STATUS_OK}

In [64]:

search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 2000, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'subsample':        hp.uniform('subsample', 0.6, 1),
    'objective': 'regression',
    # "num_leaves": scope.int(hp.quniform("num_leaves", 20, 40, 2)),
    "first_metric_only": True,
    "seed": 42
}

# search_space = {
#         "n_estimators": 100,
#         "learning_rate": hp.loguniform("learning_rate", 0.01, 0.3),
#         "num_leaves": scope.int(hp.quniform("num_leaves", 20, 3000, 20)),
#         "max_depth": scope.int(hp.quniform("max_depth", 3, 12, 1)),
#         "min_data_in_leaf": scope.int(hp.quniform("min_data_in_leaf", 200, 10000, 100)),
#         "max_bin": scope.int(hp.quniform("max_bin", 200, 300, 1)),
#         "lambda_l1": scope.int(hp.quniform("lambda_l1", 0, 100, 5)),
#         "lambda_l2": scope.int(hp.quniform("lambda_l2", 0, 100, 5)),
#         "min_gain_to_split": hp.loguniform("min_gain_to_split", 0, 15),
#         # "bagging_fraction": hp.loguniform(
#         #     "bagging_fraction", 0.2, 0.95, 0.1
#         # ),
#         # # "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
#         # "feature_fraction": hp.loguniform(
#         #     "feature_fraction", 0.2, 0.95, 0.1
#         # ),
#         'seed': 42
#         }

best_result = fmin(
    fn=objective,
    space=search_space,
    # space=lgb_reg_params,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

best_result


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000595 seconds.                 
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 933                                                                                        
[LightGBM] [Info] Number of data points in the train set: 10278, number of used features: 18                            
[LightGBM] [Info] Start training from score 11.487968                                                                   
Training until validation scores don't improve for 50 rounds                                                            
Early stopping, best iteration is:                                                                                      
[32]	valid_0's rmse: 0.911656
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000535 seconds.                 
You can set `force_col_wise=true` to remove the overhead.
[LightG

{'learning_rate': 0.04994117707179691,
 'max_depth': 44.0,
 'min_child_weight': 3.3282371990782518,
 'n_estimators': 1481.0,
 'reg_alpha': 0.0156202048506844,
 'reg_lambda': 0.0850555088621661,
 'subsample': 0.7520999514353608}

In [84]:
print("The hyperparameters with the best score is ", best_result)

The hyperparameters with the best score is  {}


In [91]:

def objective(params):

    mlflow.set_experiment("log-best-model")
    mlflow.set_tracking_uri("sqlite:///mlflow.db")
    
    with mlflow.start_run():
         
        mlflow.set_tag("developer", "hema")

        mlflow.log_param("train-data-path", input_file)
        mlflow.log_param("test-data-path",input_file)
        mlflow.log_param("model", "lgbm")
    
        lgbm = lgb.LGBMRegressor(n_jobs=-1,**params, metric='rmse')

        lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], 
                 callbacks=[
        lgb.early_stopping(stopping_rounds=50)]
                )
        y_pred_lgbm_reg = lgbm.predict(X_test)
        lgbm_rmse = mean_squared_error(y_test, y_pred_lgbm_reg, squared=False)
        mlflow.log_metric("rmse_lgbm", lgbm_rmse)
        mlflow.sklearn.log_model(lgbm, "lgbm_reg")
        with open('models/preprocessor.b', 'wb') as f_out:
            pickle.dump((dv, lgbm), f_out)
        mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="models_mlflow")
    
        return {'loss': lgbm_rmse, 'status': STATUS_OK}

In [92]:
search_space = {
     'learning_rate': 0.04994117707179691,
     'max_depth': 44,
     'min_child_weight': 3.3282371990782518,
     'n_estimators': 1481,
     'reg_alpha': 0.0156202048506844,
     'reg_lambda': 0.0850555088621661,
     'subsample': 0.7520999514353608,
     'objective': 'regression',
      "seed": 42
}


best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=1,
    trials=Trials()
)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000531 seconds.                 
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 933                                                                                        
[LightGBM] [Info] Number of data points in the train set: 10278, number of used features: 18                            
[LightGBM] [Info] Start training from score 11.487968                                                                   
Training until validation scores don't improve for 50 rounds                                                            
Early stopping, best iteration is:                                                                                      
[161]	valid_0's rmse: 0.880649
  0%|                                                                             | 0/1 [00:01<?, ?trial/s, best loss=?]




100%|███████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.53s/trial, best loss: 0.8806487710167833]
