In [1]:
import pandas as pd
import numpy as np
import warnings
import pickle
import mlflow
import mlflow.sklearn
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")


In [2]:
import mlflow
import mlflow.sklearn

# Force single project-level tracking
mlflow.set_tracking_uri("file:../mlruns")

# ALWAYS recreate experiment cleanly
mlflow.set_experiment("weather_prediction_pipeline")


2026/01/13 02:09:44 INFO mlflow.tracking.fluent: Experiment with name 'weather_prediction_pipeline' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/megharathi/Desktop/MLOpsPortCast/notebooks/../mlruns/997609320504467582', creation_time=1768250384347, experiment_id='997609320504467582', last_update_time=1768250384347, lifecycle_stage='active', name='weather_prediction_pipeline', tags={}>

In [3]:
#Load the processed dataset (v1)

DATASET_NAME = "processed_weather_data_portofTurku"
DATASET_VERSION = "v1"

DATASET_PATH = "../Dataset/processed/v1/weather_dataset_processed.csv"

df = pd.read_csv(DATASET_PATH)

print("Dataset:", DATASET_NAME)
print("Version:", DATASET_VERSION)
print("Shape:", df.shape)

df.head()


Dataset: processed_weather_data_portofTurku
Version: v1
Shape: (96449, 10)


Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_condition,Future_weather_condition
0,2006-04-01 04:00:00+02:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 05:00:00+02:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 06:00:00+02:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 07:00:00+02:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 08:00:00+02:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1


In [4]:
#Train–test split (80–20)
X = df.drop(columns=["Future_weather_condition"])
y = df["Future_weather_condition"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (77159, 9)
Test size: (19290, 9)


In [5]:
print(X_train.shape)
print(X_test.shape)
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))


(77159, 9)
(19290, 9)
Future_weather_condition
1    0.853005
0    0.146995
Name: proportion, dtype: float64
Future_weather_condition
1    0.852981
0    0.147019
Name: proportion, dtype: float64


In [6]:
import os

# Define split save path (aligned with your repo)
SPLIT_PATH = "../Dataset/processed/v1"
os.makedirs(SPLIT_PATH, exist_ok=True)

# Recreate full train and test DataFrames
train_df = X_train.copy()
train_df["Future_weather_condition"] = y_train

test_df = X_test.copy()
test_df["Future_weather_condition"] = y_test

# Save splits
train_df.to_csv(f"{SPLIT_PATH}/training_dataset.csv", index=False)
test_df.to_csv(f"{SPLIT_PATH}/testing_dataset.csv", index=False)

print("Saved train.csv and test.csv in Dataset/processed/v1")
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)


Saved train.csv and test.csv in Dataset/processed/v1
Train shape: (77159, 10)
Test shape: (19290, 10)


## Register datasets in MLflow

In [8]:
import mlflow

DATASET_NAME = "processed_weather_data_portofTurku"
DATASET_VERSION = "v1"
SPLIT_PATH = "../Dataset/processed/v1"

with mlflow.start_run(run_name="data_registration_v1"):

    mlflow.log_param("dataset_name", DATASET_NAME)
    mlflow.log_param("dataset_version", DATASET_VERSION)
    mlflow.log_param("train_rows", len(train_df))
    mlflow.log_param("test_rows", len(test_df))

    mlflow.log_artifact(f"{SPLIT_PATH}/training_dataset.csv", artifact_path="datasets")
    mlflow.log_artifact(f"{SPLIT_PATH}/testing_dataset.csv", artifact_path="datasets")

    print("✅ Dataset registered in MLflow")


✅ Dataset registered in MLflow
