# **Modelling**

This notebook loads the processed dataset files and trains several pedestrian-detection models, tracking every experiment with MLflow.

## Load the processed data

In [None]:
# Mount Google Drive: the dataset files read later in this notebook live there.
# A remount is forced on every run of this cell.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Folder on the mounted Google Drive that holds the processed Penn-Fudan files.
# Adjust this path if your own Drive is organised differently.
DATA_FOLDER = (
    "/content/drive/MyDrive/TAED2-PedestrianDetection"
    "/Datasets/PennFudan/Processed data"
)

In [None]:
%%shell

# Stop at the first failing command instead of silently carrying on
# (e.g. copying files from an unexpected checkout).
set -e

# The TorchVision repo is downloaded to use some of its detection reference files.
# The clone is skipped when ./vision already exists, so re-running this cell no
# longer prints "fatal: destination path 'vision' already exists".
if [ ! -d vision ]; then
  git clone https://github.com/pytorch/vision.git
fi
cd vision

# Use the reference files from a fixed release tag.
git checkout v0.15.1

# Copy the reference helpers into the parent directory (the directory this
# cell started in).
for helper in utils transforms coco_eval engine coco_utils; do
  cp "references/detection/${helper}.py" ../
done

fatal: destination path 'vision' already exists and is not an empty directory.
HEAD is now at 42759b1cc8 Version number bump for vision (#7419)




In [None]:
# Add the data folder to the Python import path, presumably so that modules
# stored alongside the data (e.g. PedestrianDatasetClass) can be imported.
import sys

# Guarded so that re-running this cell does not append duplicate entries.
if DATA_FOLDER not in sys.path:
    sys.path.append(DATA_FOLDER)

In [None]:
# Load the preprocessed training / validation / testing datasets, each stored
# as its own pickle file inside DATA_FOLDER.

import os
import pickle
from PedestrianDatasetClass import PedestrianDataset  # Import the required class (presumably needed so the pickles can be restored)


def _load_pickled_dataset(file_name):
    """Unpickle and return the object stored in ``DATA_FOLDER/file_name``.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading, so this
    must only be used on trusted files.

    Args:
        file_name: Name of the pickle file located directly inside DATA_FOLDER.

    Returns:
        The object that was stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(os.path.join(DATA_FOLDER, file_name), 'rb') as file:
        return pickle.load(file)


training_dataset = _load_pickled_dataset('training_dataset.pkl')
validation_dataset = _load_pickled_dataset('validation_dataset.pkl')
testing_dataset = _load_pickled_dataset('testing_dataset.pkl')



In [None]:
# Sanity check: display the sample at index 2 of the loaded training dataset.
training_dataset[2]

(tensor([[[0.5294, 0.3373, 0.3098,  ..., 0.5725, 0.5216, 0.4706],
          [0.3922, 0.2784, 0.3059,  ..., 0.5647, 0.5098, 0.4627],
          [0.2824, 0.2353, 0.2863,  ..., 0.5608, 0.5059, 0.4588],
          ...,
          [0.5569, 0.5569, 0.5412,  ..., 0.4510, 0.4510, 0.4510],
          [0.5490, 0.5451, 0.5373,  ..., 0.4392, 0.4392, 0.4392],
          [0.5333, 0.5373, 0.5333,  ..., 0.4353, 0.4353, 0.4353]],
 
         [[0.6314, 0.4392, 0.4118,  ..., 0.5686, 0.5176, 0.4667],
          [0.4941, 0.3804, 0.4078,  ..., 0.5608, 0.5059, 0.4588],
          [0.3765, 0.3294, 0.3804,  ..., 0.5569, 0.5020, 0.4549],
          ...,
          [0.5608, 0.5608, 0.5451,  ..., 0.4549, 0.4549, 0.4549],
          [0.5529, 0.5490, 0.5412,  ..., 0.4431, 0.4431, 0.4431],
          [0.5373, 0.5412, 0.5373,  ..., 0.4392, 0.4392, 0.4392]],
 
         [[0.5255, 0.3333, 0.3059,  ..., 0.5529, 0.5020, 0.4510],
          [0.3961, 0.2745, 0.3098,  ..., 0.5451, 0.4902, 0.4431],
          [0.2902, 0.2353, 0.2941,  ...,

## Set up MLflow

In [None]:
!pip install mlflow



In [None]:
import mlflow
import mlflow.pytorch
import mlflow.sklearn
from mlflow import log_metric, log_param, log_params, log_artifacts

In [None]:
!pip install python-dotenv



In [None]:
# clone the repository to load the .env file
!git clone https://github.com/MLOps-essi-upc/taed2-PedestrianDetection.git


fatal: destination path 'taed2-PedestrianDetection' already exists and is not an empty directory.


In [None]:
import os
from dotenv import load_dotenv

# Configure MLflow tracking from the variables defined in a .env file.

# Location of the .env file inside the cloned project repository.
# NOTE(review): assumes the repository was cloned into the current working
# directory and ships a .env at its root -- confirm.
_repo_env_file = os.path.join("taed2-PedestrianDetection", ".env")

# Load environment variables from .env. Prefer the file inside the cloned
# repository; otherwise fall back to python-dotenv's default lookup, as before.
if os.path.isfile(_repo_env_file):
    load_dotenv(dotenv_path=_repo_env_file)
else:
    load_dotenv()

# Run your MLflow script (script.py)
# NOTE(review): the exit status is ignored, so a missing script.py goes
# unnoticed. This looks like template leftover -- confirm it is needed.
os.system("python script.py")

# Fail early with an actionable message instead of a bare KeyError when the
# .env file was not found or is incomplete.
_required_vars = (
    "MLFLOW_TRACKING_URI",
    "MLFLOW_TRACKING_USERNAME",
    "MLFLOW_TRACKING_PASSWORD",
)
_missing_vars = [name for name in _required_vars if name not in os.environ]
if _missing_vars:
    raise KeyError(
        "Missing MLflow environment variable(s): " + ", ".join(_missing_vars)
        + ". Check that the .env file was found and defines them."
    )

# Set the MLflow tracking URI and authentication variables
mlflow_tracking_uri = os.environ["MLFLOW_TRACKING_URI"]
mlflow_username = os.environ["MLFLOW_TRACKING_USERNAME"]
mlflow_password = os.environ["MLFLOW_TRACKING_PASSWORD"]

# Set up MLflow with the loaded tracking URI. The username and password are
# not passed explicitly here; they stay available in the environment.
mlflow.set_tracking_uri(mlflow_tracking_uri)

KeyError: ignored

In [None]:
# Check that the tracking connection works by logging one run to a 'prova'
# experiment.

# For demonstration purposes a simple model is trained on the Iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Select (or create) the MLflow experiment that receives the run.
mlflow.set_experiment('prova')

# The context manager ends the run even if training or logging raises, so a
# failed attempt does not leave an active run behind that would make the next
# execution of this cell fail.
with mlflow.start_run():
    # Load the Iris dataset
    iris = load_iris()
    X, y = iris.data, iris.target

    # Split the data into training and testing sets (fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a simple RandomForestClassifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Log the model and metrics in MLflow
    mlflow.sklearn.log_model(model, "random_forest_model")
    mlflow.log_metric("accuracy", accuracy)


2023/10/01 08:48:04 INFO mlflow.tracking.fluent: Experiment with name 'prova' does not exist. Creating a new experiment.
