# Remark:
This code only needs to be run once.

In [21]:
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import mlflow
import datetime
import pickle
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from arize.pandas.logger import Client, Schema
import datetime as dt
from arize.utils.types import ModelTypes, Environments
warnings.filterwarnings("ignore")

In [2]:
import os

# Version label for this notebook run (not referenced by the other visible cells).
version = "v2.0"
# Location of the raw movies CSV. When the MOVIES_DATA_URL environment variable is set
# it takes precedence, so the notebook can run on a machine with a different layout;
# otherwise the original local path is used.
data_url = os.getenv(
    "MOVIES_DATA_URL",
    r'C:\work\Studies\Finalyear\1stsemester\Bassem\repo_clone\MLOps-Training\movies.csv',
)


In [3]:
import sys

# Make the modules in backend/src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# prepend the same directory to sys.path again.
backend_src_dir = r'C:\work\Studies\Finalyear\1stsemester\Bassem\repo_clone\MLOps-Training\backend\src'
if backend_src_dir not in sys.path:
    sys.path.insert(0, backend_src_dir)

from data_preprocessing_training import clean_data
from clean_data_json import clean_data_json

In [4]:
from data_preprocessing_monitoring import transform_data
from clean_data_json import clean_data_json

In [5]:
from dotenv import load_dotenv
import os

# The path must be a raw string: in a normal string literal the "\1", "\r" and "\b"
# inside this Windows path are escape sequences (octal 1, carriage return, backspace),
# which silently corrupts the path so the .env file is never found.
load_dotenv(r"C:\work\Studies\Finalyear\1stsemester\Bassem\repo_clone\MLOps-Training\backend\src\.env")

# DagsHub credentials read from the environment; each is None when the variable is unset.
DagsHub_username = os.getenv("DagsHub_username")
DagsHub_token=os.getenv("DagsHub_token")

In [6]:
# Configure MLflow: point it at the remote tracking server, then select the experiment.
tracking_uri = 'https://dagshub.com/aymenalimii4070/Ml_OPS_Movies.mlflow'  # your MLflow tracking URI
experiment_name = "Movies-Rating-experiment-3"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/4472f92fb5c7414ca726f371156b38d7', creation_time=1734289418805, experiment_id='2', last_update_time=1734289418805, lifecycle_stage='active', name='Movies-Rating-experiment-3', tags={}>

In [7]:
# Load the raw CSV located at data_url into a DataFrame
raw_train = pd.read_csv(filepath_or_buffer=data_url)

In [8]:
# Preview the first three rows of the raw data
raw_train.head(n=3)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,


In [9]:
! pip install scipy==1.10.1



In [10]:
# Cleaning and preprocessing: transform_data (imported from data_preprocessing_monitoring)
# receives the raw frame and its two return values are unpacked into X and y.
# NOTE(review): presumably X holds the model features and y the target — confirm in transform_data.
X,y = transform_data(raw_train)

In [11]:
# Find the best logged run across all MLflow experiments and load its model.
import mlflow.pyfunc

# Collect every experiment id so the search is not limited to the active experiment.
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
# Only runs whose test F1 score is strictly below 1 are considered.
df_mlflow = mlflow.search_runs(experiment_ids=all_experiments,filter_string="metrics.F1_score_test <1")
if df_mlflow.empty:
    # Without this guard an empty search fails later with an opaque KeyError/ValueError.
    raise RuntimeError(
        "No MLflow run with metrics.F1_score_test < 1 was found; train and log a model first."
    )
# Single .loc lookup (row label, column) instead of chained indexing.
run_id = df_mlflow.loc[df_mlflow['metrics.F1_score_test'].idxmax(), 'run_id']

# The model is loaded from the artifacts of that run (artifact path "ML_models"),
# not from the model registry.
logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)

Downloading artifacts: 100%|██████████| 9/9 [00:01<00:00,  7.68it/s]  
 - psutil (current: 6.1.0, required: psutil==5.9.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.xgboost
  run_id: c3ddc94efcb44cb986e0bda97a54ebed



## Transform the training data before sending it to Arize AI :

In [12]:
# Columns kept for the baseline frame sent to Arize; 'is_fraud' is the column that a
# later cell renames to 'actual_label'.
# NOTE(review): none of these names appear in the movies.csv columns displayed above,
# which matches the KeyError recorded for the next cell — confirm the intended columns.
selected_cols = [
    'category',
    'amt',
    'gender',
    'zip',
    'lat',
    'long',
    'dob',
    'merch_lat',
    'merch_long',
    "trans_date_trans_time",
    'is_fraud',
]

In [13]:
# Fail with a readable message when requested columns are absent from the raw frame.
missing_cols = [col for col in selected_cols if col not in raw_train.columns]
if missing_cols:
    raise KeyError(
        f"raw_train has no column(s) {missing_cols}; available columns: {list(raw_train.columns)}"
    )
# .copy() gives baseline its own data, so the column assignments and in-place drops
# in the following cells do not operate on a slice of raw_train (SettingWithCopyWarning).
baseline = raw_train[selected_cols].copy()

KeyError: "None of [Index(['category', 'amt', 'gender', 'zip', 'lat', 'long', 'dob', 'merch_lat',\n       'merch_long', 'trans_date_trans_time', 'is_fraud'],\n      dtype='object')] are in the [columns]"

In [20]:
# Convert the trans_date_trans_time column with pd.to_datetime and store it back
parsed_trans_time = pd.to_datetime(baseline["trans_date_trans_time"])
baseline["trans_date_trans_time"] = parsed_trans_time

In [21]:

# Age of the card holder: current calendar year minus the year of the 'dob' column
current_year = dt.date.today().year
birth_year = pd.to_datetime(baseline['dob']).dt.year
baseline['age'] = current_year - birth_year
# 'dob' is removed once the age has been derived from it
baseline.drop(columns=["dob"], inplace=True)

In [23]:
# Remove the timestamp column from the baseline frame (in place)
baseline.drop(columns=["trans_date_trans_time"], inplace=True)

In [24]:
# Rename the ground-truth column to the 'actual_label' name used by the Arize schema below
baseline.rename({'is_fraud': 'actual_label'}, axis="columns", inplace=True)

In [32]:
# Text names for the two binary label values
transform_bin_str = {0: 'non_fraud', 1: 'fraud'}
# Replace the 0/1 actual labels with their text names
actual_as_text = baseline['actual_label'].map(transform_bin_str)
baseline['actual_label'] = actual_as_text

In [18]:
# Predictions of the loaded MLflow model for X (the first value returned by transform_data).
preds = loaded_model.predict(X)

In [26]:
# Store the model predictions as a new column of the baseline frame.
# NOTE(review): assumes preds has exactly one entry per baseline row, in the same order — confirm.
baseline['prediction_label'] = preds

In [33]:
# Replace the predicted values with their text names via transform_bin_str
predicted_as_text = baseline['prediction_label'].map(transform_bin_str)
baseline['prediction_label'] = predicted_as_text

In [27]:
import uuid
# Prediction ID is required for all datasets
def generate_prediction_ids(X):
    """Return a Series of random UUID4 strings, one per row of ``X``.

    Args:
        X: Object supporting ``len()`` and exposing an ``index`` attribute
            (for example a DataFrame).

    Returns:
        pandas.Series of ``str(uuid.uuid4())`` values carrying ``X.index`` as its index.
    """
    row_count = len(X)
    fresh_ids = [str(uuid.uuid4()) for _ in range(row_count)]
    return pd.Series(fresh_ids, index=X.index)

In [28]:
# One freshly generated UUID string per row (see generate_prediction_ids), aligned on baseline's index.
baseline["prediction_id"]=generate_prediction_ids(baseline)

In [34]:
# Preview the first three rows of the frame that will be sent to Arize.
# NOTE(review): the recorded output lists year/month/day/hour/sec columns that no visible
# cell creates — re-run the notebook from a fresh kernel to refresh it.
baseline.head(n=3)

Unnamed: 0,category,amt,gender,zip,lat,long,merch_lat,merch_long,actual_label,age,year,month,day,hour,sec,prediction_label,prediction_id
0,misc_net,4.97,F,28654,36.0788,-81.1781,36.011293,-82.048315,non_fraud,36,2019,1,1,0,18,non_fraud,b638c4f4-d612-43c2-9b94-6c8af634c44c
1,grocery_pos,107.23,F,99160,48.8878,-118.2105,49.159047,-118.186462,non_fraud,46,2019,1,1,0,44,non_fraud,cc50ea0f-d36e-43cd-9d7f-abd4f957a648
2,entertainment,220.11,M,83252,42.1808,-112.262,43.150704,-112.154481,non_fraud,62,2019,1,1,0,51,non_fraud,37cd7fd3-67f3-4157-aa8a-b6668060f932


## Setup Arize AI :

In [18]:
import os

# Arize credentials are read from the environment (for example via the .env file loaded
# earlier in this notebook) so that no secret is stored in the notebook itself.
# NOTE(review): the space/API key that used to be hard-coded in this cell has been exposed
# and should be revoked and re-issued in Arize.
SPACE_KEY = os.getenv("ARIZE_SPACE_KEY")
API_KEY = os.getenv("ARIZE_API_KEY")

model_id = (
    "fraud-detector-model"  # This is the model name that will show up in Arize
)
model_version = "v2"  # Version of model - can be any string

# Fail fast, before the client is built, when a key is missing or still a placeholder.
placeholder_values = {None, "", "...", "SPACE_KEY", "API_KEY"}
if SPACE_KEY in placeholder_values or API_KEY in placeholder_values:
    raise ValueError(
        "❌ NEED TO CHANGE SPACE AND/OR API_KEY: set the ARIZE_SPACE_KEY and ARIZE_API_KEY environment variables"
    )

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)
print("✅ Arize setup complete!")

NameError: name 'register_otel' is not defined

In [35]:
import os

# Credentials come from the environment; "..." is only a placeholder default and is
# rejected by the check below, so the cell cannot report success with dummy keys.
SPACE_KEY = os.getenv("ARIZE_SPACE_KEY", "...")
API_KEY = os.getenv("ARIZE_API_KEY", "...")

model_id = (
    "fraud-detector-model"  # This is the model name that will show up in Arize
)
model_version = "v2"  # Version of model - can be any string

# Validate before building the client: empty values and every known placeholder are refused.
placeholder_values = {"", "...", "SPACE_KEY", "API_KEY"}
if SPACE_KEY in placeholder_values or API_KEY in placeholder_values:
    raise ValueError(
        "❌ NEED TO CHANGE SPACE AND/OR API_KEY: set the ARIZE_SPACE_KEY and ARIZE_API_KEY environment variables"
    )

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)
print("✅ Arize setup complete!")

✅ Arize setup complete!


## Send train data to Arize AI :
The training data will serve as the reference (baseline) data later in production.

In [36]:
# Every column except the id / prediction / actual columns is sent to Arize as a feature.
non_feature_columns = ["prediction_id", "prediction_label", "actual_label"]
features = list(baseline.columns.drop(non_feature_columns))
# feature_column_names is bound to the very same list object as features.
feature_column_names = features

In [37]:
# Schema: tells Arize which dataframe columns hold the ids, predictions, actuals and features
training_schema = Schema(
    feature_column_names=features,
    actual_label_column_name="actual_label",
    prediction_label_column_name="prediction_label",
    prediction_id_column_name="prediction_id",
)

# Upload the baseline frame under the TRAINING environment of the model
training_response = arize_client.log(
    schema=training_schema,
    environment=Environments.TRAINING,
    model_type=ModelTypes.SCORE_CATEGORICAL,
    model_version=model_version,
    model_id=model_id,
    dataframe=baseline,
)

# The server answers with status code 200 on success; anything else is reported as a failure
if training_response.status_code == 200:
    print(f"✅ You have successfully logged training set to Arize")
else:
    print(
        f"logging failed with response code {training_response.status_code}, {training_response.text}"
    )

[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjo3MjgyOktFcHc=/spaces/U3BhY2U6NzY1Njp5eHY1/models/modelName/fraud-detector-model?selectedTab=dataIngestion[0m
✅ You have successfully logged training set to Arize
