# Generative AI applications using MLflow

In [1]:
import mlflow
import os 
# from langchain.chat_models import AzureChatOpenAI
# from openai import AzureOpenAI
from langchain.schema import SystemMessage, HumanMessage
from dotenv import load_dotenv
import dagshub
import pandas as pd 
import openai 

# load_dotenv()


In [2]:
# creating test data 
eval_data=pd.DataFrame(
    {
        'inputs':[
            "what is MLflow?",
            "What is Spark?",
        ],
        "ground_truth":[
            "MLflow is an open-source platform designed to manage the machine learning (ML) lifecycle, which includes experimentation, reproducibility, and deployment of ML models. It was developed by Databricks to address the challenges of managing ML projects and has since become a widely adopted tool in the ML community.",
            "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics. It provides a fast and general-purpose engine for processing large-scale data with high efficiency and scalability."
        ]
    }
)

In [3]:
import dagshub
dagshub.init(repo_owner='Immortal-Pi', repo_name='mlflow-genAI-test', mlflow=True)


In [8]:
mlflow.set_experiment('LLM Evaluation')
openai.api_type='azure'
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_BASE"] = "https://llmops-amruth.openai.azure.com/"
os.environ["OPENAI_DEPLOYMENT_NAME"] = "gpt-4"
os.environ["OPENAI_API_VERSION"] = os.getenv('AZURE_OpenAI_API_VERSION')
os.environ["OPENAI_API_KEY"] = os.getenv('GRAPHRAG_API_KEY')
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv('GRAPHRAG_API_KEY')
with mlflow.start_run() as run:
    system_prompt='Answer the following questions in 2 sentences'
    logged_model_info=mlflow.openai.log_model(
        model='gpt-4',
        task=openai.chat.completions,
        artifact_path='model',
        messages=[
            {'role':'system','content':system_prompt},
            {'role':'user','content':"{question}"}
        ]
    )

    #use predefined question-answer metrics to evaluate our model
    results=logged_model_info=mlflow.evaluate(
        logged_model_info.model_uri,
        eval_data,
        targets='ground_truth',
        model_type='question-answering',
        extra_metrics=[mlflow.metrics.toxicity(),
                       mlflow.metrics.latency(),
                       mlflow.metrics.genai.answer_similarity(),
                       mlflow.metrics.flesch_kincaid_grade_level(), 
                       mlflow.metrics.ari_grade_level(),
                       mlflow.metrics.exact_match(),
                       mlflow.metrics.bleu(),
                       mlflow.metrics.rouge1()
                       ]
    )
    print(f'see the aggregate results below: \n{results.metrics}')

    # Evaluation result for each data record is available in results.tables
    eval_table=results.tables['eval_results_table']
    df=pd.DataFrame(eval_table)
    df.to_csv('eval.csv')
    print(f'see evaluation table below: \n{eval_table}')


Downloading artifacts: 100%|██████████| 5/5 [00:01<00:00,  4.11it/s]
2025/01/18 20:29:46 INFO mlflow.models.evaluation.evaluators.default: Computing model predictions.
2025/01/18 20:29:51 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:02<00:00,  2.23s/it]
100%|██████████| 2/2 [00:02<00:00,  1.41s/it]


see the aggregate results below: 
{'latency/mean': 2.3138840198516846, 'latency/variance': 0.5617295759253125, 'latency/p90': 2.9134729862213136, 'exact_match/v1': 0.0, 'answer_similarity/v1/mean': 4.5, 'answer_similarity/v1/variance': 0.25, 'answer_similarity/v1/p90': 4.9}


Downloading artifacts: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
Downloading artifacts: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]


see evaluation table below: 
            inputs                                       ground_truth  \
0  what is MLflow?  MLflow is an open-source platform designed to ...   
1   What is Spark?  Apache Spark is an open-source, distributed co...   

                                             outputs   latency  token_count  \
0  MLflow is an open-source platform primarily de...  3.063370           68   
1  Apache Spark is an open-source unified analyti...  1.564398           47   

   answer_similarity/v1/score  \
0                           4   
1                           5   

                  answer_similarity/v1/justification  
0  The output provided aligns closely with the ta...  
1  The provided output closely aligns with the ta...  
🏃 View run useful-lark-132 at: https://dagshub.com/Immortal-Pi/mlflow-genAI-test.mlflow/#/experiments/0/runs/89c3ebbd06c74347968ad21de6949e9d
🧪 View experiment at: https://dagshub.com/Immortal-Pi/mlflow-genAI-test.mlflow/#/experiments/0
