### Install packages

In [None]:
%pip install openai==1.12.0

### Required packages

In [None]:
import openai
from openai import AzureOpenAI
import json
from IPython import get_ipython
from IPython.terminal.interactiveshell import TerminalInteractiveShell
import uuid
import mlflow

from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.ml import Pipeline

from synapse.ml.isolationforest import *

from synapse.ml.explainers import *

%matplotlib inline

from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType

### Set up Azure OpenAI connection
###### Reference: https://community.fabric.microsoft.com/t5/Hack-Together/Fabric-cant-import-AzureOpenAI/m-p/3703267#M66

In [None]:
import os

# Build the Azure OpenAI client used by the cells below.
#
# The endpoint and key are looked up in the environment first so that real
# secrets never have to be typed into (and saved with) the notebook. When the
# variables are not set, the original placeholder strings are used and must be
# replaced by hand before the request cell will work.
client = AzureOpenAI(
    # Endpoint of the Azure OpenAI resource - for the competition it was
    # https://polite-ground-030dc3103.4.azurestaticapps.net/api/v1
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "ADD ENDPOINT"),
    # API key of the resource
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", "YOUR API KEY"),
    api_version="2023-09-01-preview",
)

In [None]:
# Schema check: read the Fact_Sales Delta table into `df` and show the
# (column name, type) pairs as the cell output.
df = (
    spark.read
    .format("delta")
    .load("abfss://MS_Fabric_Hackathon@onelake.dfs.fabric.microsoft.com/Hackathon.Lakehouse/Tables/Fact_Sales")
)
df.dtypes

### Get Data

In [None]:
# Build the per-date sales series that is later embedded in the prompt.
#   1. Load the Fact_Sales Delta table.
#   2. Coerce Quantity / Unit_Price to numeric types.
#   3. Aggregate per Date: total quantity and quantity-weighted average price.
#   4. Cast the result columns, sort by Date and collect to pandas.
# The cell reloads the table itself, so it can be re-run on its own.
df = spark.read.format("delta").load(
    "abfss://MS_Fabric_Hackathon@onelake.dfs.fabric.microsoft.com/Hackathon.Lakehouse/Tables/Fact_Sales")

# Ensure 'Quantity' and 'Unit_Price' are treated as numeric types
df = (
    df.withColumn("Quantity", F.col("Quantity").cast(IntegerType()))
    .withColumn("Unit_Price", F.col("Unit_Price").cast(FloatType()))
)

# One row per Date. Unit_Price becomes sum(Quantity * Unit_Price) / sum(Quantity),
# i.e. the average price weighted by the quantity sold.
df = df.groupBy("Date").agg(
    F.sum("Quantity").alias("Quantity"),
    (
        F.sum(F.col("Quantity") * F.col("Unit_Price")) / F.sum("Quantity")
    ).alias("Unit_Price"),
)

# Cast first, sort last: ordering on the DateType column gives a chronological
# order regardless of how Date is stored in the table (a text date such as
# "2023-1-10" would otherwise sort before "2023-1-2"), and keeping the sort as
# the final step means nothing runs between it and the collect.
df_adjusted = (
    df.withColumn("Date", col("Date").cast(DateType()))
    .withColumn("Quantity", col("Quantity").cast(IntegerType()))
    .withColumn("Unit_Price", col("Unit_Price").cast(FloatType()))
    .select("Date", "Quantity", "Unit_Price")
    .orderBy("Date")
)

# Convert Spark DataFrame to Pandas DataFrame
pd_df_adjusted = df_adjusted.toPandas()

# Last expression of the cell: display the resulting frame
pd_df_adjusted

### Create prompt

In [None]:
# First prompt variant: ask the model to run the Isolation Forest itself and
# report the results. The prompt is a combination of 3 parts: a lead-in, the
# data, and the instructions.

prompt_1 = "Given this data: "
# to_string() renders every row. Formatting the DataFrame directly inside the
# f-string would use its repr, which pandas shortens to the first and last few
# rows once the frame exceeds the display.max_rows option - the model would
# then be asked to analyse data it never received.
prompt_2 = pd_df_adjusted.to_string()
prompt_3 = """
Please run an Isolation Forest model with the parameters below and share the results.

contamination=0.1, 
n_estimators=16,
max_samples=210, 
max_features=1.0
trainingStartTime = "2023-01-01"
trainingEndTime = "2023-07-31"
inferenceStartTime = "2023-08-01"
inferenceEndTime = "2023-08-31"

"""

# Combine the 3 into a single text
PROMPT = f"{prompt_1}\n\n{prompt_2}\n\n{prompt_3}"

print(PROMPT)

In [None]:
# Code-generation prompt variant: ask the model for Python code only, which a
# later cell executes. The prompt is a combination of 3 parts: a lead-in, a
# view of the data, and the instructions.
#
# Two fixes to the instruction text:
#   * n_estimators used to read "100, 16", which states two different values;
#     it now states 16 only, the value used by the other prompt cell.
#   * The results cell reads the variables `inference_data` and `anomalies`,
#     so the prompt now asks for exactly those names instead of relying on the
#     model happening to choose them.

prompt_1 = "Given this data: "
# The DataFrame is formatted through its repr here, which pandas may shorten
# for long frames. The instructions below tell the model to work on the
# pd_df_adjusted variable itself, so this text only has to show the columns
# and a sample of values.
prompt_2 = pd_df_adjusted
prompt_3 = """
Can you provide the code to show the Isolation Forest model results with these parameters. 
Only provide the python code in your response, please don't include any words different from the code.Please exclude any comments in the response, just provide the code

Please consider these parameters for the model:
contamination=0.1, 
n_estimators=16,
max_samples=210, 
max_features=1.0
trainingStartTime = "2023-01-01"
trainingEndTime = "2023-07-31"
inferenceStartTime = "2023-08-01"
inferenceEndTime = "2023-08-31"

Please ensure all arrays provided are of the same length
Please use the data from the dataframe pd_df_adjusted provided in the prompt, do not try to load or add other data.
To run the model, split the data for training and test, according to the start and end times provided
Store the rows of the inference period in a pandas DataFrame named inference_data and the model predictions for those rows (-1 for outliers, 1 for inliers) in a variable named anomalies.

"""

# Combine the 3 into a single text
PROMPT = f"{prompt_1}\n\n{prompt_2}\n\n{prompt_3}"

print(PROMPT)

### Get AzureOpenAI response

In [None]:
# Send PROMPT to Azure OpenAI as a single user message and keep the text of
# the first returned choice in `response`.
MESSAGES = [{"role": "user", "content": PROMPT}]

completion = client.chat.completions.create(
    # model and temperature adjusted as suggested here: https://www.reddit.com/r/ChatGPTCoding/comments/12i6k06/best_temperature_for_gpt4_api_to_get_quality/
    model="gpt-4",
    messages=MESSAGES,
    temperature=0.1,
)

response = completion.choices[0].message.content
print(response)

### Execute response

In [None]:
import re

# SECURITY NOTE: exec() runs whatever text the model returned, with full access
# to this notebook's variables and environment. Read the printed code before
# trusting its results, and do not run this cell on a reply that was produced
# from untrusted input.


def _extract_python_code(text):
    """Return the Python source contained in a chat reply.

    If the reply contains complete Markdown code fences, only the contents of
    the fences that are untagged or tagged python / python3 / py are returned
    (joined with newlines), so explanatory prose around the code is dropped
    instead of being executed. Otherwise every fence marker line is stripped
    and the remaining text is returned, which also covers replies that are
    plain code and replies whose closing fence is missing.

    Raises:
        ValueError: if the reply is None or empty.
    """
    if not text:
        raise ValueError("The model returned an empty response; nothing to execute.")

    # Each match is (info string after the opening fence, fenced body).
    fenced = re.findall(r"```([^\n`]*)\n(.*?)```", text, flags=re.DOTALL)
    python_blocks = [
        body
        for tag, body in fenced
        if tag.strip().lower() in ("", "python", "python3", "py")
    ]
    if python_blocks:
        return "\n".join(python_blocks)

    # No usable fenced block: remove the fence markers (with any language tag
    # on the same line) and keep everything else.
    return re.sub(r"```[^\n`]*\n?", "", text)


cleaned_response = _extract_python_code(response)
print(cleaned_response)
# Runs at cell level so the names the generated code defines stay available to
# the following cells.
exec(cleaned_response)

### Display results

In [None]:
import numpy as np

# `inference_data` and `anomalies` are not defined in this notebook's own
# cells; they are expected to come from the generated code executed above.

# Attach the raw predictions to the inference DataFrame.
# Note: the predictions are -1 for outliers and 1 for inliers.
inference_data['Anomaly'] = anomalies

# Replace the numeric flag with a readable label: -1 becomes 'Outlier', any
# other value becomes 'Inlier'.
is_outlier = inference_data['Anomaly'] == -1
inference_data['Anomaly'] = np.where(is_outlier, 'Outlier', 'Inlier')

# Show the inference rows together with their anomaly label
print(inference_data)