### Install packages

In [None]:
# Install the OpenAI SDK, pinned to version 1.12.0, which provides the
# AzureOpenAI client class imported in the next cell.
%pip install openai==1.12.0

### Required packages

In [None]:
import openai
from openai import AzureOpenAI
import json
from IPython import get_ipython
from IPython.terminal.interactiveshell import TerminalInteractiveShell
import uuid
import mlflow

from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
from pyspark.ml import Pipeline

from synapse.ml.isolationforest import *

from synapse.ml.explainers import *

%matplotlib inline

from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType

StatementMeta(, , , Waiting, )

### Set up Azure OpenAI connection
###### Reference: https://community.fabric.microsoft.com/t5/Hack-Together/Fabric-cant-import-AzureOpenAI/m-p/3703267#M66

In [None]:
import os

# Build the Azure OpenAI client used for every chat completion in this notebook.
#
# The endpoint and key are read from the environment so that no secret is ever
# stored in (or committed with) the notebook. Set both variables before running
# this cell; a missing variable fails immediately with a KeyError naming it:
#   AZURE_OPENAI_ENDPOINT - for the competition it was
#       https://polite-ground-030dc3103.4.azurestaticapps.net/api/v1
#   AZURE_OPENAI_API_KEY  - the API key issued for that endpoint
client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2023-09-01-preview",
)

StatementMeta(, , , Waiting, )

In [None]:
# Read the Fact_Sales Delta table from the Lakehouse and list the data type of
# each column. `spark` is not created in this notebook; it is presumably the
# session object supplied by the notebook runtime.
fact_sales_path = "abfss://MS_Fabric_Hackathon@onelake.dfs.fabric.microsoft.com/Hackathon.Lakehouse/Tables/Fact_Sales"
delta_reader = spark.read.format("delta")
df = delta_reader.load(fact_sales_path)
df.dtypes

### Get data

In [None]:
# Load the raw sales fact table (Delta format) from the Lakehouse.
sales_raw = spark.read.format("delta").load(
    "abfss://MS_Fabric_Hackathon@onelake.dfs.fabric.microsoft.com/Hackathon.Lakehouse/Tables/Fact_Sales")

# Make sure 'Quantity' and 'Unit_Price' are numeric before they are aggregated.
sales_typed = (
    sales_raw
    .withColumn("Quantity", F.col("Quantity").cast(IntegerType()))
    .withColumn("Unit_Price", F.col("Unit_Price").cast(FloatType()))
)

# Collapse to one row per Date:
#   Quantity   - total quantity for that date
#   Unit_Price - quantity-weighted average price, sum(Quantity * Unit_Price) / sum(Quantity)
df = sales_typed.groupBy("Date").agg(
    F.sum("Quantity").alias("Quantity"),
    (F.sum(F.col("Quantity") * F.col("Unit_Price")) / F.sum("Quantity")).alias("Unit_Price"),
)

# Cast the columns to their final types and only THEN sort. The stored type of
# 'Date' is not visible in this notebook; sorting after the cast to DateType
# guarantees chronological order even if the column is stored as text.
df_adjusted = (
    df
    .withColumn("Date", col("Date").cast(DateType()))
    .withColumn("Quantity", col("Quantity").cast(IntegerType()))
    .withColumn("Unit_Price", col("Unit_Price").cast(FloatType()))
    .select("Date", "Quantity", "Unit_Price")
    .orderBy("Date")
)

# Convert the Spark DataFrame to a pandas DataFrame; this is the frame that the
# prompt refers to and that the generated model code works on.
pd_df_adjusted = df_adjusted.toPandas()

pd_df_adjusted

### Create prompt

In [None]:
# The prompt is assembled from 3 parts:
#   prompt_1 - a short lead-in
#   prompt_2 - the pandas DataFrame; inside the f-string below it is rendered
#              with its text repr (see the printed prompt for the exact form)
#   prompt_3 - the modelling instructions
#
# The generated code is executed in this notebook and a later cell reads the
# variables `test_data` and `predictions`, so the instructions name both
# explicitly instead of leaving the variable names up to the model.

prompt_1 = "Given this data: "
prompt_2 = pd_df_adjusted
prompt_3 = """
Please provide the code to run an Isolation Forest model with the parameters below.

contamination=0.1, 
n_estimators=16,
max_samples=210, 
max_features=1.0
trainingStartTime = "2023-01-01"
trainingEndTime = "2023-07-31"
inferenceStartTime = "2023-08-01"
inferenceEndTime = "2023-08-31"

Only provide the python code in your response, please don't include any words different from the code. Please exclude any comments in the response, just provide the code
Please ensure all arrays provided are of the same length
Please use the data from the dataframe pd_df_adjusted provided in the prompt, do not try to load or add other data.
To run the model, split the data for training and test, according to the start and end times provided. Make sure to use and define the parameters detailed above.
Please store the inference-period rows in a dataframe named test_data, and store the model results for those rows in a variable named predictions, with one prediction per row of test_data.

"""

# Combine the 3 parts into a single text, separated by blank lines
PROMPT = f"{prompt_1}\n\n{prompt_2}\n\n{prompt_3}"

print(PROMPT)

StatementMeta(, , , Waiting, )

Given this data: 

           Date  Quantity  Unit_Price
0    2023-01-01      4893   50.592808
1    2023-01-02      4642   50.592224
2    2023-01-03      4832   49.851822
3    2023-01-04      4726   49.725010
4    2023-01-05      4859   48.917988
..          ...       ...         ...
360  2023-12-27      4574   48.591583
361  2023-12-28      4649   49.971371
362  2023-12-29      4323   51.576405
363  2023-12-30      4674   49.034725
364  2023-12-31      4476   49.054333

[365 rows x 3 columns]


Please provide the code to run an Isolation Forest model with the parameters below.

contamination=0.1, 
n_estimators=16,
max_samples=210, 
max_features=1.0
trainingStartTime = "2023-01-01"
trainingEndTime = "2023-07-31"
inferenceStartTime = "2023-08-01"
inferenceEndTime = "2023-08-31"

Only provide the python code in your response, please don't include any words different from the code.Please exclude any comments in the response, just provide the code
Please ensure all arrays provided are of t

### Get AzureOpenAI response

In [None]:
# Send the prompt to Azure OpenAI as a single user message and print the reply.
# The model and temperature were chosen following the suggestion in:
# https://www.reddit.com/r/ChatGPTCoding/comments/12i6k06/best_temperature_for_gpt4_api_to_get_quality/
MESSAGES = [{"role": "user", "content": PROMPT}]
completion = client.chat.completions.create(
    model="gpt-4",
    messages=MESSAGES,
    temperature=0.1,
)
# Only the text of the first returned choice is used.
first_choice = completion.choices[0]
response = first_choice.message.content
print(response)

StatementMeta(, , , Waiting, )

import pandas as pd
from sklearn.ensemble import IsolationForest

# Define parameters
contamination=0.1
n_estimators=16
max_samples=210
max_features=1.0
trainingStartTime = "2023-01-01"
trainingEndTime = "2023-07-31"
inferenceStartTime = "2023-08-01"
inferenceEndTime = "2023-08-31"

# Convert Date column to datetime
pd_df_adjusted['Date'] = pd.to_datetime(pd_df_adjusted['Date'])

# Split data into training and test sets
training_data = pd_df_adjusted[(pd_df_adjusted['Date'] >= trainingStartTime) & (pd_df_adjusted['Date'] <= trainingEndTime)]
test_data = pd_df_adjusted[(pd_df_adjusted['Date'] >= inferenceStartTime) & (pd_df_adjusted['Date'] <= inferenceEndTime)]

# Define features and target
X_train = training_data[['Quantity', 'Unit_Price']]
X_test = test_data[['Quantity', 'Unit_Price']]

# Initialize and fit the model
model = IsolationForest(contamination=contamination, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features)
model.fit(X_train)

# Make prediction

### Execute response

In [None]:
def _strip_code_fences(text):
    """Return ``text`` with Markdown code-fence markup removed.

    Every line whose first non-blank characters are three backticks (an opening
    fence, with or without a language tag, or a closing fence) is dropped as a
    whole. Any three-backtick sequence left on another line is removed as well.
    All remaining text is kept unchanged, so code that merely contains the word
    "python" is never altered.
    """
    fence = "`" * 3
    kept_lines = [
        line.replace(fence, "")
        for line in text.splitlines()
        if not line.lstrip().startswith(fence)
    ]
    return "\n".join(kept_lines)


# Fail early with a clear message when the model returned nothing to execute.
if not response:
    raise ValueError("The model returned no content; there is no code to execute.")

cleaned_response = _strip_code_fences(response)

print(cleaned_response)

# SECURITY: exec() runs model-generated code with full access to this kernel.
# Review the code printed above before relying on its results, and do not point
# this cell at responses built from untrusted input.
# exec() is deliberately called at cell level so the variables the generated
# code defines are created in the notebook namespace for the following cells.
exec(cleaned_response)

StatementMeta(, , , Waiting, )

import pandas as pd
from sklearn.ensemble import IsolationForest

# Define parameters
contamination=0.1
n_estimators=16
max_samples=210
max_features=1.0
trainingStartTime = "2023-01-01"
trainingEndTime = "2023-07-31"
inferenceStartTime = "2023-08-01"
inferenceEndTime = "2023-08-31"

# Convert Date column to datetime
pd_df_adjusted['Date'] = pd.to_datetime(pd_df_adjusted['Date'])

# Split data into training and test sets
training_data = pd_df_adjusted[(pd_df_adjusted['Date'] >= trainingStartTime) & (pd_df_adjusted['Date'] <= trainingEndTime)]
test_data = pd_df_adjusted[(pd_df_adjusted['Date'] >= inferenceStartTime) & (pd_df_adjusted['Date'] <= inferenceEndTime)]

# Define features and target
X_train = training_data[['Quantity', 'Unit_Price']]
X_test = test_data[['Quantity', 'Unit_Price']]

# Initialize and fit the model
model = IsolationForest(contamination=contamination, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features)
model.fit(X_train)

# Make prediction





In [None]:
# Readable labels for the prediction values: -1 (outlier) and 1 (inlier).
anomaly_labels = {-1: "Outlier", 1: "Inlier"}

# `test_data` and `predictions` are not defined by any notebook cell; they are
# expected to be created by the model-generated code executed earlier.
# assign() returns a new DataFrame instead of writing into `test_data` in
# place, which avoids pandas' SettingWithCopyWarning when `test_data` is a
# slice of another frame. Re-running the cell fails with a length error because
# `test_data` is now a fresh frame with its own index.
test_data = (
    test_data
    .assign(Anomaly=predictions)  # raw -1 / 1 predictions, one per row
    .assign(Anomaly=lambda scored: scored["Anomaly"].map(anomaly_labels))
)

# Display the test_data DataFrame with the added Anomaly column
print(test_data)

StatementMeta(, , , Waiting, )

          Date  Quantity  Unit_Price  Anomaly
212 2023-08-01      4428   49.069489   Inlier
213 2023-08-02      4224   48.077393  Outlier
214 2023-08-03      4571   49.976353   Inlier
215 2023-08-04      4935   51.300648  Outlier
216 2023-08-05      4317   50.990826   Inlier
217 2023-08-06      4234   50.084789   Inlier
218 2023-08-07      4548   50.882629   Inlier
219 2023-08-08      4787   50.120033   Inlier
220 2023-08-09      4468   48.766651   Inlier
221 2023-08-10      4267   48.059292  Outlier
222 2023-08-11      4893   51.206539  Outlier
223 2023-08-12      4408   48.359188   Inlier
224 2023-08-13      4159   50.247944  Outlier
225 2023-08-14      4407   49.460243   Inlier
226 2023-08-15      4667   51.245384   Inlier
227 2023-08-16      4646   50.199764   Inlier
228 2023-08-17      4816   50.666508   Inlier
229 2023-08-18      4457   49.744469   Inlier
230 2023-08-19      4731   50.206150   Inlier
231 2023-08-20      4442   50.660851   Inlier
232 2023-08-21      4372   47.9027

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Anomaly'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Anomaly'] = test_data['Anomaly'].map({-1: 'Outlier', 1: 'Inlier'})
