### Passing ICT & PRTSIR Ticket Text (summary + description) into OpenAI API
- OpenAI API key needed (supplied as `api_key` in the summarization cell below)
- kernel = mypython
- can either pass in pre-cleaned text, or uncleaned text (and clean after)

In [0]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs: load the "PRTSIR_ICT_dataset_stacked" Dataiku dataset
# into a pandas DataFrame that the rest of the notebook works on.
input_dataset = dataiku.Dataset("PRTSIR_ICT_dataset_stacked")
df = input_dataset.get_dataframe()

In [0]:
# Preview five randomly chosen rows of the input data.
df.sample(5)

In [0]:
# Total number of rows loaded from the input dataset.
len(df)

In [0]:
# Count how many rows have a 'key' beginning with each ticket prefix.
ticket_keys = df['key']
prtsir_count = ticket_keys.str.startswith('PRTSIR').sum()
ict_count = ticket_keys.str.startswith('ICT').sum()

print(f"Number of entries starting with 'PRTSIR': {prtsir_count}")
print(f"Number of entries starting with 'ICT': {ict_count}")

In [0]:
# Find the average number of tokens per row for one ticket type.
def average_tokens_per_row(df, class_column, text_column, class_value):
    """Return the mean whitespace-separated word count for one class.

    Rows are selected where ``df[class_column] == class_value``; each value
    of ``text_column`` in those rows is split on whitespace and the number of
    resulting pieces is averaged.

    Args:
        df: DataFrame holding the class and text columns.
        class_column: Name of the column holding the class value.
        text_column: Name of the column holding the text to tokenize.
        class_value: Class value whose rows are averaged.

    Returns:
        The mean number of whitespace-separated tokens per selected row.
    """
    def _word_count(text):
        return len(text.split())

    in_class = df[class_column] == class_value
    word_counts = df.loc[in_class, text_column].apply(_word_count)
    return word_counts.mean()

# Average whitespace-token count per ticket for each class.
# Label 1 is treated as PRTSIR and label 0 as ICT throughout this notebook.
avg_tokens_PRTSIR = average_tokens_per_row(df, 'label', 'Text', 1)
avg_tokens_ICT = average_tokens_per_row(df, 'label', 'Text', 0)
print("Average tokens per PRTSIR ticket:", avg_tokens_PRTSIR)
print("Average tokens per ICT ticket:", avg_tokens_ICT)

In [0]:
# Split the total token budget between the two classes in proportion to
# their average tokens per ticket:
#   class share = (total_budget / total_avg_tokens) * class average
# NOTE: despite their names, samples_PRTSIR / samples_ICT hold token
# allocations (as the print labels below say), not numbers of rows.

# NOTE(review): the dollars-to-tokens conversion below is an assumption -
# confirm it against the pricing of the model actually used.
total_budget = 18 * 1000  # Assuming $18 worth of tokens are equivalent to 18,000 tokens
total_avg_tokens = avg_tokens_PRTSIR + avg_tokens_ICT

samples_PRTSIR = int((total_budget / total_avg_tokens) * avg_tokens_PRTSIR)
samples_ICT = int((total_budget / total_avg_tokens) * avg_tokens_ICT)
print("Total tokens for PRTSIR:", samples_PRTSIR)
print("Total tokens for ICT:",samples_ICT)

In [0]:
# Convert each class's token allocation into a (fractional) number of rows
# by dividing by that class's average tokens per ticket.
# NOTE(review): in a notebook only the last expression of a cell is echoed,
# so the bare `rows_PRTSIR` line presumably displays nothing. Both values are
# recomputed as integers in the "Double Check math" cell further down.
rows_PRTSIR = samples_PRTSIR / avg_tokens_PRTSIR
rows_PRTSIR

rows_ICT = samples_ICT / avg_tokens_ICT
rows_ICT

In [0]:
def sample_class(df, class_column, class_value, n_samples, random_state=42):
    """Return a reproducible random sample of the rows belonging to one class.

    Args:
        df: DataFrame to sample from.
        class_column: Name of the column holding the class value.
        class_value: Only rows where ``df[class_column] == class_value`` are
            eligible.
        n_samples: Desired number of rows. Fractional values (e.g. the result
            of a token-budget division) are truncated to an integer, and the
            request is capped at the number of rows available in the class.
        random_state: Seed forwarded to ``DataFrame.sample``. Defaults to 42,
            the value that was previously hard-coded.

    Returns:
        A DataFrame with the sampled rows, keeping their original index.
    """
    class_rows = df[df[class_column] == class_value]
    # DataFrame.sample only accepts integer counts, and cannot draw more rows
    # than exist without replacement.
    n_samples = min(int(n_samples), len(class_rows))
    return class_rows.sample(n=n_samples, random_state=random_state)

In [0]:
## Double Check math ##
# Recomputes the budget split step by step and builds the sampled dataset
# (sampled_df) that the remaining cells operate on.

# Calculate the proportion of tokens for each class
proportion_PRTSIR = avg_tokens_PRTSIR / total_avg_tokens
proportion_ICT = avg_tokens_ICT / total_avg_tokens
print("Proportion PRTSIR:", proportion_PRTSIR)
print("Proportion ICT:", proportion_ICT)

# Allocate tokens to each class based on the proportions
tokens_PRTSIR = int(total_budget * proportion_PRTSIR)
tokens_ICT = int(total_budget * proportion_ICT)
print("tokens for PRTSIR:", tokens_PRTSIR)
print("tokens for ICT:", tokens_ICT)

# Calculate the number of rows for each class.
# NOTE(review): apart from integer truncation, both counts reduce to
# total_budget / total_avg_tokens, so the two classes get (nearly) the same
# number of rows - confirm that a balanced sample is the intent.
rows_PRTSIR = int(tokens_PRTSIR / avg_tokens_PRTSIR)
rows_ICT = int(tokens_ICT / avg_tokens_ICT)
print("num rows for PRTSIR:", rows_PRTSIR)
print("num rows for ICT:", rows_ICT)

# Sample the dataset using the calculated number of rows for each class
# (label 1 = PRTSIR, label 0 = ICT)
sampled_PRTSIR = sample_class(df, 'label', 1, rows_PRTSIR)
print("subset PRTSIR:", len(sampled_PRTSIR))
sampled_ICT = sample_class(df, 'label', 0, rows_ICT)
print("subset ICT:", len(sampled_ICT))

# Stack both samples (PRTSIR rows first) and renumber the index from 0
sampled_df = pd.concat([sampled_PRTSIR, sampled_ICT]).reset_index(drop=True)
sampled_df

In [0]:
# Drop the single longest ticket (by character count of 'Text') from each class.
text_lengths = sampled_df['Text'].str.len()
labels = sampled_df['label']
max_length_PRTSIR_index = text_lengths[labels == 1].idxmax()
max_length_ICT_index = text_lengths[labels == 0].idxmax()

longest_rows = [max_length_PRTSIR_index, max_length_ICT_index]
sampled_df = sampled_df.drop(index=longest_rows).reset_index(drop=True)
len(sampled_df)

In [0]:
#row_to_drop_PRTSIR = sampled_df[sampled_df['label'] == 1].index[0]
#row_to_drop_ICT = sampled_df[sampled_df['label'] == 0].index[0]

#sampled_df = sampled_df.drop([row_to_drop_PRTSIR, row_to_drop_ICT], axis=0).reset_index(drop=True)
#len(sampled_df)

In [0]:
# Calculate total input tokens: whitespace-separated word count of each
# ticket plus one extra token per row.
# NOTE(review): word counts only approximate model tokens, and the prompt
# prefix added by summarize_text is not included - treat this as an estimate.
input_tokens = sampled_df['Text'].apply(lambda x: len(x.split()) + 1).sum()
print("input_tokens:", input_tokens)

# Calculate the total summary tokens (from API output), assuming 50 per row.
# NOTE(review): summarize_text requests up to max_tokens=100 per call, so 50
# is an estimate rather than an upper bound - confirm.
summary_tokens = 50 * len(sampled_df)
print("summary tokens:", summary_tokens)

# Calculate the total tokens used (input + output)
total_tokens_used = input_tokens + summary_tokens
print("total tokens used:", total_tokens_used)

In [0]:
#sampled_PRTSIR = sample_class(df, 'label', 1, 5)
#sampled_ICT = sample_class(df, 'label', 0, 5)

#subset_df = pd.concat([sampled_PRTSIR, sampled_ICT]).reset_index(drop=True)
#subset_df

In [0]:
import openai

In [0]:
import string

def summarize_text(text, api_key):
    """Summarize *text* with the OpenAI completion API.

    The text is split on whitespace into batches of at most 4000 words. Each
    batch is sent as its own completion request; every returned summary is
    stripped of punctuation and lower-cased, and the per-batch summaries are
    joined with single spaces.

    Args:
        text: Text to summarize; must be a string.
        api_key: OpenAI API key, assigned to ``openai.api_key`` before the
            first request is made.

    Returns:
        The combined, normalized summary. An empty string is returned, and no
        request is made, when *text* contains no words.
    """
    max_words_per_batch = 4000  # Maximum number of words grouped into one request
    max_prompt_chars = 4000  # Maximum number of characters of a batch actually sent

    # Split once up front instead of re-splitting the whole text per batch.
    words = text.split()
    if not words:
        return ""

    openai.api_key = api_key
    strip_punctuation = str.maketrans("", "", string.punctuation)

    batch_summaries = []
    for start_idx in range(0, len(words), max_words_per_batch):
        text_batch = " ".join(words[start_idx:start_idx + max_words_per_batch])

        # NOTE(review): the slice below counts characters, not words or model
        # tokens, so only the first 4000 characters of each batch reach the
        # API and the rest of the batch is not summarized. Kept as-is because
        # it bounds the request size - confirm this truncation is intended.
        response = openai.Completion.create(
            engine="text-davinci-002",
            prompt=f"Please summarize the following text: {text_batch[:max_prompt_chars]}",
            max_tokens=100,
            n=1,
            stop=None,
            temperature=0.5,
        )

        batch_summary = response.choices[0].text.strip()
        batch_summary = batch_summary.translate(strip_punctuation).lower()
        if batch_summary:
            batch_summaries.append(batch_summary)

    # Punctuation has been removed, so a separator is required to keep the
    # last word of one batch summary from fusing with the first of the next.
    return " ".join(batch_summaries)

In [0]:
# Read the OpenAI key from the environment rather than pasting it into the
# notebook, so the secret is never saved with the code.
import os

api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# Summarize every sampled ticket; this makes at least one API request per
# non-empty row.
sampled_df['Text_Summarization'] = sampled_df['Text'].apply(lambda text: summarize_text(text, api_key))

In [0]:
#subset_df

In [0]:
# Display the sampled tickets together with their 'Text_Summarization' column.
sampled_df

In [0]:
# Recipe output: the sampled tickets, including the 'Text_Summarization'
# column computed above.

# Write recipe outputs
output_dataset = dataiku.Dataset("OpenAI_API_Text_Summarization")
output_dataset.write_with_schema(sampled_df)