In [None]:
from openai import OpenAI
import pandas as pd
import os
import json
from my_functions.functions_data_enrichment import NaicsProcessor, JobInformationProcessor, ToolConsumptionProcessor, BatchProcessor

In [None]:
# Take the API key from the OPENAI_API_KEY environment variable instead of a literal in the
# notebook, so the secret never has to be pasted into (and committed with) this file.
# A missing variable fails right here with a KeyError that names it.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Load data

In [None]:
# Load the pickled occupation data (df_cleaned.pickle) and display it.
occupation_data = pd.read_pickle(
    "data/processed_data/pkl/df_cleaned.pickle"
)
occupation_data

In [None]:
# Sum `emp_occupation` for every (occupation code/title, NAICS code/title) combination;
# reset_index turns the four grouping keys back into ordinary columns.
df_occupation_industry = (
    occupation_data
    .groupby(['OCC_CODE', 'OCC_TITLE', 'NAICS_CODE', 'NAICS_TITLE'])['emp_occupation']
    .sum()
    .reset_index()
)
df_occupation_industry

In [None]:
# Pass the aggregated frame and the path to occu.xlsx to
# NaicsProcessor.expand_naics_and_split_value. The returned `original_df` is the
# base frame that the later processing steps in this notebook work from.
original_df = NaicsProcessor.expand_naics_and_split_value(
    df_occupation_industry,
    'data/original_data/xlsx/occu.xlsx',
)
original_df

# Industry-specific job information

In [None]:
# Build the job-information tasks from `original_df`.
# The path given to generate_job_tasks is the JSONL file that the batch step
# later passes on as its task file.
job_processor = JobInformationProcessor(original_df)
job_tasks = job_processor.generate_job_tasks(
    "data/processed_data/json/job_tasks.jsonl"
)

In [None]:
# Number of generated job tasks.
len(job_tasks)

In [None]:
# Inspect the first generated job task as a quick sanity check.
job_tasks[0]

In [None]:
# Run the job-task batch. process_batch receives the client plus three paths:
# the task file, the file for the results, and the file that holds the batch ID.
batch_processor = BatchProcessor()
batch_id = batch_processor.process_batch(
    client,
    task_file_name="data/processed_data/json/job_tasks.jsonl",
    result_file_name="data/processed_data/json/job_results.jsonl",
    batch_id_file="data/processed_data/batch_ids/job_batch_id.txt",
)

In [None]:
# Read the job batch ID back from its file into `saved_batch_id`
# and display it for inspection.
saved_batch_id = batch_processor.load_batch_id(
    batch_id_file="data/processed_data/batch_ids/job_batch_id.txt"
)
saved_batch_id

In [None]:
# Resume from the saved batch ID: download the results of that batch into the job result
# file and keep whatever download_results returns in `result_data`.
# Without a saved ID there is nothing to download, and the following cell would otherwise
# fail with a NameError on `result_data` (or silently reuse a stale value from an earlier
# run), so stop here with an explicit message instead.
if not saved_batch_id:
    raise RuntimeError(
        "No saved batch ID found in data/processed_data/batch_ids/job_batch_id.txt; "
        "run the process_batch cell first."
    )
result_data = batch_processor.download_results(
    client,
    saved_batch_id,
    result_file_name="data/processed_data/json/job_results.jsonl",
)

In [None]:
# Preview the first 1000 elements of the downloaded results
# (characters or items, depending on what download_results returns — TODO confirm).
result_data[:1000]

In [None]:
# Hand the original frame and the two file paths to process_and_save_results_job.
# The pickle path given here is the file the tool-consumption section loads later on.
result_df = batch_processor.process_and_save_results_job(
    original_df,  # the original DataFrame
    result_file_name="data/processed_data/json/job_results.jsonl",  # JSONL results file
    pickle_file_name="data/processed_data/pkl/job_results.pkl",  # name of the pickle file
)

In [None]:
# Display the value returned by process_and_save_results_job.
result_df

# Tool consumption estimation

In [None]:
# Reload the job results from job_results.pkl, print how many entries
# they contain, and show the first five.
job_results = pd.read_pickle(
    'data/processed_data/pkl/job_results.pkl'
)
print(len(job_results))
job_results.head()

In [None]:
# Build the tool-consumption tasks from the full `job_results` data (no row subset is taken)
# and give generate_tool_consumption_tasks the JSONL path for the task file.
# The "consunption" spelling is kept because later cells refer to these names.
tool_consunption_processor = ToolConsumptionProcessor(job_results)
tool_consunption_tasks = tool_consunption_processor.generate_tool_consumption_tasks(
    task_file_path="data/processed_data/json/tool_consumption_tasks.jsonl"
)

In [None]:
# Number of generated tool-consumption tasks.
len(tool_consunption_tasks)

In [None]:
# Inspect the first generated tool-consumption task as a quick sanity check.
tool_consunption_tasks[0]

In [None]:
# Run the tool-consumption batch with the existing batch_processor. As for the job batch,
# process_batch gets the client, the task file, the result file and the batch-ID file.
batch_id = batch_processor.process_batch(
    client,
    task_file_name="data/processed_data/json/tool_consumption_tasks.jsonl",
    result_file_name="data/processed_data/json/tool_consumption_results.jsonl",
    batch_id_file="data/processed_data/batch_ids/tool_consumption_batch_id.txt",
)

In [None]:
# Read the tool-consumption batch ID back from its file into `saved_batch_id`
# (replacing the job batch ID held in that variable) and display it.
saved_batch_id = batch_processor.load_batch_id(
    batch_id_file="data/processed_data/batch_ids/tool_consumption_batch_id.txt"
)
saved_batch_id

In [None]:
# With a saved batch ID, the run can be resumed at any later point: fetch that batch's
# results into the tool-consumption result file. When no ID was loaded, this cell does nothing.
if saved_batch_id:
    result_data = batch_processor.download_results(
        client,
        saved_batch_id,
        result_file_name="data/processed_data/json/tool_consumption_results.jsonl",
    )

In [None]:
# Paths of the tool-consumption batch results (JSONL) and of the pickle file
# that are handed to the processing step.
jsonl_file = "data/processed_data/json/tool_consumption_results.jsonl"
pickle_file = "data/processed_data/pkl/tool_consumption_results.pkl"

# Call the method (on the class itself) with the results file, the job results
# and the pickle path, in that positional order.
df_tool_consumption = BatchProcessor.process_and_save_results_tool_consumption(
    jsonl_file,
    job_results,
    pickle_file,
)

In [None]:
# Display the value returned by process_and_save_results_tool_consumption.
df_tool_consumption