### Installations

In [None]:
!pip install openai

In [None]:
!pip install chromadb

In [None]:
!pip install llama_index

In [None]:
!pip install llama-index-core

In [None]:
!pip install sentence-transformers

In [None]:
!pip install python-dotenv

In [None]:
%pip install llama-index-vector-stores-chroma

In [None]:
%pip install llama-index-embeddings-langchain

In [None]:
!pip install llama-index-llms-openai

In [None]:
%pip install llama-index-llms-openai

In [None]:
!pip install langchain_community

In [None]:
!pip install langchain_core

### Imports

In [None]:
import argparse
import ast
import glob
import json
import os
import pickle
import sys
import time
from datetime import datetime

import openai
import pandas as pd

In [None]:
import chromadb
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, ServiceContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core import Prompt
from llama_index.llms.openai import OpenAI
from langchain_core.prompts import PromptTemplate

In [None]:
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

In [None]:
from google.colab import drive

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


### Functions

In [None]:
args = argparse.Namespace(config_path="/content/drive/My Drive/DS340/config.json")

In [None]:
with open(args.config_path) as json_file:
  config_dict = json.load(json_file)

#with open(args2.questions_path) as json_file:
  #questions_dict = json.load(json_file)

In [None]:
CHUNK_SIZE = 512
CHUNK_OVERLAP = 32

In [None]:
# Works
def initialize_and_return_models(config_dict):
    """Configure the OpenAI credentials and build the LLM and embedding model.

    Args:
        config_dict: Configuration mapping; must contain ``'openai_api_key'``.

    Returns:
        A ``(llm, embedding_model)`` tuple: an ``OpenAI`` LLM wrapper for
        ``gpt-3.5-turbo`` (temperature 0.5) and a ``LangchainEmbedding`` that
        wraps the ``sentence-transformers/all-mpnet-base-v2`` HuggingFace model.

    Raises:
        KeyError: If ``config_dict`` has no ``'openai_api_key'`` entry.
    """
    # Export the key from the config first, then read any local env file.
    # NOTE(review): load_dotenv is called without override, so presumably the
    # config value wins over anything in openai.env -- confirm this is intended.
    api_key = config_dict['openai_api_key']
    os.environ["OPENAI_API_KEY"] = api_key
    load_dotenv("openai.env")
    openai.api_key = os.getenv('OPENAI_API_KEY')

    llm = OpenAI(model='gpt-3.5-turbo', temperature=0.5)

    hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    embedding_model = LangchainEmbedding(hf_embeddings)
    return llm, embedding_model

In [None]:
# Works
def load_target_dfs(config_dict):
    with open(config_dict['targets_train_df_path'], 'rb') as handle:
        df_train = pickle.load(handle)
    with open(config_dict['targets_test_df_path'], 'rb') as handle:
        df_test = pickle.load(handle)
    #Convert report_date column to string representation
    df_train['report_date'] = df_train['report_date'].apply(lambda x: x.date().strftime('%Y-%m-%d'))
    df_train.reset_index(drop=True, inplace=True)
    df_test['report_date'] = df_test['report_date'].apply(lambda x: x.date().strftime('%Y-%m-%d'))
    df_test.reset_index(drop=True, inplace=True)
    return df_train, df_test

In [None]:
# Works
def get_systemprompt_template(config_dict):
    '''
    Build the question-answering prompt used by the query engine.

    The previous implementation wrapped ``str(ChatPromptTemplate)`` in a
    LlamaIndex ``Prompt``, so the text sent to the model was the Python repr
    of the LangChain object ("input_variables=[...] messages=[...]") with
    escaped newlines, and the system/user roles were lost. This version
    builds a native LlamaIndex chat template with a real system message and
    a real user message.

    Args:
        config_dict: Configuration mapping; ``'llm_system_prompt'`` supplies
            the system message text.

    Returns:
        A LlamaIndex ``ChatPromptTemplate`` with the template variables
        ``context_str`` and ``query_str``, suitable for
        ``index.as_query_engine(text_qa_template=...)``.

    Raises:
        KeyError: If ``config_dict`` has no ``'llm_system_prompt'`` entry.
    '''
    # Imported locally under an alias because this notebook already binds the
    # name ChatPromptTemplate to the LangChain class of the same name.
    from llama_index.core import ChatPromptTemplate as LlamaChatPromptTemplate
    from llama_index.core.llms import ChatMessage, MessageRole

    user_template = (
        "Context information is below.\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n"
        "Given the context information, "
        "answer the question: {query_str}\n"
    )
    chat_text_qa_msgs = [
        ChatMessage(role=MessageRole.SYSTEM,
                    content=config_dict['llm_system_prompt']),
        ChatMessage(role=MessageRole.USER, content=user_template),
    ]
    text_qa_template = LlamaChatPromptTemplate(message_templates=chat_text_qa_msgs)
    return text_qa_template

In [None]:
get_systemprompt_template(config_dict)

PromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template="input_variables=['context_str', 'query_str'] messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a highly experienced financial analyst. Respond to the question presented to you by assigning a confidence score ranging from 1 to 100 based on the available information. Provide your response in JSON format with only one key called score.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context_str', 'query_str'], template='Context information is below.\\n---------------------\\n{context_str}\\n---------------------\\nGiven the context information, answer the question: {query_str}\\n'))]")

In [None]:
# Requires Updated API
def _parse_score(raw_text):
    '''Extract the integer ``score`` from an LLM answer without executing it.'''
    if not isinstance(raw_text, str):
        raise TypeError("response text is not a string")
    # Keep only the outermost {...} so code fences or chatter around the
    # object do not break parsing.
    start, end = raw_text.find('{'), raw_text.rfind('}')
    if start == -1 or end <= start:
        raise ValueError("no JSON object found in response")
    payload = raw_text[start:end + 1]
    try:
        parsed = json.loads(payload)
    except ValueError:
        # Fall back to Python-literal answers such as {'score': 70};
        # literal_eval only accepts literals, so nothing is executed.
        parsed = ast.literal_eval(payload)
    return int(parsed['score'])


def get_gpt_generated_feature_dict(query_engine, questions_dict, sleep_seconds=0.2):
    '''
    Ask every question and collect the numeric scores the model returns.

    The model is expected to answer with an object that has a ``score`` key.
    Answers are parsed as data (JSON first, then Python literal) instead of
    being passed to ``eval``, which executed whatever text the model produced
    and rejected valid JSON containing ``true``/``false``/``null``.

    Args:
        query_engine: Object exposing ``query(question)`` whose result has a
            ``response`` attribute holding the answer text.
        questions_dict: Mapping of feature name -> question text.
        sleep_seconds: Pause before each query, so as not to exceed OpenAI
            rate limits. Defaults to the previous hard-coded 0.2 seconds.

    Returns:
        A dictionary with keys as question identifiers and value as GPT scores.
        Questions whose answer cannot be parsed are left out (a message is
        printed for each), so the result may have fewer keys than
        ``questions_dict``.
    '''
    response_dict = {}
    for feature_name, question in questions_dict.items():
        #Sleep for a short duration, not to exceed openai rate limits.
        time.sleep(sleep_seconds)
        response = query_engine.query(question)
        try:
            response_dict[feature_name] = _parse_score(response.response)
        except (ValueError, SyntaxError, TypeError, KeyError) as exc:
            # Best effort: an unparsable answer drops only this feature.
            print("Skipping feature {}: {}".format(feature_name, exc))
            continue
    return response_dict

In [None]:
def load_index(llm, embedding_model, base_embeddings_path, symbol, ar_date):
    '''
    Open the persisted Chroma store for one annual report and wrap it in a
    VectorStoreIndex (the embeddings were saved using embeddings_save.py).

    The store is read from ``<base_embeddings_path>/<symbol>/<ar_date>``; that
    path is printed before it is opened. Note that the collection looked up is
    the literal name "ar_date", not the value of the ``ar_date`` argument.

    Returns:
        The ``VectorStoreIndex`` built on top of the Chroma collection.
    '''
    persist_dir = os.path.join(base_embeddings_path, symbol, ar_date)
    print(persist_dir)

    client = chromadb.PersistentClient(path=persist_dir)
    vector_store = ChromaVectorStore(
        chroma_collection=client.get_collection("ar_date")
    )

    # NOTE(review): the notebook output shows a warning pointing at this
    # ServiceContext call -- presumably a deprecation notice; confirm.
    context_options = dict(
        embed_model=embedding_model,
        llm=llm,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    service_context = ServiceContext.from_defaults(**context_options)

    return VectorStoreIndex.from_vector_store(
        vector_store,
        service_context=service_context,
    )

In [None]:
# Works
def load_index2(llm, embedding_model, base_embeddings_path, symbol, ar_date):
    '''
    Function to load the embeddings that were saved using embeddings_save.py

    Opens the Chroma store at ``<base_embeddings_path>/<symbol>/<ar_date>``,
    fetches the collection literally named "ar_date" and wraps it in a
    VectorStoreIndex.

    If the store or the collection cannot be opened, the error is printed and
    then re-raised. Previously execution fell through after the print and
    failed one line later with an unrelated ``UnboundLocalError`` on
    ``chroma_collection``, hiding the real cause from callers.

    Returns:
        The ``VectorStoreIndex`` built on top of the Chroma collection.

    Raises:
        Exception: Whatever chromadb raised while opening the store or the
            collection (e.g. the collection does not exist).
    '''
    persist_dir = os.path.join(base_embeddings_path, symbol, ar_date)
    try:
        db = chromadb.PersistentClient(path=persist_dir)
        chroma_collection = db.get_collection("ar_date")
    except Exception as e:
        print("Error:", e)
        # Propagate the real failure so the caller can skip this report.
        raise

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    service_context = ServiceContext.from_defaults(embed_model=embedding_model,
                                                   llm=llm,
                                                   chunk_size = CHUNK_SIZE,
                                                   chunk_overlap=CHUNK_OVERLAP)
    index = VectorStoreIndex.from_vector_store(
                vector_store,
                service_context=service_context,
            )
    return index

In [None]:
# Works
def load_query_engine(index, text_qa_template):
    """Return a query engine for ``index`` that answers with ``text_qa_template``.

    Args:
        index: Object exposing ``as_query_engine``.
        text_qa_template: Prompt template forwarded as ``text_qa_template``.
    """
    engine_options = {'text_qa_template': text_qa_template}
    query_engine = index.as_query_engine(**engine_options)
    return query_engine

In [None]:
# Works
def are_features_generated(base_path, symbol, ar_date):
    '''
    Tell whether the single-row features file for this report already exists.

    The file checked is ``<base_path>/df_<symbol>_<ar_date>.pickle``.

    Returns:
        True if that path exists, False otherwise.
    '''
    candidate = os.path.join(base_path, 'df_{}_{}.pickle'.format(symbol, ar_date))
    return os.path.exists(candidate)

In [None]:
def save_features(df, llm, embedding_model, config_dict, questions_dict,
                  embeddings_directory, features_save_directory):
    '''
    Generate GPT feature scores row by row and pickle each row's result.

    For every row of ``df`` (which must have ``symbol`` and ``report_date``
    columns) the matching embeddings index is loaded, all questions are asked,
    and a one-row DataFrame is written to
    ``<features_save_directory>/df_<symbol>_<report_date>.pickle``.
    Rows whose file already exists are skipped, so an interrupted run can be
    resumed by calling the function again.
    '''
    for row_label in df.index:
        tic = time.time()
        row = df.loc[row_label]
        symbol, ar_date = row['symbol'], row['report_date']

        # Already produced by an earlier run: nothing to do for this row.
        if are_features_generated(features_save_directory, symbol, ar_date):
            continue

        index = load_index(llm, embedding_model, embeddings_directory, symbol, ar_date)
        text_qa_template = get_systemprompt_template(config_dict)
        query_engine = load_query_engine(index, text_qa_template)

        # Scores keyed by question identifier ...
        scores = get_gpt_generated_feature_dict(query_engine, questions_dict)
        # ... turned into a single row with one 'feature_<id>' column per score.
        gpt_feature_df = pd.DataFrame.from_dict(scores, orient='index').T
        gpt_feature_df.columns = list(map('feature_{}'.format, gpt_feature_df.columns))
        gpt_feature_df['meta_symbol'] = symbol
        gpt_feature_df['meta_report_date'] = ar_date

        out_name = 'df_{}_{}.pickle'.format(symbol, ar_date)
        out_path = os.path.join(features_save_directory, out_name)
        with open(out_path, 'wb') as handle:
            pickle.dump(gpt_feature_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

        elapsed = time.time() - tic
        print("Completed: {} in {:.2f}s".format(row_label, elapsed))

In [None]:
def save_consolidated_df(config_dict, questions_dict, targets_df,
                         features_save_directory, final_df_save_path):
    '''
    Combine all per-report feature pickles, join them with the targets and
    pickle the merged frame.

    Args:
        config_dict: Unused; kept for interface compatibility.
        questions_dict: Mapping whose keys name the features; the columns
            ``feature_<key>`` are selected from every feature file.
        targets_df: Frame with ``symbol`` and ``report_date`` columns.
            ``report_date`` may hold datetimes or date strings; it is
            converted on a copy, the caller's frame is not modified.
        features_save_directory: Directory holding the single-row feature
            pickles; every regular file in it is read.
        final_df_save_path: Where the merged frame is pickled.

    Behaviour notes:
        * A feature missing from a file (its answer could not be parsed) is
          filled with NaN instead of raising ``KeyError``.
        * Feature files without any row are ignored.
        * Feature columns are divided by 100 to land in the range [0, 1].

    Raises:
        ValueError: If the directory contains no usable feature file.
    '''
    # Sorted so the row order of the result does not depend on the filesystem.
    df_paths_list = sorted(
        path for path in glob.glob(os.path.join(features_save_directory, '*'))
        if os.path.isfile(path)
    )
    feature_cols = ['feature_{}'.format(f) for f in questions_dict]
    meta_cols = ['meta_symbol', 'meta_report_date']

    # Collect the rows first and concatenate once (concat inside the loop
    # re-copies the accumulated frame on every iteration).
    frames = []
    for df_path in df_paths_list:
        # NOTE: pickle.load can execute arbitrary code; only use a directory
        # whose files were produced by this pipeline.
        with open(df_path, 'rb') as handle:
            gpt_feature_df = pickle.load(handle)
        if gpt_feature_df.empty:
            continue
        # reindex (rather than .loc) tolerates features that were skipped.
        frames.append(gpt_feature_df.reindex(columns=feature_cols + meta_cols))
    if not frames:
        raise ValueError(
            "No feature rows found in {}".format(features_save_directory))
    feature_df_full = pd.concat(frames, ignore_index=True)

    #Convert meta_report_date column to datetime format
    feature_df_full['meta_report_date'] = pd.to_datetime(
        feature_df_full['meta_report_date'], format='%Y-%m-%d')
    # Bring the target key to the same dtype; merging a datetime column
    # against a string column is rejected by pandas.
    targets_for_merge = targets_df.assign(
        report_date=pd.to_datetime(targets_df['report_date']))
    merged_df = pd.merge(feature_df_full, targets_for_merge,
                         left_on=['meta_symbol', 'meta_report_date'],
                         right_on=['symbol', 'report_date'], how='inner')
    #Transform features in range [0,1]
    merged_df[feature_cols] = merged_df[feature_cols]/100.0
    with open(final_df_save_path, 'wb') as handle:
        pickle.dump(merged_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
'''
def main(args):
    with open(args.config_path) as json_file:
        config_dict = json.load(json_file)
    with open(args.questions_path) as json_file:
        questions_dict = json.load(json_file)

    df_train, df_test = load_target_dfs(config_dict)
    llm, embedding_model = initialize_and_return_models(config_dict)

    save_features(df_train, llm, embedding_model, config_dict, questions_dict,
                  embeddings_directory=config_dict['embeddings_for_training_directory'],
                  features_save_directory=config_dict['feature_train_df_save_directory'])
    save_features(df_test, llm, embedding_model, config_dict, questions_dict,
                  embeddings_directory=config_dict['embeddings_for_testing_directory'],
                  features_save_directory=config_dict['feature_test_df_save_directory'])

    save_consolidated_df(config_dict, questions_dict, df_train,
                         features_save_directory=config_dict['feature_train_df_save_directory'],
                         final_df_save_path=config_dict['final_train_df_save_path'])
    save_consolidated_df(config_dict, questions_dict, df_test,
                         features_save_directory=config_dict['feature_test_df_save_directory'],
                         final_df_save_path=config_dict['final_test_df_save_path'])
'''

In [None]:
args = argparse.Namespace(config_path="/content/drive/My Drive/DS340/config.json")

In [None]:
args2 = argparse.Namespace(questions_path="/content/drive/My Drive/DS340/questions_dict.json")

### Modified Implementation

In [None]:
with open(args.config_path) as json_file:
  config_dict = json.load(json_file)

with open(args2.questions_path) as json_file:
  questions_dict = json.load(json_file)

In [None]:
train_path = "/content/drive/My Drive/DS340/df_train.csv"
test_path = "/content/drive/My Drive/DS340/df_test.csv"

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
df_train.shape

(392, 24)

In [None]:
df_train.columns

Index(['target_max', 'target_min', 'target_3m', 'target_6m', 'target_9m',
       'target_12m', 'sp500_max', 'sp500_min', 'sp500_3m', 'sp500_6m',
       'sp500_9m', 'sp500_12m', 'report_date', 'start_date', 'end_date',
       'symbol', 'era', 'target_max_normalised', 'target_min_normalised',
       'target_3m_normalised', 'target_6m_normalised', 'target_9m_normalised',
       'target_12m_normalised', 'target_ml'],
      dtype='object')

In [None]:
llm, embedding_model = initialize_and_return_models(config_dict)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
symbol = "MAT"
ar_date = "2012-02-23"

In [None]:
index= load_index(llm, embedding_model, config_dict['embeddings_for_training_directory'], symbol, ar_date)

In [None]:
# ACIW was moved over from the previous embeddings; this shows that the previous embeddings are not corrupted. The Chroma SQLite file appears after running this cell.
index= load_index2(llm, embedding_model, config_dict['embeddings_for_training_directory'], symbol, ar_date)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


In [None]:
text_qa_template = get_systemprompt_template(config_dict)
query_engine = load_query_engine(index, text_qa_template)

In [None]:
gpt_feature_dict = get_gpt_generated_feature_dict(query_engine, questions_dict)

In [None]:
def save_features2(df, llm, embedding_model, config_dict, questions_dict,
                               embeddings_directory, features_save_directory,
                               start_index=0, end_index=None):
    '''
    Function to iteratively save features as a df with single row.

    Processes the rows of ``df`` at positions ``[start_index, end_index)``
    (``end_index=None`` means "to the end") and writes one pickle per row to
    ``<features_save_directory>/df_<symbol>_<report_date>.pickle``.

    Rows are skipped, with a message, when
      * their features file already exists (no message),
      * the embeddings index or the query engine cannot be created, or
      * no score could be parsed for any question. In that case nothing is
        written, so the row is retried on the next run instead of being
        cached as an empty result.

    Args:
        df: Frame with ``symbol`` and ``report_date`` columns.
        llm, embedding_model: Models forwarded to ``load_index2``.
        config_dict: Configuration forwarded to ``get_systemprompt_template``.
        questions_dict: Mapping of feature name -> question text.
        embeddings_directory: Root directory of the persisted embeddings.
        features_save_directory: Output directory for the pickles.
        start_index: First row position to process (negative values are
            treated as 0).
        end_index: Row position to stop before; clipped to ``len(df)``.
    '''
    num_rows = len(df)

    if end_index is None:
        end_index = num_rows

    for i in range(max(start_index, 0), min(end_index, num_rows)):
        start_time = time.time()
        # Positional access: the loop counts rows, so this also works when
        # the frame's index is not the default 0..n-1.
        curr_series = df.iloc[i]
        symbol = curr_series['symbol']
        ar_date = curr_series['report_date']
        if are_features_generated(features_save_directory, symbol, ar_date):
            continue
        # Catch Exception (not a bare except) so Ctrl-C still stops the run.
        try:
            index = load_index2(llm, embedding_model, embeddings_directory, symbol, ar_date)
        except Exception as exc:
            print("Skipped: {} ({} {}), index not loaded: {}".format(i, symbol, ar_date, exc))
            continue
        text_qa_template = get_systemprompt_template(config_dict)
        try:
            query_engine = load_query_engine(index, text_qa_template)
        except Exception as exc:
            print("Skipped: {} ({} {}), no query engine: {}".format(i, symbol, ar_date, exc))
            continue
        # Get feature scores as dictionary
        gpt_feature_dict = get_gpt_generated_feature_dict(query_engine, questions_dict)
        if not gpt_feature_dict:
            # Saving an empty frame would mark this row as done forever.
            print("Skipped: {} ({} {}), no scores returned".format(i, symbol, ar_date))
            continue
        # Convert dictionary to dataframe
        gpt_feature_df = pd.DataFrame.from_dict(gpt_feature_dict, orient='index').T
        gpt_feature_df.columns = ['feature_{}'.format(c) for c in gpt_feature_df.columns]
        gpt_feature_df['meta_symbol'] = symbol
        gpt_feature_df['meta_report_date'] = ar_date
        with open(os.path.join(features_save_directory, 'df_{}_{}.pickle'.format(symbol, ar_date)), 'wb') as handle:
            pickle.dump(gpt_feature_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("Completed: {} in {:.2f}s".format(i, time.time()-start_time))


In [None]:
# 392
def save_features_train(start_index, end_index):
  """Generate features for ``df_train`` rows ``start_index`` up to (not including) ``end_index``.

  Thin wrapper around ``save_features2`` that reads the notebook globals
  ``df_train``, ``llm``, ``embedding_model``, ``config_dict`` and
  ``questions_dict``, and takes both directories from ``config_dict``.
  """
  train_options = dict(
      embeddings_directory=config_dict['embeddings_for_training_directory'],
      features_save_directory=config_dict['feature_train_df_save_directory'],
      start_index=start_index,
      end_index=end_index,
  )
  save_features2(df_train, llm, embedding_model, config_dict, questions_dict,
                 **train_options)

In [None]:
# 201
def save_features_test(start_index, end_index):
  """Generate features for ``df_test`` rows ``start_index`` up to (not including) ``end_index``.

  Thin wrapper around ``save_features2`` that reads the notebook globals
  ``df_test``, ``llm``, ``embedding_model``, ``config_dict`` and
  ``questions_dict``, and takes both directories from ``config_dict``.
  """
  test_options = dict(
      embeddings_directory=config_dict['embeddings_for_testing_directory'],
      features_save_directory=config_dict['feature_test_df_save_directory'],
      start_index=start_index,
      end_index=end_index,
  )
  save_features2(df_test, llm, embedding_model, config_dict, questions_dict,
                 **test_options)

### Embedding

In [None]:
save_features_train(1,10)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 1 in 50.13s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 2 in 10.29s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 3 in 15.40s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 4 in 11.89s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 5 in 8.04s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 6 in 12.29s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 7 in 8.73s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 8 in 12.13s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


In [None]:
save_features_train(11,21)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 11 in 13.24s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 12 in 19.51s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 13 in 10.52s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 14 in 10.58s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 15 in 15.12s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 16 in 11.63s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 17 in 13.22s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 18 in 12.75s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 19 in 13.43s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 20 in 13.40s


In [None]:
save_features_train(21,41)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 21 in 9.77s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 22 in 10.94s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 23 in 15.67s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 24 in 10.34s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 25 in 12.27s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 26 in 13.83s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 27 in 10.75s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 28 in 10.43s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 29 in 12.87s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 30 in 10.38s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 31 in 12.34s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 32 in 9.87s
Error: Collection ar_date does not exist.


AttributeError: 'str' object has no attribute 'as_query_engine'

In [None]:
save_features_train(33,41)

Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


In [None]:
save_features_train(41,51)

Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 47 in 10.66s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 48 in 18.55s
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


In [None]:
save_features_train(71,91)

Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 86 in 11.40s
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


In [None]:
save_features_train(102,200)

### Chroma availability testing

In [None]:
with open(args.config_path) as json_file:
  config_dict = json.load(json_file)

In [None]:
config_dict['feature_test_df_save_directory']

'/content/drive/My Drive/DS340/test_features2'

In [None]:
train_path = "/content/drive/My Drive/DS340/new_df_train.csv"
test_path = "/content/drive/My Drive/DS340/new_df_test.csv"

In [None]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
df_test.shape

(201, 24)

In [None]:
def save_features_train(start_index, end_index):
  """Run ``save_features2`` over the training frame for one slice of rows.

  Uses whatever ``df_train``, ``llm``, ``embedding_model``, ``config_dict``
  and ``questions_dict`` are bound to in the notebook at call time.
  NOTE(review): this repeats an identical definition made earlier in the
  notebook; the later one wins -- consider keeping a single copy.
  """
  embeddings_dir = config_dict['embeddings_for_training_directory']
  output_dir = config_dict['feature_train_df_save_directory']
  save_features2(
      df_train,
      llm,
      embedding_model,
      config_dict,
      questions_dict,
      embeddings_directory=embeddings_dir,
      features_save_directory=output_dir,
      start_index=start_index,
      end_index=end_index,
  )

In [None]:
def save_features_test(start_index, end_index):
  """Run ``save_features2`` over the test frame for one slice of rows.

  Uses whatever ``df_test``, ``llm``, ``embedding_model``, ``config_dict``
  and ``questions_dict`` are bound to in the notebook at call time.
  NOTE(review): this repeats an identical definition made earlier in the
  notebook; the later one wins -- consider keeping a single copy.
  """
  embeddings_dir = config_dict['embeddings_for_testing_directory']
  output_dir = config_dict['feature_test_df_save_directory']
  save_features2(
      df_test,
      llm,
      embedding_model,
      config_dict,
      questions_dict,
      embeddings_directory=embeddings_dir,
      features_save_directory=output_dir,
      start_index=start_index,
      end_index=end_index,
  )

In [None]:
save_features_train(0,10)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 0 in 137.96s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 1 in 132.74s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 2 in 111.68s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 3 in 126.50s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 4 in 139.41s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 5 in 103.80s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 6 in 124.90s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 7 in 74.21s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 8 in 112.11s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 9 in 131.96s


In [None]:
save_features_train(10,20)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 10 in 101.04s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 11 in 130.73s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 12 in 90.97s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 13 in 131.59s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 14 in 98.20s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 15 in 110.61s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 16 in 226.95s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 17 in 118.36s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 18 in 131.54s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 19 in 112.00s


In [None]:
save_features_train(20,40)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 20 in 110.56s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 21 in 128.12s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 22 in 116.36s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 23 in 123.70s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 24 in 119.74s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 25 in 178.51s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 26 in 96.10s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 27 in 98.27s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 28 in 83.45s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 29 in 92.86s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 30 in 115.93s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 31 in 101.31s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 32 in 122.46s
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


In [None]:
save_features_train(35,45)

Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


In [None]:
save_features_train(45,66)

Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 47 in 75.98s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 48 in 150.10s
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


In [None]:
save_features_train(66,86)

Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


In [None]:
save_features_train(86,114)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 86 in 117.64s
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Co

In [None]:
save_features_train(114,134)

Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


In [None]:
# Training feature extraction for the last batch, bounded by 134 and 164.
# NOTE(review): df_train.shape reports 163 rows, so 164 is past the end of the
# frame; presumably the helper tolerates/clips the bound — confirm.
save_features_train(134, 164)

Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.
Error: Collection ar_date does not exist.


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 139 in 117.50s
Error: Collection ar_date does not exist.


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 141 in 220.66s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 142 in 107.46s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 143 in 153.13s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 144 in 82.23s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 145 in 143.16s
Error: Collection ar_date does not exist.


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 147 in 91.43s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 148 in 137.96s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 149 in 151.76s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 150 in 91.27s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 151 in 95.65s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 152 in 79.87s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 153 in 113.14s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 154 in 101.35s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 155 in 110.68s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 156 in 84.44s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 157 in 111.49s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 158 in 109.61s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 159 in 212.79s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 160 in 114.03s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 161 in 93.81s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 162 in 185.67s


In [None]:
# Sanity check: display the shape of df_train after the training batches have run
# (df_train is presumably a pandas DataFrame, so this is (rows, columns) — confirm).
df_train.shape

(163, 24)

In [None]:
# Test-set feature extraction for the batch bounded by 0 and 10
# (the recorded run reports indices 0 through 9 as completed).
save_features_test(0, 10)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 0 in 99.28s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 1 in 151.43s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 2 in 102.66s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 3 in 105.49s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 4 in 158.53s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 5 in 174.90s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 6 in 106.38s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 7 in 101.67s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 8 in 162.66s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 9 in 109.04s


In [None]:
# Test-set feature extraction for the batch bounded by 10 and 20
# (the recorded run reports indices 10 through 19 as completed).
save_features_test(10, 20)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 10 in 100.91s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 11 in 97.41s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 12 in 149.21s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 13 in 113.06s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 14 in 92.41s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 15 in 102.67s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 16 in 198.26s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 17 in 97.06s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 18 in 150.08s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 19 in 144.73s


In [None]:
# Test-set feature extraction for the batch bounded by 20 and 30
# (the recorded run reports indices 20 through 29 as completed).
save_features_test(20, 30)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 20 in 109.05s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 21 in 105.40s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 22 in 108.15s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 23 in 106.57s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 24 in 90.70s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 25 in 161.88s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 26 in 137.07s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 27 in 114.29s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 28 in 123.04s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 29 in 259.31s


In [None]:
# Test-set feature extraction for the batch bounded by 30 and 40
# (the recorded run reports indices 30 through 39 as completed).
save_features_test(30, 40)

  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 30 in 113.61s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 31 in 119.45s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 32 in 121.51s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 33 in 166.08s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 34 in 121.48s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 35 in 96.35s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 36 in 239.82s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 37 in 109.56s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 38 in 123.36s


  service_context = ServiceContext.from_defaults(embed_model=embedding_model,


Completed: 39 in 151.76s


### Save Final DataFrames

In [None]:
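# NOTE(review): this call is disabled. The output recorded for this cell is
# KeyError: "['feature_management'] not in index", i.e. a 'feature_management'
# column was missing when the call last ran — presumably because feature
# extraction failed for some rows. Confirm and fix before re-enabling.
#save_consolidated_df(config_dict, questions_dict, df_train, features_save_directory=config_dict['feature_train_df_save_directory'],final_df_save_path=config_dict['final_train_df_save_path'])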

KeyError: "['feature_management'] not in index"

In [None]:
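# NOTE(review): this call is disabled. It references `questions_dict`, whose
# loading code is commented out near the top of the notebook — confirm that
# `questions_dict` is defined before re-enabling.
#save_consolidated_df(config_dict, questions_dict, df_test, features_save_directory=config_dict['feature_test_df_save_directory'], final_df_save_path=config_dict['final_test_df_save_path'])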