In [9]:
import numpy as np

import chromadb
from chromadb.utils import embedding_functions

from chromadb.config import Settings

chroma_client = chromadb.HttpClient(host='localhost', port = 8083, settings=Settings(allow_reset=True, anonymized_telemetry=False))

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

collection = chroma_client.get_or_create_collection(name="transcripts_mililm_l6_v2", embedding_function=sentence_transformer_ef)


In [25]:
results = collection.query(
    query_texts=["transcript"],
    n_results=10,
    where={"year": 2022}
)

In [26]:
def split(data):
    documents = [i['content'] for i in data]
    ids = [str(i['year'])+'Q'+str(i['quarter'])+i['symbol'] for i in data]
    metadatas = [{'symbol':i['symbol'], 'quarter':i['quarter'], \
        'year':i['year']} for i in data]

    collection.add(
                    documents=documents,
                    metadatas=metadatas,
                    ids=ids)

In [87]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(transcripts, metadatas):

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)

    texts = text_splitter.create_documents(transcripts, metadatas)
    texts = text_splitter.split_documents(texts)
    
    return texts

In [88]:
meta_datas = [i for i in results['metadatas'][0]]
df = pd.DataFrame(meta_datas)
df['key'] = results['ids'][0]
df

Unnamed: 0,quarter,symbol,year,key
0,2,GCO,2022,2022Q2GCO
1,1,ETR,2022,2022Q1ETR
2,2,SAPLF,2022,2022Q2SAPLF
3,1,SMBC,2022,2022Q1SMBC
4,1,ADPT,2022,2022Q1ADPT
5,3,LZB,2022,2022Q3LZB
6,1,CADE,2022,2022Q1CADE
7,1,AXTA,2022,2022Q1AXTA
8,1,NWN,2022,2022Q1NWN
9,1,BRC,2022,2022Q1BRC


In [89]:
meta_datas = list(df.T.to_dict().values())

In [90]:
splits = split_docs(results['documents'][0], meta_datas)

In [92]:
collection = chroma_client.get_or_create_collection(name="transcripts_mililm_l6_test_split", embedding_function=sentence_transformer_ef)

In [116]:
ids = [f"{x.metadata['key']}" for x in splits]

In [117]:
ids_df = pd.DataFrame(ids)

In [118]:
ids_df = ids_df.reset_index()

In [131]:
ids_df['key'] = ids_df[0] + '_' + ids_df['index'].astype('str')

In [133]:
ids = ids_df['key'].to_list()

In [144]:
[x.page_conten splits]

[Document(page_content="Disclaimer*: This transcript is designed to be used alongside the freely available audio recording on this page. Timestamps within the transcript are designed to help you navigate the audio should the corresponding text be unclear. The machine-assisted output provided is unaudited and is designed as a guide.:\nOperator: 00:05 Good day everyone and welcome to the Genesco’s Second quarter Fiscal twenty twenty two conference call. Just a reminder today's call is being recorded. 00:14 I will now turn the call over to Dave Slater, Vice President of FP&A and Investor Relations. Please go ahead, sir.", metadata={'quarter': 2, 'symbol': 'GCO', 'year': 2022, 'key': '2022Q2GCO'}),
 Document(page_content="Dave Slater: 00:21 Good morning, everyone, and thank you for joining us to discuss our second quarter fiscal twenty twenty two results. Participants on the call expect to make forward looking statements. These statements reflect the participant’s expectations as of today,

In [141]:
collection.add(documents=splits, ids=ids)

TypeError: 'Document' object is not subscriptable

In [35]:
[i for i in results['ids'][0]]

['2022Q2GCO',
 '2022Q1ETR',
 '2022Q2SAPLF',
 '2022Q1SMBC',
 '2022Q1ADPT',
 '2022Q3LZB',
 '2022Q1CADE',
 '2022Q1AXTA',
 '2022Q1NWN',
 '2022Q1BRC']

In [23]:
results

{'ids': [['2022Q2GCO',
   '2022Q1ETR',
   '2022Q2SAPLF',
   '2022Q1SMBC',
   '2022Q1ADPT',
   '2022Q3LZB',
   '2022Q1CADE',
   '2022Q1AXTA',
   '2022Q1NWN',
   '2022Q1BRC']],
 'distances': [[1.0787485837936401,
   1.0878193378448486,
   1.094926118850708,
   1.1140602827072144,
   1.1363320350646973,
   1.1397255659103394,
   1.1410999298095703,
   1.1438720226287842,
   1.1509253978729248,
   1.1560888290405273]],
 'embeddings': None,
 'metadatas': [[{'quarter': 2, 'symbol': 'GCO', 'year': 2022},
   {'quarter': 1, 'symbol': 'ETR', 'year': 2022},
   {'quarter': 2, 'symbol': 'SAPLF', 'year': 2022},
   {'quarter': 1, 'symbol': 'SMBC', 'year': 2022},
   {'quarter': 1, 'symbol': 'ADPT', 'year': 2022},
   {'quarter': 3, 'symbol': 'LZB', 'year': 2022},
   {'quarter': 1, 'symbol': 'CADE', 'year': 2022},
   {'quarter': 1, 'symbol': 'AXTA', 'year': 2022},
   {'quarter': 1, 'symbol': 'NWN', 'year': 2022},
   {'quarter': 1, 'symbol': 'BRC', 'year': 2022}]],
 'documents': [["Disclaimer*: This tran

In [16]:
results.keys()

dict_keys(['ids', 'distances', 'embeddings', 'metadatas', 'documents', 'uris', 'data'])

In [22]:
results['documents']

[["Disclaimer*: This transcript is designed to be used alongside the freely available audio recording on this page. Timestamps within the transcript are designed to help you navigate the audio should the corresponding text be unclear. The machine-assisted output provided is unaudited and is designed as a guide.:\nOperator: 00:05 Good day everyone and welcome to the Genesco’s Second quarter Fiscal twenty twenty two conference call. Just a reminder today's call is being recorded. 00:14 I will now turn the call over to Dave Slater, Vice President of FP&A and Investor Relations. Please go ahead, sir.\nDave Slater: 00:21 Good morning, everyone, and thank you for joining us to discuss our second quarter fiscal twenty twenty two results. Participants on the call expect to make forward looking statements. These statements reflect the participant’s expectations as of today, but actual results could be different. 00:38 Genesco refers you to this morning's earnings release and the company's Sec f