In [1]:
import pandas as pd
import asyncio
from langchain_core.documents import Document
import openai
import nest_asyncio
from langchain.text_splitter import CharacterTextSplitter

In [2]:
from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings
import elasticsearch
import getpass
from functools import partial
from datasets import load_dataset

In [3]:
from langchain_elasticsearch import ElasticsearchRetriever
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import format_document
from langchain.prompts.prompt import PromptTemplate
import os

In [4]:
nest_asyncio.apply()

In [5]:
ds = pd.read_csv('/Users/jessgarson/Downloads/tour_data.csv')

In [6]:
ds

Unnamed: 0,title,date,link,text
0,算法锐舞,5/20,https://mp.weixin.qq.com/s/3kbLFsL3-bEEc6XCd_fSsw,时间：6月1日周六，19:00-22:30\n地点：SYSTEM系统（淮海中路1327号云海...
1,2024国际即兴编程大会,5/13,https://mp.weixin.qq.com/s/lFGIMV-BEyND_9zb-z3buA,2024国际即兴编程大会\n2024年5月30日 - 2024年6月1日\n中国上海·上海纽...
2,关于本周五晚上活动的Visceral Realists,5/26,https://mp.weixin.qq.com/s/mbwzNNSYbzsu-Pj8fsSFIQ,\n佐那·赞合罗斯，本能现实主义的生命力来源之一，一名多媒体艺术家，常驻纽约的声音制造组合“...
3,这周末 (5.31-6.02),5.31,https://mp.weixin.qq.com/s/6vXpcunONsfrV6vxvwuBAQ,Date 时间: 5.31 8:30PM\nVenue 地点: Yuyintang 育音堂\...


In [7]:
async def translate_text(text):
    document = Document(page_content=text)
    loop = asyncio.get_event_loop()
    response = await loop.run_in_executor(
        None,
        partial(
            openai.chat.completions.create,
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a translator."},
                {
                    "role": "user",
                    "content": f"Translate the following text to English: {document.page_content}",
                },
            ],
        ),
    )
    translated_text = response.choices[0].message.content.strip()
    return translated_text

In [8]:
async def translate_dataframe(ds, column_name):
    tasks_column = [translate_text(text) for text in ds[column_name]]
    translated_texts = await asyncio.gather(*tasks_column)
    ds["translated_" + column_name] = translated_texts
    return ds

In [9]:
loop = asyncio.get_event_loop()
loop.run_until_complete(translate_dataframe(ds, "text"))

Unnamed: 0,title,date,link,text,translated_text
0,算法锐舞,5/20,https://mp.weixin.qq.com/s/3kbLFsL3-bEEc6XCd_fSsw,时间：6月1日周六，19:00-22:30\n地点：SYSTEM系统（淮海中路1327号云海...,"Time: Saturday, June 1st, 19:00-22:30 \nLocat..."
1,2024国际即兴编程大会,5/13,https://mp.weixin.qq.com/s/lFGIMV-BEyND_9zb-z3buA,2024国际即兴编程大会\n2024年5月30日 - 2024年6月1日\n中国上海·上海纽...,2024 International Conference on Live Coding ...
2,关于本周五晚上活动的Visceral Realists,5/26,https://mp.weixin.qq.com/s/mbwzNNSYbzsu-Pj8fsSFIQ,\n佐那·赞合罗斯，本能现实主义的生命力来源之一，一名多媒体艺术家，常驻纽约的声音制造组合“...,"Sana Zanhuerros, a source of vitality for inst..."
3,这周末 (5.31-6.02),5.31,https://mp.weixin.qq.com/s/6vXpcunONsfrV6vxvwuBAQ,Date 时间: 5.31 8:30PM\nVenue 地点: Yuyintang 育音堂\...,Date: 5.31 8:30 PM \nVenue: Yuyintang \nLine...


In [16]:
es_client = elasticsearch.Elasticsearch(
    getpass.getpass("Host: "),
    api_key=getpass.getpass("API Key: "),
)

Host:  ········
API Key:  ········


In [17]:
index_name = "raid"

if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)
    print(f"Deleted existing index: {index_name}")

es_client.indices.create(index=index_name)
print(f"Created new index: {index_name}")

Created new index: raid


In [18]:
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

In [22]:
elastic_vector_search = ElasticsearchStore(
    index_name=index_name,
    es_connection=es_client,
    embedding=embedding,
)

In [25]:
translated_texts = ds["translated_text"].tolist()
combined_text = "\n".join(translated_texts)
text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=50)
docs = text_splitter.split_documents([Document(page_content=combined_text)])

In [26]:
elastic_vector_search.add_documents(docs)

['0ec97922-f1e1-41f6-9264-c8e20cb87b28',
 'da5a451d-0567-429c-b534-7f2e47fe3c61',
 '8b35571e-80dd-44fa-80e4-79b331c7bcf8',
 '9088eb26-74a0-43e0-8d4a-ab0f6bb0d0bb',
 'b2a9d567-040f-4be4-98a4-1472ecec509e',
 'f191bd44-10b3-427c-8501-f507f16780b0',
 'b4124365-de2e-42cc-9cc3-d75c1bacffb1',
 '9e92955a-1ede-4e34-a21d-0ac82e411179']

In [27]:
def build_query(query):
    return {
        "retriever": {
            "standard": {
                "query": {
                    "multi_match": {
                        "query": query,
                        "fields": [
                            "text"
                        ]
                    }
                }
            }
        }
    }

In [28]:
index_source_fields = {
    "raid": "text"
}

retriever = ElasticsearchRetriever(
    index_name="raid",
    body_func=build_query,
    content_field=index_source_fields,
    es_client=es_client
)

In [33]:
model = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model_name="gpt-4o")

In [34]:
ANSWER_PROMPT = ChatPromptTemplate.from_template(
    """
  Instructions:
  
  - You are an assistant for question-answering tasks.
  - Answer questions truthfully and factually using only the context presented.
  - If you don't know the answer, just say that you don't know, don't make up an answer.
  - You must always cite the document where the answer was extracted using inline academic citation style [], using the position.
  - Use markdown format for code examples.
  - You are correct, factual, precise, and reliable.
  

  Context:
  {context}

  
  """
)

In [35]:
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

_context = {
    "context": retriever | _combine_documents,
    "question": RunnablePassthrough(),
}

chain = _context | ANSWER_PROMPT | model | StrOutputParser()

In [36]:
ans = chain.invoke("What is ICLC?")
print("---- Answer ----")
print(ans)

---- Answer ----
The International Conference on Live Coding (ICLC) is an event dedicated to the study and practice of using computer code as an element of live performance. It has been held since 2015 and brings together an international community to explore various perspectives on live coding, including technology, philosophy, education, and governance [1]. The 2024 edition of the conference will be held in Shanghai, China, at NYU Shanghai, the Cadillac Shanghai Concert Hall, and System from May 30 to June 1 [1].

One of the highlights of the event is the first-ever international conference presentation in Asia, featuring a variety of activities such as lectures, workshops, concerts, and algorithmic dance parties. More than 27 international artists and groups are expected to participate, with over 30 activities planned throughout the event [2].

Sana Zanhuerros, a multimedia artist and core member of the New York-based sound creation ensemble "Computer," will be one of the featured a

In [37]:
system_answer = chain.invoke("What happened at SYSTEM?")
print("---- Answer ----")
print(ans)

---- Answer ----
The International Conference on Live Coding (ICLC) is an event dedicated to the study and practice of using computer code as an element of live performance. It has been held since 2015 and brings together an international community to explore various perspectives on live coding, including technology, philosophy, education, and governance [1]. The 2024 edition of the conference will be held in Shanghai, China, at NYU Shanghai, the Cadillac Shanghai Concert Hall, and System from May 30 to June 1 [1].

One of the highlights of the event is the first-ever international conference presentation in Asia, featuring a variety of activities such as lectures, workshops, concerts, and algorithmic dance parties. More than 27 international artists and groups are expected to participate, with over 30 activities planned throughout the event [2].

Sana Zanhuerros, a multimedia artist and core member of the New York-based sound creation ensemble "Computer," will be one of the featured a