In [1]:
import asyncio
import os

import nest_asyncio
import openai
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_transformers import DoctranTextTranslator
from langchain_core.documents import Document

In [2]:
from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings
import elasticsearch
import getpass
from functools import partial

In [3]:
# Apply the nest_asyncio patch before any event-loop work; a later cell
# drives a coroutine with loop.run_until_complete(...).
# NOTE(review): presumably needed because the notebook kernel already has a
# running event loop — confirm.
nest_asyncio.apply()

In [4]:
# Location of the tour-data CSV. Set the TOUR_DATA_CSV environment variable to
# run the notebook on another machine; the fallback is the path the notebook
# was originally written against.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which looks
# like a saved index — consider index_col=0 if that column is not wanted.
TOUR_DATA_CSV = os.environ.get(
    'TOUR_DATA_CSV', '/Users/jessgarson/Downloads/tour_data.csv'
)
df = pd.read_csv(TOUR_DATA_CSV)

In [5]:
# Preview the first rows only, so the cell output stays bounded if the CSV grows.
df.head()

Unnamed: 0,title,date,link,text
0,算法锐舞,5/20,https://mp.weixin.qq.com/s/3kbLFsL3-bEEc6XCd_fSsw,时间：6月1日周六，19:00-22:30\n地点：SYSTEM系统（淮海中路1327号云海...
1,2024国际即兴编程大会,5/13,https://mp.weixin.qq.com/s/lFGIMV-BEyND_9zb-z3buA,2024国际即兴编程大会\n2024年5月30日 - 2024年6月1日\n中国上海·上海纽...
2,关于本周五晚上活动的Visceral Realists,5/26,https://mp.weixin.qq.com/s/mbwzNNSYbzsu-Pj8fsSFIQ,\n佐那·赞合罗斯，本能现实主义的生命力来源之一，一名多媒体艺术家，常驻纽约的声音制造组合“...
3,这周末 (5.31-6.02),5.31,https://mp.weixin.qq.com/s/6vXpcunONsfrV6vxvwuBAQ,Date 时间: 5.31 8:30PM\nVenue 地点: Yuyintang 育音堂\...


In [6]:
# Doctran-based translator configured for English output with the gpt-4o model.
# NOTE(review): `qa_translator` is not referenced by any later cell shown in
# this notebook — translate_text() calls the OpenAI client directly. Confirm
# whether this cell is still needed.
qa_translator = DoctranTextTranslator(language="english", openai_api_model="gpt-4o")

In [7]:
async def translate_text(text):
    """Translate one piece of text to English with the OpenAI chat API.

    The blocking ``openai.chat.completions.create`` call is handed to the
    event loop's default executor so that several translations can be
    awaited concurrently.

    Args:
        text: Source text to translate. Anything that is not a non-blank
            string (for example ``None`` or the ``NaN`` pandas uses for an
            empty CSV cell) is treated as "nothing to translate".

    Returns:
        str: The model's reply with surrounding whitespace stripped, or ``""``
        when there was nothing to translate or the reply carried no content.
    """
    # Skip the API call for missing/blank cells instead of asking the model
    # to translate a placeholder value.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Inside a coroutine, get_running_loop() is the documented preference
    # over get_event_loop().
    loop = asyncio.get_running_loop()
    request = partial(
        openai.chat.completions.create,
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a translator."},
            {"role": "user", "content": f"Translate the following text to English: {text}"},
        ],
    )
    response = await loop.run_in_executor(None, request)

    # The message content can be None; normalise that to an empty string
    # rather than failing on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()

In [8]:
async def translate_dataframe(df, column_name):
    """Translate every value of one column and store the results on the frame.

    Args:
        df: DataFrame to read from; it is modified in place.
        column_name: Name of the column whose values are passed to
            ``translate_text``.

    Returns:
        None. The translations are written to ``df['translated_text']`` in
        the same order as the source column.
    """
    # Create one coroutine per cell up front, then await them together.
    pending = []
    for source_text in df[column_name]:
        pending.append(translate_text(source_text))

    results = await asyncio.gather(*pending)
    df['translated_text'] = results

In [9]:
# Run the translation coroutine to completion on the current event loop.
# translate_dataframe() writes its results to df['translated_text'] in place,
# so there is no return value to capture here.
loop = asyncio.get_event_loop()
loop.run_until_complete(translate_dataframe(df, 'text'))

In [10]:
# Preview the first rows, now including the 'translated_text' column, without
# dumping the whole frame.
df.head()

Unnamed: 0,title,date,link,text,translated_text
0,算法锐舞,5/20,https://mp.weixin.qq.com/s/3kbLFsL3-bEEc6XCd_fSsw,时间：6月1日周六，19:00-22:30\n地点：SYSTEM系统（淮海中路1327号云海...,"**Date and Time:** Saturday, June 1, 19:00-22:..."
1,2024国际即兴编程大会,5/13,https://mp.weixin.qq.com/s/lFGIMV-BEyND_9zb-z3buA,2024国际即兴编程大会\n2024年5月30日 - 2024年6月1日\n中国上海·上海纽...,**2024 International Conference on Live Coding...
2,关于本周五晚上活动的Visceral Realists,5/26,https://mp.weixin.qq.com/s/mbwzNNSYbzsu-Pj8fsSFIQ,\n佐那·赞合罗斯，本能现实主义的生命力来源之一，一名多媒体艺术家，常驻纽约的声音制造组合“...,"Sona Zanheroth, one of the powerhouses of Inst..."
3,这周末 (5.31-6.02),5.31,https://mp.weixin.qq.com/s/6vXpcunONsfrV6vxvwuBAQ,Date 时间: 5.31 8:30PM\nVenue 地点: Yuyintang 育音堂\...,**Date:** 5.31 8:30PM \n**Venue:** Yuyintang ...


In [11]:
# Connect to Elasticsearch. The host and the API key are both read with
# getpass, so neither value is echoed into the cell output or assigned to a
# notebook variable.
es_client = elasticsearch.Elasticsearch(
    getpass.getpass("Host: "),
    api_key=getpass.getpass("API Key: "),
)

Host:  ········
API Key:  ········


In [12]:
# Vector store over the "vr_tour_data" index, using OpenAI embeddings and the
# Elasticsearch client from the previous cell.
# NOTE(review): OpenAIEmbeddings() is given no credentials here — presumably
# it picks up the OpenAI API key from the environment; confirm.
embedding = OpenAIEmbeddings()
elastic_vector_search = ElasticsearchStore(
    index_name="vr_tour_data",
    es_connection=es_client,
    embedding=embedding,
)

In [13]:
# Build one Document per event rather than joining every translation into a
# single blob: chunks then never mix text from unrelated events, and each
# chunk keeps the title/date/link of the post it came from.
metadata_columns = [
    column for column in ('title', 'date', 'link') if column in df.columns
]

event_documents = []
for record in df.to_dict('records'):
    body = record['translated_text']
    # Skip rows with no usable translation (missing value or blank string).
    if not isinstance(body, str) or not body.strip():
        continue
    # Stringify metadata and drop missing values so every field is a plain
    # string.
    metadata = {
        column: str(record[column])
        for column in metadata_columns
        if pd.notna(record[column])
    }
    event_documents.append(Document(page_content=body, metadata=metadata))

text_splitter = CharacterTextSplitter(chunk_size=1100, chunk_overlap=0)
docs = text_splitter.split_documents(event_documents)

In [14]:
# Index the chunks in the vector store; the cell output is the list of IDs
# returned by add_documents (one per chunk).
# NOTE(review): re-running this cell presumably indexes the same chunks again
# under new IDs — confirm before re-executing against an existing index.
elastic_vector_search.add_documents(docs)

['be8a3965-5c95-47b6-a6ac-58b425b16f6c',
 '4478e7eb-fe0f-4a0d-bbea-991dbe04f9d5',
 'b3f2eaec-dbe4-44fd-91f8-0d28631e1883',
 'edab9ed5-b717-4684-877b-3c03b79ef980',
 '6218b115-255f-4d5b-8dcc-136f7262b94e',
 '00e91d8c-3b9d-43b1-9363-c1695544f462',
 'f8079d6c-6e91-45b9-94fb-41717ac07c16',
 'a3ec903a-f546-4082-b5d6-7f20b5ab09ae',
 '15734700-47aa-456b-8567-b038ec5466c6',
 'c1565efe-b4d6-4f8a-bcf7-a3974cff23b2',
 '0dff5fa0-09e3-4176-8e42-d9e7bb2249b9']

In [15]:
# First sample question for the similarity search: the live-coding conference.
query = "Tell me about the International Conference on Live Coding?"

In [16]:
# Similarity search for `query`; no k is passed, so the store's default
# result count applies.
results = elastic_vector_search.similarity_search(query)

In [17]:
# Print the raw search results for the first query.
print(results)



In [18]:
# Second sample question: events at the venue "System".
query_system = "What's happening at System?"

In [19]:
# Similarity search for `query_system`, asking for up to 10 results.
# NOTE(review): the add_documents output above lists 11 IDs, so k=10 returns
# almost the whole index — confirm that is intended.
system_results = elastic_vector_search.similarity_search(query_system, k=10)

In [20]:
# Print the raw search results for the second query.
print(system_results)

