In [1]:
import pandas as pd
import asyncio
from langchain_core.documents import Document
import openai
import nest_asyncio
from langchain.text_splitter import CharacterTextSplitter

In [2]:
from langchain_elasticsearch import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings
import elasticsearch
import getpass
from functools import partial
from datasets import load_dataset

In [3]:
# Patch asyncio so an event loop can be re-entered. A later cell drives the
# translation coroutine with loop.run_until_complete(...).
# NOTE(review): presumably needed because the notebook kernel already has a
# running loop — confirm for the target environment.
nest_asyncio.apply()

In [4]:
# Load the raw tour announcements into a DataFrame.
# NOTE(review): this is an absolute path into one user's home directory, so
# the notebook only runs on that machine — consider a relative/configurable path.
tour_data_csv = '/Users/jessgarson/Downloads/tour_data.csv'
ds = pd.read_csv(tour_data_csv)

In [5]:
# Preview only the first rows rather than rendering the whole frame, so the
# saved output stays small however large the CSV grows.
ds.head()

Unnamed: 0,title,date,link,text
0,算法锐舞,5/20,https://mp.weixin.qq.com/s/3kbLFsL3-bEEc6XCd_fSsw,时间：6月1日周六，19:00-22:30\n地点：SYSTEM系统（淮海中路1327号云海...
1,2024国际即兴编程大会,5/13,https://mp.weixin.qq.com/s/lFGIMV-BEyND_9zb-z3buA,2024国际即兴编程大会\n2024年5月30日 - 2024年6月1日\n中国上海·上海纽...
2,关于本周五晚上活动的Visceral Realists,5/26,https://mp.weixin.qq.com/s/mbwzNNSYbzsu-Pj8fsSFIQ,\n佐那·赞合罗斯，本能现实主义的生命力来源之一，一名多媒体艺术家，常驻纽约的声音制造组合“...
3,这周末 (5.31-6.02),5.31,https://mp.weixin.qq.com/s/6vXpcunONsfrV6vxvwuBAQ,Date 时间: 5.31 8:30PM\nVenue 地点: Yuyintang 育音堂\...


In [6]:
async def translate_text(text):
    """Translate ``text`` to English with the OpenAI chat completions API.

    The blocking ``openai.chat.completions.create`` call is handed to the
    event loop's default executor, so several translations can be awaited
    concurrently (for example with ``asyncio.gather``).

    Args:
        text: The source text. ``None``, a float NaN (what pandas yields for
            a missing cell) and blank strings count as "nothing to translate".

    Returns:
        The translated text with surrounding whitespace stripped, or ``""``
        when there was nothing to translate or the response had no content.
    """
    # Missing cell: skip the API round trip. NaN is the only float that is
    # not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    if not text.strip():
        return ""

    # Inside a coroutine the running loop is the one to schedule work on.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(
        None,  # default thread-pool executor
        partial(
            openai.chat.completions.create,
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a translator."},
                {
                    "role": "user",
                    "content": f"Translate the following text to English: {text}",
                },
            ],
        ),
    )
    # The message content is optional in the API response; guard before stripping.
    translated_text = response.choices[0].message.content
    return (translated_text or "").strip()

In [7]:
async def translate_dataframe(ds, column_name):
    """Translate every value of one DataFrame column concurrently.

    One ``translate_text`` coroutine is created per value of
    ``ds[column_name]`` and all of them are awaited together. The results
    are written to a new column named ``"translated_" + column_name``.

    Args:
        ds: Frame holding the text; it is modified in place.
        column_name: Name of the column whose values are translated.

    Returns:
        The same ``ds`` object, now carrying the translated column.
    """
    # gather() returns results in the order the awaitables were passed,
    # so they line up with the rows of the source column.
    results = await asyncio.gather(*map(translate_text, ds[column_name]))
    ds["translated_" + column_name] = results
    return ds

In [8]:
# Drive the translation coroutine to completion on the event loop. The
# returned frame (ds with the added "translated_text" column) is the cell's
# last expression, so it is displayed.
loop = asyncio.get_event_loop()
translation_job = translate_dataframe(ds, "text")
loop.run_until_complete(translation_job)

Unnamed: 0,title,date,link,text,translated_text
0,算法锐舞,5/20,https://mp.weixin.qq.com/s/3kbLFsL3-bEEc6XCd_fSsw,时间：6月1日周六，19:00-22:30\n地点：SYSTEM系统（淮海中路1327号云海...,"Time: Saturday, June 1, 19:00-22:30 \nLocatio..."
1,2024国际即兴编程大会,5/13,https://mp.weixin.qq.com/s/lFGIMV-BEyND_9zb-z3buA,2024国际即兴编程大会\n2024年5月30日 - 2024年6月1日\n中国上海·上海纽...,2024 International Conference on Live Coding ...
2,关于本周五晚上活动的Visceral Realists,5/26,https://mp.weixin.qq.com/s/mbwzNNSYbzsu-Pj8fsSFIQ,\n佐那·赞合罗斯，本能现实主义的生命力来源之一，一名多媒体艺术家，常驻纽约的声音制造组合“...,Zona Zangerose is one of the sources of vitali...
3,这周末 (5.31-6.02),5.31,https://mp.weixin.qq.com/s/6vXpcunONsfrV6vxvwuBAQ,Date 时间: 5.31 8:30PM\nVenue 地点: Yuyintang 育音堂\...,Date: 5.31 8:30PM \nVenue: Yuyintang \nLineu...


In [9]:
# Connect to Elasticsearch. Both the host and the API key are prompted for
# with getpass so neither is echoed nor stored in the notebook.
es_client = elasticsearch.Elasticsearch(
    hosts=getpass.getpass("Host: "),
    api_key=getpass.getpass("API Key: "),
)

Host:  ········
API Key:  ········


In [10]:
# Start from an empty index on every run: drop the index if it is already
# there, then create it again.
# NOTE(review): the index is created here without explicit mappings — confirm
# the vector store still maps its embedding field as intended when the index
# already exists.
index_name = "vr-tour-data-upload"
indices = es_client.indices

index_already_present = indices.exists(index=index_name)
if index_already_present:
    indices.delete(index=index_name)
    print(f"Deleted existing index: {index_name}")

indices.create(index=index_name)
print(f"Created new index: {index_name}")

Deleted existing index: vr-tour-data-upload
Created new index: vr-tour-data-upload


In [11]:
# Embedding model used to vectorise the translated chunks.
embedding_model_name = "text-embedding-3-large"
embedding = OpenAIEmbeddings(model=embedding_model_name)

In [12]:
# Vector store backed by the index that was just (re)created. Reuse
# index_name rather than repeating the literal, so the store cannot drift
# from the index the reset cell manages.
elastic_vector_search = ElasticsearchStore(
    index_name=index_name,
    es_connection=es_client,
    embedding=embedding,
)

In [13]:
# Chunking parameters: up to 2000 characters per chunk, 50 characters of overlap.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=50)

# All translated rows are joined into one newline-separated text and wrapped
# in a single Document before splitting.
# NOTE(review): joining the rows drops the per-row title/date/link, so the
# resulting chunks carry no source metadata — confirm that is intended.
translated_texts = ds["translated_text"].tolist()
combined_text = "\n".join(translated_texts)
combined_document = Document(page_content=combined_text)
docs = text_splitter.split_documents([combined_document])

In [14]:
# Embed and index the chunks; the value returned by add_documents (the IDs
# shown below) is displayed as the cell output.
indexed_ids = elastic_vector_search.add_documents(docs)
indexed_ids

['75b3a191-7956-4990-ad33-4eec1b650944',
 'cc17836b-d0a2-44ba-bf00-44c592dc0cb5',
 'b694c205-529c-4da4-956e-2ff4c9762274',
 '5c506031-e733-4769-a7ee-f5e642a4e724',
 'e50ae4af-0ece-4bb5-993f-d71b2c504738',
 'cdc61f66-5c6d-4213-9547-4478f8e1ab60',
 '9f1f9358-0545-4118-9a00-4988eeeb4245']