In [None]:
# imports and constants
import openai
import json
import requests
import os   # 用于从环境变量中获取openai_api_key
from tqdm.notebook import tqdm  # for printing progress bars
import numpy as np
from redis import Redis
from redis.commands.search.query import Query
from redis.commands.search.field import (
    TextField,
    VectorField,
    NumericField
)
from redis.commands.search.indexDefinition import (
    IndexDefinition,
    IndexType
)
from IPython.display import clear_output, display, Markdown
import time
from datetime import datetime, date
# from data_prepare import openai_api_key  #统一使用 os.getenv("OPENAI_API_KEY")来获取密钥

# --- News search configuration ------------------------------------------------
# Earlier Bing News Search settings, kept for reference:
#   search_api_key = SEARCH_API_KEY
#   search_endpoint = "https://api.bing.microsoft.com/v7.0/news/search"
#   mkt = 'zh-CN'
# Example NewsAPI request parameters:
#   params = {
#       "q": "keyword",
#       "language": "zh",  # or another language code
#       "apiKey": newsapi_key
#   }
newsapi_endpoint = "https://newsapi.org/v2/everything"
newsapi_key = os.environ.get("NEWSAPI_KEY")  # NewsAPI key, read from the environment
language = "zh"

# --- OpenAI configuration -----------------------------------------------------
# Previously: openai.api_key = OPENAI_API_KEY
GPT_MODEL = "gpt-4"
openai_api_key = os.environ.get("OPENAI_API_KEY")

# --- Redis vector index -------------------------------------------------------
# Previously: INDEX_NAME = "IsraelHamasNews"
DISTANCE_METRIC = "COSINE"  # distance metric for the vectors (ex. COSINE, IP, L2)
VECTOR_DIM = 1536
INDEX_NAME = "Iran"



In [2]:
# Helper functions
def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line ("```" or "```json") ...
    _, newline, remainder = text.partition("\n")
    body = remainder if newline else text[3:]
    # ... and everything from the closing fence onwards (if there is one).
    return body.rsplit("```", 1)[0].strip()


def json_gpt(input: str):
    """Send ``input`` to the chat model and return its reply parsed as JSON.

    The model is asked (via the system message) to output only valid JSON.
    Chat models nevertheless sometimes wrap the payload in a Markdown code
    fence, so a surrounding fence is removed before parsing.

    Args:
        input: The user prompt to send to ``GPT_MODEL``.

    Returns:
        The decoded JSON value (whatever ``json.loads`` yields for the reply).

    Raises:
        json.JSONDecodeError: If the reply is still not valid JSON after the
            fence has been stripped.
    """
    completion = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=[
            {"role": "system", "content": "Output only valid JSON"},
            {"role": "user", "content": input},
        ],
        temperature=0.5,
    )

    text = completion.choices[0].message.content
    return json.loads(_strip_code_fence(text))

from typing import List
def embeddings(input: List[str]) -> List[List[float]]:
    """Embed a batch of texts with ``text-embedding-ada-002``.

    Args:
        input: The texts to embed.

    Returns:
        One embedding (a list of floats) per input text, in the order the
        API returns them. An empty ``input`` yields an empty list without
        calling the API.
    """
    if not input:
        # Nothing to embed; avoid sending an empty request.
        return []
    response = openai.Embedding.create(model="text-embedding-ada-002", input=input)
    return [data.embedding for data in response.data]

In [5]:
# QUERIES_INPUT = """
# You have access to a search API that returns recent news articles.
# The background is Israeli-Palestinian situation and recent Israeli-Hamas conflict began on Oct. 7, 2023.
# Generate an array of search queries that are relevant to this topic both in English and Simplified Chinese.
# Use a variation of related keywords for the queries, trying to be as general as possible.
# Include as many queries as you can think of, including and excluding terms.
# For example, include queries like ['keyword_1 keyword_2', 'keyword_1', 'keyword_2'].

# Format: {{"queries": ["query_1", "query_2", "query_3"]}}
# """

# Prompt asking the model for a list of news-search queries about the topic.
# This is a plain (non-f) string, so braces are written once: doubled braces
# are only an escape inside f-strings and would otherwise reach the model
# literally, showing it a malformed JSON example.
QUERIES_INPUT = """
You have access to a search API that returns recent news articles.
The background is Iran.
Generate an array of search queries that are relevant to this topic both in English and Simplified Chinese.
Use a variation of related keywords for the queries, trying to be as general as possible.
Include as many queries as you can think of, including and excluding terms.
For example, include queries like ['keyword_1 keyword_2', 'keyword_1', 'keyword_2'].

Format: {"queries": ["query_1", "query_2", "query_3"]}
"""

# The reply is expected to be a JSON object with a "queries" list.
queries = json_gpt(QUERIES_INPUT)["queries"]

queries

['Israeli-Palestinian conflict 2023',
 'Israeli-Hamas conflict 2023',
 'Israel Palestine conflict news',
 'Israel Hamas conflict news',
 '2023 Israel Gaza conflict',
 'Israel Palestine situation October 2023',
 'Israeli-Hamas war 2023',
 'Israel and Palestine 2023',
 'Israel and Hamas 2023',
 'Israel Gaza war 2023',
 '2023 Middle East conflict',
 '2023 Middle East war',
 '以色列-巴勒斯坦冲突2023',
 '以色列-哈马斯冲突2023',
 '以色列 巴勒斯坦 冲突 新闻',
 '以色列 哈马斯 冲突 新闻',
 '2023年以色列加沙冲突',
 '以色列巴勒斯坦局势2023年10月',
 '以色列-哈马斯战争2023',
 '以色列和巴勒斯坦2023',
 '以色列和哈马斯2023',
 '以色列加沙战争2023',
 '2023年中东冲突',
 '2023年中东战争']

In [None]:

# Query NewsAPI once per search phrase and collect the hits in `articles`.
# Each hit is reduced to the three keys the rest of the notebook reads:
# "url", "description" and "datePublished".
articles = []
# NewsAPI authenticates through the "X-Api-Key" header (or an `apiKey` query
# parameter); "Ocp-Apim-Subscription-Key" is the Bing Search header.
headers = {"X-Api-Key": newsapi_key}
clear_output(wait=True)
for query in tqdm(queries):
    # NewsAPI calls the page-size parameter `pageSize` (Bing called it `count`).
    params = {"q": query, "language": language, "pageSize": 20}
    # A timeout keeps one stalled request from hanging the whole cell.
    response = requests.get(newsapi_endpoint, headers=headers, params=params, timeout=30)
    response.raise_for_status()  # HTTP errors propagate and stop the cell
    # NewsAPI returns its hits under "articles" (Bing used "value").
    hits = response.json().get("articles", [])
    for hit in hits:
        published = hit.get("publishedAt")
        # Entries without a URL, timestamp or description cannot be
        # de-duplicated, dated or embedded by the later cells, so drop them.
        if not (hit.get("url") and published and hit.get("description")):
            continue
        # Re-encode the timestamp ("2023-10-11T07:14:00Z") in the
        # 7-fraction-digit UTC form ("2023-10-11T07:14:00.0000000Z") that the
        # indexing cell parses from `datePublished`.
        published_at = datetime.strptime(published[:19], "%Y-%m-%dT%H:%M:%S")
        articles.append({
            "url": hit["url"],
            "description": hit["description"],
            "datePublished": published_at.strftime("%Y-%m-%dT%H:%M:%S.%f") + "0Z",
        })
    # Report a count instead of dumping every raw result into the output.
    print(f"{query}: {len(hits)} articles")

In [8]:
# Remove duplicates: keep a single article per URL (for repeated URLs the
# last occurrence is the one that is kept).
articles = list({article["url"]: article for article in articles}.values())

print("Total number of articles:", len(articles))

for article in articles:
    # "description" is optional (the indexing cell skips articles that lack
    # it), so read it with .get() instead of risking a KeyError here.
    print(article["datePublished"], article["url"], article.get("description"))
    print()


Total number of articles: 327
2023-10-11T07:14:00.0000000Z https://themalaysianreserve.com/2023/10/11/israeli-palestinian-conflict-over-100-years-of-history/ The Israeli-Palestinian conflict, where Hamas is carrying out an unprecedented offensive in Israel, has its roots in the late 19th century.Jews fleeing anti-Semitism in Russia and central Europe began emigrating to Palestine.

2023-10-11T05:12:00.0000000Z https://www.sbs.com.au/news/article/the-israeli-palestinian-conflict-explained/kzuibqf33 The bloody 1948 Arab–Israeli War saw 700,000 Palestinians flee their homes - a mass exodus known as the 'Nakba', Arabic for 'catastrophe'. The war saw 700,000 Palestinians displaced in a mass exodus and Israel gained control of most of the disputed territory, with the exception of the West Bank and Gaza Strip.

2023-10-11T07:17:00.0000000Z https://www.radio.gov.pk/08-10-2023/world-reacts-to-ongoing-palestinian-israeli-conflict Many countries, including Turkiye, Oman, the UAE, Egypt, Brazil, R

In [10]:
# Create search index
r = Redis(host='localhost', port=6379, db=0)  # explicitly use the default parameters for clarity

# Vector field stored in a FLAT index of FLOAT32 vectors.
embedding_field = VectorField(
    "embedding",
    "FLAT",
    {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC,
    },
)

# One RediSearch field per hash attribute written by the indexing cell.
# This is where you should add any additional metadata you want to capture.
fields = [
    TextField("datePublished"),
    NumericField("timeStamp"),
    TextField("url"),
    TextField("description"),
    embedding_field,
]

# Document keys are written as "<INDEX_NAME>:<url>", so the prefix includes the
# ":" delimiter. Without it an index named "Iran" would also claim (and, on
# dropindex(delete_documents=True), delete) keys such as "IranNews:...".
indexDefinition = IndexDefinition(prefix=[f"{INDEX_NAME}:"], index_type=IndexType.HASH)

# Best effort: the index may not exist yet on a first run.
try:
    r.ft(INDEX_NAME).dropindex(delete_documents=True)
except Exception as e:
    print(e)
try:
    r.ft(INDEX_NAME).create_index(fields=fields, definition=indexDefinition)
except Exception as e:
    print(e)
print(r.ft(INDEX_NAME).info())

{'index_name': 'IsraelHamasNews', 'index_options': [], 'index_definition': [b'key_type', b'HASH', b'prefixes', [b'IsraelHamasNews'], b'default_score', b'1'], 'attributes': [[b'identifier', b'datePublished', b'attribute', b'datePublished', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'timeStamp', b'attribute', b'timeStamp', b'type', b'NUMERIC'], [b'identifier', b'url', b'attribute', b'url', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'description', b'attribute', b'description', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'embedding', b'attribute', b'embedding', b'type', b'VECTOR']], 'num_docs': '0', 'max_doc_id': '0', 'num_terms': '0', 'num_records': '0', 'inverted_sz_mb': '0', 'vector_index_sz_mb': '0.00818634033203125', 'total_inverted_index_blocks': '26042', 'offset_vectors_sz_mb': '0', 'doc_table_size_mb': '0', 'sortable_values_size_mb': '0', 'key_table_size_mb': '0', 'geoshapes_sz_mb': '0', 'records_per_doc_avg': 'nan', 'bytes_per_record_avg': 'nan', 'offs

In [12]:
def _published_epoch(published: str) -> float:
    """Convert a UTC timestamp such as ``2023-10-11T07:14:00.0000000Z`` to epoch seconds.

    The fractional part is optional and may be longer than the six digits
    ``strptime``'s ``%f`` accepts; extra digits are truncated.
    """
    stamp, _, fraction = published.rstrip("Zz").partition(".")
    moment = datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S")
    micros = int(fraction[:6].ljust(6, "0")) if fraction else 0
    # The naive datetime holds a UTC wall-clock time. Subtracting the (naive)
    # UTC epoch avoids the local-time-zone interpretation time.mktime applies.
    return (moment.replace(microsecond=micros) - datetime(1970, 1, 1)).total_seconds()


# Embed every article description and store it as a Redis hash under
# "<INDEX_NAME>:<url>" so the search index picks it up.
processed = 0
succeeded = 0
clear_output(wait=True)
for article in tqdm(articles):
    processed += 1
    text = article.get("description")
    if not text:
        # Missing, None or empty descriptions cannot be embedded.
        continue
    # Create embedding with GPT(ada)
    vector = openai.Embedding.create(input=text, model="text-embedding-ada-002")["data"][0]["embedding"]
    # Prepare embedding vector for RediSearch (raw FLOAT32 bytes)
    vector_bytes = np.array(vector).astype(np.float32).tobytes()
    published_epoch = _published_epoch(article["datePublished"])
    # Add to RediSearch index
    key = f'{INDEX_NAME}:{article["url"]}'
    r.hset(
        key,
        mapping={
            'datePublished': article["datePublished"],
            'timeStamp': published_epoch,
            'url': article["url"],
            'description': text,
            'embedding': vector_bytes,
        },
    )
    succeeded += 1
print(processed, "->", succeeded)

  0%|          | 0/327 [00:00<?, ?it/s]

327 -> 327


In [3]:
# Count the hashes stored under this index's key prefix.
r = Redis()
# Build the pattern from INDEX_NAME so it always matches the keys written as
# "<INDEX_NAME>:<url>"; a hard-coded prefix silently counts 0 after a rename.
indexes = r.execute_command("KEYS", f"{INDEX_NAME}:*")
print(len(indexes))

327


## The following sections are used to test and debug the prepared dataset

In [62]:
# The question under test
user_question = "test"

# Today's date, spelled out the way the prompt presents it
today_text = date.today().strftime("%A, %B %d, %Y")

# Prompt asking the model for a placeholder-only ("hypothetical") answer
HA_INPUT = f"""
Generate a hypothetical answer to the user's question. This answer will be used to rank search results. 
Pretend you have all the information you need to answer, but don't use any actual facts. Instead, use placeholders
like NAME did SOMETHING, or NAME said SOMETHING at PLACE. 
Today is {today_text}. You can decide whether to include the date in the hypothetical answer according to the user's question.

User question: {user_question}

Format: {{"hypotheticalAnswer": "hypothetical answer text"}}
"""

# The reply is expected to be a JSON object carrying "hypotheticalAnswer"
ha_response = json_gpt(HA_INPUT)
hypothetical_answer = ha_response["hypotheticalAnswer"]

hypothetical_answer

'NAME performed a test on SOMETHING at PLACE'

In [63]:
# Embed the raw user question and pack it as FLOAT32 bytes for RediSearch.
# NOTE(review): `hypothetical_answer` from the previous cell is not used here;
# confirm whether it was meant to be embedded instead of the question.
embedding_response = openai.Embedding.create(input=user_question, model="text-embedding-ada-002")
query_embedding = embedding_response["data"][0]["embedding"]
query_vec = np.array(query_embedding).astype(np.float32).tobytes()

# Take the 20 nearest neighbours by vector distance, then order them newest first
query_base = (
    Query("*=>[KNN 20 @embedding $vec as score]")
    .sort_by("timeStamp", asc=False)
    .paging(0, 20)
    .return_fields("score", "url", "datePublished", "description")
    .dialect(2)
)
query_param = {"vec": query_vec}
query_results = r.ft(INDEX_NAME).search(query_base, query_param).docs
print(query_results)

# Render the hits as plain text for the answer prompt
formatted_result = "".join(
    f"URL: {query_result['url']}\nDate: {query_result['datePublished']}\nContent: {query_result['description']}\n\n"
    for query_result in query_results
)
print(formatted_result)

[Document {'id': 'IsraelHamasNews:https://www.sandiegouniontribune.com/news/nation-world/story/2023-10-11/ap-photos-protests-by-pro-israel-and-pro-palestinian-demonstrators-span-the-world-as-war-escalates', 'payload': None, 'score': '0.248413264751', 'url': 'https://www.sandiegouniontribune.com/news/nation-world/story/2023-10-11/ap-photos-protests-by-pro-israel-and-pro-palestinian-demonstrators-span-the-world-as-war-escalates', 'datePublished': '2023-10-11T08:27:00.0000000Z', 'description': 'From Bangladesh to Las Vegas and Brazil to Rome, demonstrations by supporters of Israel and the Palestinians were held around the world as people took to the streets to expresses their views and often outrage as the war escalated between Israel and Hamas militants. Demonstrators have taken to the streets of Rome, Barcelona, Brasilia ...'}, Document {'id': 'IsraelHamasNews:https://english.mathrubhumi.com/news/world/israel-strikes-in-sealed-off-gaza-as-war-appears-set-to-escalate-1.8977521', 'payload

In [64]:
# Today's date, spelled out the way the prompt presents it
today_text = date.today().strftime("%A, %B %d, %Y")

# Prompt combining the retrieved articles with the user's question
ANSWER_INPUT = f"""
Write an answer in Chinese to the user's question based on the given search results. 
SEARCH_RESULTS: {formatted_result}
USER_QUESTION: {user_question}

Today is {today_text}. You can use this date to filter the search results according to the user's question.
Include as much information as possible in the answer. List the reference search result URLs at the end of your answer.
"""

answer_messages = [{"role": "user", "content": ANSWER_INPUT}]
completion = openai.ChatCompletion.create(
    model=GPT_MODEL,
    messages=answer_messages,
    temperature=0.5,
)

# Render the answer as Markdown, then report the token usage of the call
answer_markdown = completion.choices[0].message.content
display(Markdown(answer_markdown))
token_usage = completion.usage
print(token_usage.prompt_tokens, token_usage.completion_tokens)

对不起，你的问题“test”我不能理解，能否请你提供更多的信息？

2213 26


In [57]:
# Debug summary: token usage of the last completion, index statistics,
# and the number of hits returned by the last search.
token_usage = completion.usage
print(token_usage.prompt_tokens, token_usage.completion_tokens)
index_info = r.ft(INDEX_NAME).info()
print(index_info)
print(len(query_results))

4758 331
{'index_name': 'IsraelHamasNews', 'index_options': [], 'index_definition': [b'key_type', b'HASH', b'prefixes', [b'IsraelHamasNews'], b'default_score', b'1'], 'attributes': [[b'identifier', b'datePublished', b'attribute', b'datePublished', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'timeStamp', b'attribute', b'timeStamp', b'type', b'NUMERIC'], [b'identifier', b'url', b'attribute', b'url', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'description', b'attribute', b'description', b'type', b'TEXT', b'WEIGHT', b'1'], [b'identifier', b'embedding', b'attribute', b'embedding', b'type', b'VECTOR']], 'num_docs': '327', 'max_doc_id': '327', 'num_terms': '3689', 'num_records': '11069', 'inverted_sz_mb': '0.062445640563964844', 'vector_index_sz_mb': '6.0223388671875', 'total_inverted_index_blocks': '30002', 'offset_vectors_sz_mb': '0.010914802551269531', 'doc_table_size_mb': '0.048697471618652344', 'sortable_values_size_mb': '0', 'key_table_size_mb': '0.014215469360351563'