# Fetch tweets from X and save them to graphRAG. Then talk with the data.

In [1]:
# import libraries
import dotenv
import os
import json

In [3]:
import requests
import datetime

In [5]:
from openai import OpenAI

In [7]:
from langchain_community.graphs import Neo4jGraph

In [9]:
client = OpenAI()

In [11]:
# load environment variables
load_status = dotenv.load_dotenv(".env")
if load_status is False:
    raise RuntimeError('Environment variables are not found.')

In [13]:
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
BEARER_TOKEN = os.getenv("BEARER_TOKEN")

In [15]:
print(NEO4J_URI)
print(NEO4J_USERNAME)
print(NEO4J_PASSWORD)
print(NEO4J_DATABASE)

neo4j://localhost:7687
neo4j
neo4j123
Neo4j


In [17]:
VECTOR_INDEX_NAME = 'tweets'

In [21]:
# connect to graph database
graph = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

In [21]:
graph.query("""
  SHOW VECTOR INDEXES
  """
)

[]

In [23]:
# create header for X service
def create_headers(bearer_token):
  return {
    "Authorization": "Bearer {}".format(bearer_token)
  }

In [25]:
# get tweets from X using start time and end time
def get_tweets(query, max_results=10, start_time=None, end_time=None):
  url = "https://api.twitter.com/2/tweets/search/recent"
  headers = create_headers(BEARER_TOKEN)
  params = {
    "query": query,
    "max_results": max_results,
    "tweet.fields": "created_at,author_id,text"
  }
  if start_time:
    params["start_time"] = start_time
  if end_time:
    params["end_time"] = end_time
  response = requests.get(url, headers=headers, params=params)
  if response.status_code != 200:
    raise Exception(response.status_code, response.text)
  return response.json()

In [27]:
# clean tweet and remove commercial tweets
def clean_tweet(tweet):
    messages = []
    system_prompt = "You assist user to clean the tweet and produce a json output. Do not output anything other than json. Do not use markup language."
    messages.append(
                    {
                        "role": "system", 
                        "content": system_prompt
                    })
    prompt_template = """The tweet is the following: {tweet}  Can you summarize the tweet in English and remove all the extra characters, emojis and links. If the tweet is promoting or selling something, return an empty string. Set "text=" to the sanitized tweet. If "text" contains predictions, set "category"="Yes", otherwise set "category"="No" Returns a json with keyword text and category and values ​​"text" and "category" that can be read by json.load()"""
    messages.append(
                    {
                        "role": "user", 
                        "content": prompt_template.format(tweet=tweet)
                    })
    # Openai response generation
    response = client.chat.completions.create(
                #model = "gpt-4o-2024-05-13",
                model = "gpt-4o-mini",
                messages = messages,
                temperature = 0.1,
                max_tokens = 2048,
                top_p = 1.0)
    return response.choices[0].message.content

In [29]:
# create tweet to graph database
create_tweets = """
MERGE (chunk:Tweet {id: $tweet.id})
ON CREATE SET 
    chunk.text = $tweet.text,
    chunk.created_at = $tweet.created_at,
    chunk.author_id = $tweet.author_id
RETURN chunk
"""

In [31]:
graph.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Tweet) REQUIRE c.id IS UNIQUE
""")

[]

In [33]:
# create a node fow a query word and link it to tweets
def create_link_to_queryword(queryword, tweetid):
    # Check if the query node exists
    queryword_query = "MATCH (querynode:Queryword {name: $name}) RETURN querynode"
    querynodes = graph.query(queryword_query, params={"name": queryword})

    if not querynodes:
        print("creating query node")
        create_query_node = """
        MERGE (querynode:Queryword {name: $name})
        RETURN querynode
        """
        graph.query(create_query_node, params={"name": queryword})

    # Create the QUERY_RESULT relationship between tweet and queryword
    create_query_relationship_query = """
    MATCH (querynode:Queryword {name: $name}), (tweet:Tweet {id: $tweetid})
    MERGE (querynode)-[:QUERY_RESULT]->(tweet)
    """
    graph.query(create_query_relationship_query, params={"name": queryword, "tweetid": tweetid})


In [35]:
# populate graph database 
def populate_database(start_time, end_time):
  # Change the query to search for tweets from Nvidia within the last week
  queryword = "nvidia"
  query = queryword + " -is:retweet -is:reply -url:links"
  max_results = 100
  tweets = get_tweets(query, max_results, start_time=start_time, end_time=end_time)
  count = 1
  for tweet in tweets["data"]:
    print(count)
    #print(f"Tweet: {tweet['text']}")
    print(f"Tweet id: {tweet['id']}")
    print(f"Created at: {tweet['created_at']}")
    #print(f"Author ID: {tweet['author_id']}")
    graph.query(create_tweets, params={'tweet': tweet})
    create_link_to_queryword(queryword, tweet['id'])
    count = count +1

    print("-" * 20)

In [61]:
# loop that fetches tweets from one day
start_time = datetime.datetime.strptime("2024-08-28T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")
previous_time = "2024-08-28T00:00:00Z"
for i in range(144):
  timestamp = start_time + datetime.timedelta(minutes=10 * (i+1))
  timestamp_str = timestamp.strftime("%Y-%m-%dT%H:%M:%SZ")
  print(timestamp_str)
  populate_database(previous_time,timestamp_str)
  previous_time = timestamp_str

2024-08-28T23:30:00Z
1
Tweet id: 1828938096001405285
Created at: 2024-08-28T23:29:56.000Z
--------------------
2
Tweet id: 1828938092574806221
Created at: 2024-08-28T23:29:55.000Z
--------------------
3
Tweet id: 1828938088544121120
Created at: 2024-08-28T23:29:54.000Z
--------------------
4
Tweet id: 1828938088032165914
Created at: 2024-08-28T23:29:54.000Z
--------------------
5
Tweet id: 1828938085985325526
Created at: 2024-08-28T23:29:54.000Z
--------------------
6
Tweet id: 1828938070428688418
Created at: 2024-08-28T23:29:50.000Z
--------------------
7
Tweet id: 1828938044294013141
Created at: 2024-08-28T23:29:44.000Z
--------------------
8
Tweet id: 1828938037830791381
Created at: 2024-08-28T23:29:42.000Z
--------------------
9
Tweet id: 1828938020877414693
Created at: 2024-08-28T23:29:38.000Z
--------------------
10
Tweet id: 1828938020093104488
Created at: 2024-08-28T23:29:38.000Z
--------------------
11
Tweet id: 1828938018117341515
Created at: 2024-08-28T23:29:38.000Z
--------

In [37]:
# make text embedding for cleaned tweet
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding


In [39]:
# create author and relationships between author and tweet
def create_author_and_tweet_relationships(authorid, tweetid):
    # Create the relationship between Tweet and Author
    create_relationship_query = """
    MATCH (tweet:Tweet {id: $tweetid}), (author:Author {id: $authorid})
    MERGE (author)-[:POSTED]->(tweet)
    """
    graph.query(create_relationship_query, params={"tweetid": tweetid, "authorid": authorid})

    # Create the BELONGS_TO relationship between Tweet and Author
    create_belongs_to_relationship_query = """
    MATCH (tweet:Tweet {id: $tweetid}), (author:Author {id: $authorid})
    MERGE (tweet)-[:BELONGS_TO]->(author)
    """
    graph.query(create_belongs_to_relationship_query, params={"tweetid": tweetid, "authorid": authorid})
    

In [43]:
# fetch tweets from the graph database and clean them
def clean_text_and_embedd(number_of_rec):
    # Fetch chunks where the tweets are not cleaned
    query = "MATCH (chunk:Tweet) WHERE chunk.cleantext IS NULL RETURN chunk"
    chunks = graph.query(query)

    count = 1
    for record in chunks:
    #    print(count)
        chunk = record['chunk']
        tweetid = chunk['id']
        text = chunk['text']
        print("Original tweet: ")
        print("")
        print(text)
        authorid = chunk['author_id']
        cleaned_tweet = clean_tweet(text)
        print("")
        print("Tweet after cleaning:")
        print("")
        print(cleaned_tweet)
        cleantweet = json.loads(cleaned_tweet)
        cleantext = cleantweet.get("text")
        category = cleantweet.get("category")
        vector = ""
        if cleantext != "":
            vector = get_embedding(cleantext)

        # Update the chunk with the encoded vector
        update_query = """
        MATCH (chunk:Tweet {id: $tweetid})
        SET chunk.cleantext = $cleantext
        SET chunk.category = $category
        SET chunk.textEmbedding = $vector
        """
        graph.query(update_query, params={"tweetid": tweetid, "cleantext": cleantext, "category": category, "vector": vector})

        create_author(authorid)
        create_author_and_tweet_relationships(authorid, tweetid)
        if count == number_of_rec:
            break;
        count = count +1 
        print("-" * 20)


In [63]:
number_of_rec = 6000
clean_text_and_embedd(number_of_rec)

In [45]:
# create author to graph database
create_author = """
MERGE (chunk:Author {id: $chunkParam.id})
ON CREATE SET 
    chunk.name = $chunkParam.name,
    chunk.username = $chunkParam.username,
RETURN chunk
"""

In [77]:
graph.query("""
CREATE CONSTRAINT unique_author IF NOT EXISTS 
    FOR (c:Author) REQUIRE c.id IS UNIQUE
""")

[]

In [47]:
# find a user data from X using the user id
def get_user_by_id(user_id):
  url = f"https://api.twitter.com/2/users/{user_id}"
  headers = create_headers(BEARER_TOKEN)
  response = requests.get(url, headers=headers)
  if response.status_code != 200:
    raise Exception(response.status_code, response.text)
  return response.json()

In [49]:
# create author to graph database
def create_author(authorid):

  #  print("creating author: ",authorid)
    author_query = "MATCH (author:Author {id: $authorid}) RETURN author"
    authors = graph.query(author_query, params={"authorid": authorid})

    if not authors:
        user_data = get_user_by_id(authorid)
   #     print(user_data)
        create_author_query = """
        MERGE (author:Author {id: $authorid})
        ON CREATE SET 
            author.name = $name,
            author.username = $username
        RETURN author
        """
            
        graph.query(create_author_query, params={"authorid": authorid, "name": user_data['data']['name'], "username": user_data['data']['username']})

        parent_query = "MATCH (parentnode:X) RETURN parentnode"
        parentnodes = graph.query(parent_query)
        if not parentnodes:
            print("creating author's parent")
            create_parent_node = """
            MERGE (parentnode:X {name: $name})
            RETURN parentnode
            """
            graph.query(create_parent_node, params={"name": "X"})

        # Create the USER_OF relationship between Author and X
        create_user_relationship_query = """
        MATCH (parent:X {name: $name}), (author:Author {id: $authorid})
        MERGE (author)-[:USER_OF]->(parent)
        """
        graph.query(create_user_relationship_query, params={"name":"X", "authorid": authorid})

In [243]:
# create node X if not yet existing in graph databse
parent_query = "MATCH (parentnode:X) RETURN parentnode"
parentnodes = graph.query(parent_query)
if not parentnodes:
    print("creating author's parent")
    create_parent_node = """
    MERGE (parentnode:X {name: $name})
    RETURN parentnode
    """
    graph.query(create_parent_node, params={"name": "X"})

In [None]:
# create vector index
graph.query("""
         CREATE VECTOR INDEX `tweets` IF NOT EXISTS
          FOR (t:Tweet) ON (t.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")

In [159]:
# make a search to graph database
def neo4j_vector_search(question, start_time, end_time, category):

    # Encode the question to get the embedding
    question_embedding = get_embedding(question)

    # Perform the vector search using the encoded embedding
    vector_search_query = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) YIELD node, score
    WHERE node.created_at >= $start_time AND node.created_at <= $end_time
    AND node.category IN $category
    RETURN score, node.created_at AS date, node.text AS text
    """
    similar = graph.query(vector_search_query, 
                       params={
                        'question_embedding': question_embedding, 
                        'index_name': VECTOR_INDEX_NAME, 
                        'top_k': 100,
                        'start_time': start_time,
                        'end_time': end_time,
                        'category': category
                       })
    return similar

In [153]:
# create LLM answer to question
def create_answer(question, start_time, end_time, category):

    # first find the best answers from graph database
    search_results = neo4j_vector_search(question, start_time, end_time, category)
 

    messages = []
    system_instructions = """You assist user to create summaries based on the CONTEXT: documents. Use vector index scores, 'score' in json, to find the most relevant answer to user question. If you do not know, answer 'I don't know.'""" 

    messages.append(
                    {
                        "role": "system", 
                        "content": system_instructions
                    })

    prompt_template = """{question} \n\nCONTEXT: {search_results}"""
    messages.append(
                    {
                        "role": "user", 
                        "content": prompt_template.format(question=question, search_results=search_results)
                    })

    # Openai response generation
    response = client.chat.completions.create(
                model = "gpt-4o-2024-05-13",
                messages = messages,
                temperature = 0.1,
                max_tokens = 2048,
                top_p = 1.0)
    return response.choices[0].message.content

# Usage

In [165]:
start_time = "2024-08-28T00:00:00Z"
end_time = "2024-08-28T21:00:00Z"

In [177]:
category=["Yes", "No"]

In [179]:
create_answer("What are Nvidia's biggest concerns? Collect things that repeat in CONTEXT. Write 3 paragraphs.",start_time, end_time, category)

"Nvidia's biggest concerns revolve around several key areas that are frequently mentioned in the context of their business and market performance. One of the primary concerns is the high expectations surrounding their earnings reports. The anticipation and scrutiny from investors and analysts create significant pressure on Nvidia to consistently deliver strong financial results. This is evident from the numerous mentions of their earnings and the market's reaction to their financial performance. The company's ability to meet or exceed these expectations is crucial, as any disappointment could lead to a sharp decline in their stock price and investor confidence.\n\nAnother major concern for Nvidia is the competitive landscape in the technology and semiconductor industry. The rapid advancements in artificial intelligence (AI) and other cutting-edge technologies mean that Nvidia must continuously innovate to maintain its market leadership. The demand for AI chips and other advanced techno

In [181]:
category=["Yes"]

In [183]:
create_answer("What are Nvidia's biggest concerns? Collect things that repeat in CONTEXT. Write 3 paragraphs.",start_time, end_time, category)

"Nvidia's biggest concerns revolve primarily around meeting the high market expectations set by the exponential growth in demand for AI and advanced technology chips. The market has placed significant pressure on Nvidia to deliver strong financial results, and any deviation from these expectations could lead to negative consequences. This is evident from the anticipation surrounding their earnings reports and the potential market reactions if Nvidia fails to meet these high expectations.\n\nAnother major concern for Nvidia is overcoming obstacles that could impact their earnings. Analysts and investors are closely watching for any signs of challenges that could hinder Nvidia's performance. These obstacles could range from supply chain issues to increased competition in the semiconductor industry. The focus on Nvidia's ability to navigate these challenges is critical, as any misstep could affect their market position and investor confidence.\n\nLastly, the global market's attention on N

In [185]:
category=["Yes", "No"]

In [187]:
create_answer("What does the CONTEXT: say, will Nvidia stock go up or down? Collect things that repeat in CONTEXT. Write 3 paragraphs.",start_time, end_time, category)

'The CONTEXT does not provide a definitive answer on whether Nvidia\'s stock will go up or down. However, several recurring themes and sentiments can be observed. Many sources express anticipation and uncertainty regarding Nvidia\'s upcoming earnings report. There is a general consensus that the earnings report will have a significant impact on the stock market, with some predicting a potential for either a substantial rise or a notable decline in Nvidia\'s stock price. This uncertainty is reflected in the mixed expectations of analysts and investors.\n\nAnother common theme is the importance of Nvidia\'s earnings report not just for the company itself, but for the broader market. Many comments suggest that Nvidia\'s performance could influence the overall direction of the stock market, particularly in the tech sector. This is due to Nvidia\'s significant role in the market and its recent trends in AI and semiconductor technology. The anticipation is so high that some sources describe 

In [189]:
category=["Yes"]

In [191]:
create_answer("What does the CONTEXT: say, will Nvidia stock go up or down? Collect things that repeat in CONTEXT. Write 3 paragraphs.",start_time, end_time, category)

'The CONTEXT reveals a significant amount of speculation and anticipation surrounding Nvidia\'s stock performance, particularly in relation to its upcoming earnings report. Many sources suggest that Nvidia\'s earnings will have a substantial impact on the stock market, with some predicting a potential for record share prices. However, there is also a notable amount of uncertainty, with opinions divided on whether the stock will rise or fall. This uncertainty is reflected in the sentiment that Nvidia\'s performance could either lead to a market rally or a significant downturn.\n\nSeveral sources highlight the importance of Nvidia\'s earnings report, noting that it could be a "make or break" moment for the market. The anticipation is not just about the earnings themselves but also about the guidance and future outlook that Nvidia will provide. Some analysts and traders are preparing for significant volatility, with expectations ranging from a 50% increase to a 30% decrease in stock price

In [193]:
start_time = "2024-08-28T21:00:00Z"
end_time = "2024-08-28T23:59:00Z"

In [195]:
category=["Yes", "No"]

In [197]:
create_answer("What was the impact of earning result? Collect things that repeat in CONTEXT. Write 3 paragraphs.",start_time, end_time, category)

'The recent earnings report from NVIDIA has had a significant impact on the market, with mixed reactions observed across different sectors. Despite the positive earnings results, which exceeded expectations, the overall market response was paradoxical. For instance, while NVIDIA\'s performance was strong, the Dow Jones Industrial Average experienced a decline. This contradictory reaction highlights the complexity of market dynamics and suggests that investor expectations and broader economic factors play crucial roles in shaping market responses to earnings reports.\n\nSeveral commentators noted that despite the strong earnings, NVIDIA\'s stock price did not see the anticipated rise. This phenomenon has been observed in previous earnings reports as well, where positive results did not translate into immediate stock price increases. Some analysts suggest that this could be due to the high expectations already priced into the stock, leading to a "sell the news" scenario where investors t

In [199]:
category=["Yes"]

In [201]:
create_answer("What was the impact of earning result? Collect things that repeat in CONTEXT. Write 3 paragraphs.",start_time, end_time, category)

"The recent earnings report from NVIDIA had a notable impact on the financial markets, showcasing a paradox where the company's positive earnings did not translate into an immediate rise in the stock market. Despite the favorable earnings report, the Dow Jones Industrial Average experienced a decline. This unexpected reaction highlights the complexity of market dynamics and suggests that investors may have had other concerns or were engaging in profit-taking activities. The situation presents a challenge for strategists and analysts who must interpret these mixed signals to guide their investment decisions.\n\nIn the context of the Japanese stock market, the reaction to NVIDIA's earnings report is anticipated to be cautious. The Tokyo Stock Exchange is expected to exhibit a weak trend following the initial decline. This cautious sentiment reflects the interconnectedness of global markets and the influence of major tech companies like NVIDIA on investor behavior worldwide. Analysts and 