### Tree Building

In [1]:
# Read the raw markdown of the three investor reports into module-level strings.
# The paths are relative, so they resolve against the notebook's working directory.
# `content_1210` and the others are consumed by parse_markdown_to_tree in later cells.
with open('../Data/Investor Reports/2011.md', 'r', encoding='utf-8') as file:
    content_11 = file.read()

with open('../Data/Investor Reports/2012-10.md', 'r', encoding='utf-8') as file:
    content_1210 = file.read()

with open('../Data/Investor Reports/2013-04.md', 'r', encoding='utf-8') as file:
    content_1304 = file.read()

In [2]:
import re

def count_words(s):
    """Return how many word tokens the string ``s`` contains.

    A token is whatever the regular expression below matches: a run of
    word characters (letters, digits, underscore) bounded on both sides
    by word boundaries. Punctuation therefore splits tokens, so
    "80%+" counts as one word and "4,000" as two.
    """
    return sum(1 for _ in re.finditer(r'\b\w+\b', s))

# Sample text (two paragraphs of report prose) used to sanity-check count_words.
node="""
Not many new users this month (+700) but keep in mind growth is still organic as we have not done our PR push (very few people still know about us). In terms of early virality metrics, there were 300 virtual users added (i.e. recipients of Photo Mails who are not Everpix users yet).

We have almost 500 subscribed users now (80%+ on yearly subscriptions) out of about 4,000 eligible users. Subscriptions gross income for the past 2 months is as follow:
"""

# Last expression of the cell: the word count is shown as the cell output.
count_words(node)

83

In [3]:
import re

def parse_markdown_to_tree(markdown):
    """Parse setext-style markdown into ``{section: {subsection: [lines]}}``.

    A line made only of ``=`` marks the line before it as a section title;
    a line made only of ``-`` marks the line before it as a subsection
    title. Every other line is collected, unstripped, under the currently
    open subsection.

    Args:
        markdown: The markdown document as a single string.

    Returns:
        A dict mapping each (stripped) section title to a dict that maps
        each (stripped) subsection title to the list of raw content lines
        that follow it.

    Notes:
        * Lines that appear before the first subsection of a section are
          not stored anywhere.
        * Lines under a subsection whose title is empty are not stored
          either (the title is falsy, so nothing is appended).
        * A subsection header that appears before any section header is
          filed under an implicit section whose title is ``''``.
        * A repeated section title replaces the earlier section.
    """
    lines = markdown.strip().split('\n')

    tree = {}
    current_section = None
    current_subsection = None
    # Start with an empty title so an underline on the very first line does
    # not reference an unassigned name.
    previous_line = ''

    for line in lines:
        stripped = line.strip()
        is_section = re.match(r'^=+$', stripped) is not None
        is_subsection = not is_section and re.match(r'^-+$', stripped) is not None

        if is_section or is_subsection:
            # A header's title is only recognised once its underline is
            # seen, by which time the title line has already been appended
            # to the open subsection. Remove it again so titles do not leak
            # into the previous subsection's content.
            if current_section is not None and current_subsection is not None:
                open_content = tree[current_section][current_subsection]
                if open_content:
                    open_content.pop()

        if is_section:
            current_section = previous_line
            tree[current_section] = {}
            # The old subsection belongs to the previous section; keeping it
            # open would make the next content line index a missing key.
            current_subsection = None
        elif is_subsection:
            if current_section is None:
                # No section header seen yet: use an implicit untitled one.
                current_section = ''
                tree[current_section] = {}
            current_subsection = previous_line
            tree[current_section][current_subsection] = []
        else:
            # Normal content line.
            if current_subsection:
                tree[current_section][current_subsection].append(line)
            previous_line = stripped

    return tree

def merge_and_split_tree_with_punctuation(tree):
    """Turn each subsection's raw lines into paragraph-sized text chunks.

    For every ``tree[section][subsection]`` list of lines, three passes run:

    1. ``merge_nodes``: consecutive lines starting with ``'  *'`` are joined
       into one node, followed by an empty separator node.
    2. ``merge_between_empty``: lines between empty separator nodes are
       joined into one node; the empty separators are kept.
    3. ``split_long_nodes``: nodes with more than 256 words are cut into
       pieces of at most 256 words each.

    Args:
        tree: ``{section: {subsection: [str, ...]}}`` as produced by
            ``parse_markdown_to_tree``.

    Returns:
        The same ``tree`` object. It is modified in place: every
        subsection list is replaced by its processed list, so the caller's
        original reference sees the processed content too.
    """
    # Same notion of "word" as the notebook's count_words helper.
    word_pattern = re.compile(r'\b\w+\b')
    # Punctuation glued to the end of a word (".", ")", "," ...).
    trailing_punct = re.compile(r'[^\w\s]*')

    def merge_nodes(nodes):
        """Join runs of ``'  *'`` lines; emit an empty node after each run."""
        merged_nodes = []
        temp_node = []
        for node in nodes:
            if node.startswith('  *'):
                temp_node.append(node)
            else:
                if temp_node:
                    # The separator is chosen from the last line of the run only.
                    merged_nodes.append('. '.join(temp_node) if not temp_node[-1].endswith('. ') else ''.join(temp_node))
                    merged_nodes.append('')
                    temp_node = []
                merged_nodes.append(node)
        if temp_node:
            merged_nodes.append('. '.join(temp_node) if not temp_node[-1].endswith('. ') else ''.join(temp_node))
            merged_nodes.append('')
        return merged_nodes

    def merge_between_empty(nodes):
        """Join the nodes found between empty nodes; keep the empty nodes."""
        cleaned_content = []
        temp_content = []
        for node in nodes:
            if node == '':
                if temp_content:
                    # The separator is chosen from the last line of the group only.
                    merged_node = ' '.join(temp_content) if temp_content[-1].endswith('. ') else '. '.join(temp_content)
                    cleaned_content.append(merged_node)
                    temp_content = []
                cleaned_content.append(node)
            else:
                temp_content.append(node)
        if temp_content:
            cleaned_content.append(' '.join(temp_content) if temp_content[-1].endswith('. ') else '. '.join(temp_content))
        return cleaned_content

    def split_long_nodes(nodes, max_words=256):
        """Cut nodes longer than ``max_words`` words at word boundaries."""
        split_nodes = []
        for node in nodes:
            matches = list(word_pattern.finditer(node))
            while len(matches) > max_words:
                # Cut after the max_words-th word, measured in words (the
                # limit is a word count, so slicing by characters would
                # produce far shorter chunks than intended).
                split_point = matches[max_words - 1].end()
                # Keep punctuation attached to that word in the same chunk
                # so the next chunk does not start with ". ".
                split_point = trailing_punct.match(node, split_point).end()
                split_nodes.append(node[:split_point].strip())
                node = node[split_point:].strip()
                matches = list(word_pattern.finditer(node))
            split_nodes.append(node)
        return split_nodes

    for section, subsections in tree.items():
        for subsection, content in subsections.items():
            # Step 1: merge runs of '  *' nodes.
            merged_content = merge_nodes(content)
            # Step 2: merge nodes between empty nodes.
            cleaned_content = merge_between_empty(merged_content)
            # Step 3: split long nodes.
            final_content = split_long_nodes(cleaned_content)
            # Replacing the value of an existing key is safe while iterating.
            tree[section][subsection] = final_content

    return tree



In [4]:
# Build the section/subsection tree for the October 2012 report, then merge and
# split its lines into text chunks.
# NOTE: merge_and_split_tree_with_punctuation rewrites the tree it receives and
# returns that same dict, so Tree1210 and TreeIndex1210 are one object.
Tree1210=parse_markdown_to_tree(content_1210)
TreeIndex1210=merge_and_split_tree_with_punctuation(Tree1210)

In [5]:
# Inspect the processed subsections of the October 2012 report.
# Tree1210 and TreeIndex1210 refer to the same dict (the merge step works in
# place), so `a` and `b` show identical, already-processed content.
a=Tree1210['Everpix October 2012 Report']
b=TreeIndex1210['Everpix October 2012 Report']
b

{'High Level': ['',
  '* Hired a full-time and on-site marketing / user growth consultant. * Actively searching for PR firm to handle our “real launch”. * Everpix 1.1 looking good for our “real launch”. * Windows Uploader just sent to first batch of external testers. * KPIs are defined and tracking started as of November 1st',
  ''],
 'Users and Subscriptions': ['',
  'Not many new users this month (+700) but keep in mind growth is still organic as we have not done our PR push (very few people still know about us). In terms of early virality metrics, there were 300 virtual users added (i.e. recipients of Photo Mails who are not Everpix users yet).',
  '',
  'We have almost 500 subscribed users now (80%+ on yearly subscriptions) out of about 4,000 eligible users. Subscriptions gross income for the past 2 months is as follow:',
  '',
  '* $6,121.88 in September. * $6,351.95 in October',
  '',
  'Again, keep in mind these numbers will not match what’s the Profit & Loss statement as our pa

In [6]:
# Parse and post-process the 2011 and April 2013 reports the same way as the
# October 2012 report.
Tree11=parse_markdown_to_tree(content_11)
TreeIndex11=merge_and_split_tree_with_punctuation(Tree11)
Tree1304=parse_markdown_to_tree(content_1304)
TreeIndex1304=merge_and_split_tree_with_punctuation(Tree1304)
# Combine all reports into one {report title: {subsection: [chunks]}} dict.
# The Tree* dicts were already processed in place by the calls above, so this
# holds merged/split chunks. If two reports shared a title, the later one wins.
Tree={**Tree11, **Tree1210, **Tree1304}

In [7]:
from sentence_transformers import SentenceTransformer
# Two throwaway sentences used only to check that the model loads and encodes.
sentences = ["This is an example sentence", "Each sentence is converted"]

# `model` is reused by later cells to embed text chunks, summaries and queries.
# NOTE(review): the Qdrant collections below are created with size=384, which
# must equal this model's embedding width - confirm if the model is changed.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
# print(embeddings)


  from tqdm.autonotebook import tqdm, trange


In [47]:
# The OpenAI key is read from the local `Setting` module (not shown here).
from Setting import openAIKey
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Converts the chat model's reply into a plain string.
output_parser = StrOutputParser()

# temperature=0 is passed to the model for every call made through `llm`.
llm = ChatOpenAI(api_key=openAIKey.Key, model='gpt-3.5-turbo', temperature=0)

# Summarisation prompt: the text to summarise is supplied as {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Make a summary of the input text. The summary should be under 240 words"),
    ("user", "{input}")
])

# Summarisation pipeline used by the indexing loops: prompt -> model -> string.
# NOTE: the name `prompt` is rebound in the retrieval section further down;
# `chain` is built here, before that rebinding.
chain = prompt | llm | output_parser
input="""
Following TechCrunch Disrupt, users were allowed to register on the website and get on a waiting list. Everpix was quite early in its development at this time, and allowing anyone to create an account and import photos would not have been useful. Nevertheless, we had to start allowing some external users, if only to test the system, but also to follow up on a “public promise” made during Disrupt. Around mid-October, we started sending batches of invites from the waiting list and stopped around mid-November (since preparing for the Everpix Public Beta).

The primary goals for Everpix Private Alpha were:

1. Get a decent size set of testing users and their photos: this is required to simply test the infrastructure and software.
2. Demonstrate we can move a lot of photos from people’s computers seamlessly to the cloud: any service can import photos that are already online, so the difficulty is with offline photos.
3. Validate the bet that people would be willing to import *all* their photos to a third-party service: though other services, users have been “trained” to typically curate before importing, an assumption we need to break.

The full analytics as of December 31st are available in the attached document, but from a high-level perspective and during the 3 months period of Everpix Private Alpha, we have the following:

* The exact number is not available anymore but around 5,000 users registered on our waiting list and the vast majority were invited.
* 2,500 users created accounts and of those, 2,080 of them imported photos.
  This discrepancy could be explained by the fact some users realized they weren’t really interested or were using Windows for which we had no import solution.
* 11,100,000 photos were imported.
  67% from Mac computers (i.e offline photos) and the rest from online services.
* 732 users ran our Mac Uploader at least once.
  We didn't have an accurate way of measuring it, but an indirect estimate from looking at the update server logs shows that 500 to 600 users were running it daily.
* For users enjoying the full experience i.e. Mac users, the average number of photos imported was 10,000.

On the upside, we can say that goals #1, #2 and #3 were achieved - in the scope of a set of users mostly made of early adopters. We can’t guarantee such metrics will translate to mass market though. We also had 3 important features build and “market validated”:

* the Mac Uploader,
* duplicate photos detection independently of their resolution or compression artifacts,
* instant and seamless sharing from Everpix website or iPhone app of a user photo to Facebook, Twitter or email.

On the downside, we discovered some serious issues with our back end infrastructure built on top of Google App Engine:

* It wasn’t as responsive as it should have been and optimizing it was becoming difficult.
* We had some strange and very hard to track down data corruption creeping in.
* We realized it would be very hard to control costs going forward as can’t dive into the system and tune it precisely for our needs.

The difficulties with these 3 important issues were compounded by the fact Google App Engine is essentially a black box, which means a lot less maintenance, but also a lot more limitations to investigate and optimize things. Getting a Premium Account with extra support doesn’t alleviate these issues either.
"""
# chain.invoke({"input": input})

In [10]:
import qdrant_client
from qdrant_client.models import Distance, VectorParams, PointStruct

# Initialise the Qdrant client
client = qdrant_client.QdrantClient("http://localhost:6333")  # the Qdrant address may need adjusting


# First loop: process the third-level nodes (text chunks of each subsection).
# For every (report, subsection) pair this creates one collection holding the
# chunk embeddings, then asks the LLM for a summary of the whole subsection.
summary_for_titles = {}
for report, titles in Tree.items():
    for title, text_chunks in titles.items():
        # Collection name is report + title with no separator between them;
        # later cells rebuild the same name, so the format must stay in sync.
        collection_name = (report + title).replace(" ", "_")
        
        # create collection (dropping any existing one with the same name first)
        # NOTE(review): every exception is swallowed here, presumably to ignore
        # "collection not found"; other failures are hidden too - confirm.
        try:
            if client.get_collection(collection_name=collection_name):
                client.delete_collection(collection_name=collection_name)
        except Exception:
            pass 

        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=384, distance=Distance.DOT),
        )

        for idx, text_chunk in enumerate(text_chunks):
            # Empty separator chunks are skipped, so point ids may have gaps.
            if text_chunk:
                embedding = model.encode(text_chunk)
                # One upsert request per chunk.
                client.upsert(
                    collection_name=collection_name,
                    points=[
                        PointStruct(
                            id=idx, 
                            vector=embedding,
                            payload={"text": text_chunk}
                        )
                    ]
                )
        
        # All chunks (including the empty separators) are joined and summarised.
        text_chunk_infos = ": ".join(text_chunks)
        summary_for_title=chain.invoke({"input": text_chunk_infos})
        # Summaries are keyed by the collection name for lookup in the next loop.
        summary_for_titles.update({collection_name:summary_for_title})
    


In [11]:
summary_for_titles

{'Everpix_End_of_Year_2011_ReportEverpix_Proof_of_Concept': 'The Everpix project was initiated in April 2011, gained momentum over the summer, and was officially unveiled at TechCrunch Disrupt San Francisco in early September 2011.',
 'Everpix_End_of_Year_2011_ReportEverpix_Private_Alpha': "Everpix started allowing external users to register after TechCrunch Disrupt, with the primary goals of testing the system, moving photos to the cloud seamlessly, and validating users' willingness to import all their photos. During the Private Alpha phase, around 5,000 users registered, with 2,500 creating accounts and 2,080 importing photos. The majority of photos were imported from Mac computers. The Mac Uploader was used by 732 users, with an estimated 500 to 600 daily users. Everpix achieved its goals with early adopters, but faced challenges with Google App Engine's responsiveness, data corruption, and cost control. Despite building important features like the Mac Uploader and duplicate photo d

In [48]:
# Second loop: process the second-level nodes (subsection titles).
# For every report this creates one collection whose points are the
# "title: summary" texts of its subsections, then summarises the whole report.
summary_for_reports = {}
for report, titles in Tree.items():
    collection_name = report.replace(" ", "_")

    report_infos = []
    
    # Drop any existing collection with this name before recreating it.
    # NOTE(review): every exception is swallowed here, presumably to ignore
    # "collection not found"; other failures are hidden too - confirm.
    try:
            if client.get_collection(collection_name=collection_name):
                client.delete_collection(collection_name=collection_name)
    except Exception:
            pass 
    
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=384, distance=Distance.DOT),
    )
    for idx, (title, text_chunks) in enumerate(titles.items()):
        # Must match the collection name built in the chunk-level loop.
        children_collection_name=(report + title).replace(" ", "_")
        # Raises KeyError if the chunk-level loop has not produced this summary.
        report_info = title + ': ' + summary_for_titles[children_collection_name]
        embedding = model.encode(report_info)
        client.upsert(
            collection_name=collection_name,
            points=[
                PointStruct(
                    id=idx,  
                    vector=embedding,
                    # children_collection_name lets retrieval descend one level.
                    payload={"text": report_info, "children_collection_name":children_collection_name}
                )
            ]
        )
        report_infos.append(report_info)
    
    # The list is replaced by its joined string, which is then summarised.
    report_infos = "; ".join(report_infos)
    summary_for_report=chain.invoke({"input": report_infos})
    print(summary_for_report)
    summary_for_reports.update({report:summary_for_report})


Everpix, a photo management project, started in 2011 with a Proof of Concept and later moved to a Private Alpha phase, facing challenges with Google App Engine. The project then transitioned to a Public Beta with a new back-end infrastructure, experiencing unexpected user growth and infrastructure scaling issues. The company focused on product development and testing, aiming to validate its vision through user feedback. Everpix underwent a major change in its back-end infrastructure, moving to Tornado & MySQL on Amazon Web Services. The company faced challenges in recruiting the necessary talent and managing finances, with expenses exceeding the budgeted amount. The next goals for Everpix include transitioning to an initial release, launching an Android client and Windows uploader, and introducing premium subscriptions. The company aims to secure a Series A funding round in the upcoming summer. Despite facing challenges, Everpix has seen success with its Private Alpha launch and is foc

In [49]:
import re

def extract_and_format_date(text):
    """Extract the first "[month] year" occurrence from ``text`` as ``YYYYMM``.

    The year is the first standalone four-digit number (digits that are part
    of a longer number are ignored). If the word directly before it is an
    English month name, in any letter case, written in full or as a
    three-letter abbreviation (also "Sept"), and optionally followed by a
    comma or period, that month is used; otherwise the month is ``'00'``.

    Args:
        text: Free text such as a report title.

    Returns:
        A six-character string ``YYYYMM`` (``YYYY00`` when no month is
        recognised), or ``None`` when the text contains no four-digit year.
    """
    # Optional alphabetic word (month candidate) with optional "," or ".",
    # followed by whitespace, then a four-digit year not touching other digits.
    pattern = r'(?:([A-Za-z]+)[.,]?\s+)?(?<!\d)(\d{4})(?!\d)'

    # Month lookup, keyed by lower-cased full name and three-letter abbreviation.
    month_names = [
        'January', 'February', 'March', 'April',
        'May', 'June', 'July', 'August',
        'September', 'October', 'November', 'December'
    ]
    months = {}
    for number, name in enumerate(month_names, start=1):
        code = f"{number:02d}"
        months[name.lower()] = code
        months[name[:3].lower()] = code
    months['sept'] = '09'

    match = re.search(pattern, text)

    if match is None:
        return None  # no four-digit year anywhere in the text

    month_word = match.group(1)
    year = match.group(2)

    # A preceding word that is not a month (e.g. "Year 2011") maps to '00'.
    month = months.get(month_word.lower(), '00') if month_word else '00'

    return f"{year}{month}"

# Exercise the function on one title without a month and one with a month.
texts = [
    'Everpix End of Year 2011 Report',
    'Everpix January 2012 Report'
]

for text in texts:
    print(extract_and_format_date(text))


201100
201201


In [50]:
# Top-level collection: one point per report, holding the report summary.
collection_name = 'EverpixInvestorReport'

# Drop any existing collection with this name before recreating it.
# NOTE(review): every exception is swallowed here, presumably to ignore
# "collection not found"; other failures are hidden too - confirm.
try:
    if client.get_collection(collection_name=collection_name):
        client.delete_collection(collection_name=collection_name)
except Exception:
    pass 

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=384, distance=Distance.DOT),
)

for idx, (report, summary_for_report) in enumerate(summary_for_reports.items()):  
    # Name of the per-report collection built in the title-level loop.
    children_collection_name=report.replace(" ", "_")
    # The loop variable is rebound to "title: summary"; the dict is not changed.
    summary_for_report = report + ': ' + summary_for_report
    embedding = model.encode(summary_for_report)
    client.upsert(
        collection_name=collection_name,
        points=[
            PointStruct(
                id=idx,  
                vector=embedding,
                # "time" is the YYYYMM string parsed from the report title (None
                # if the title has no year); retrieval filters on it by exact match.
                payload={"text": summary_for_report, "time": extract_and_format_date(report), "children_collection_name":children_collection_name}
            )
        ]
    )

### Tree Retrieval

In [51]:
# Example question; it names a month and year, which the next cells use to pick the report.
query="How many new users are there in October, 2012?"

In [52]:
# NOTE: this rebinds the global name `prompt`, which an earlier cell used for
# the summarisation prompt.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Find the year and month from the input text, return it in YYYYMM format. If the month is not found, then return YYYY00"),
    ("user", "{input}")
])

# Pipeline that asks the LLM for the query's date as a string.
get_time_chain = prompt | llm | output_parser
# `time` is the model's raw reply. It is later used as an exact-match filter
# value, so any reply that is not exactly a stored YYYYMM string matches nothing.
time=get_time_chain.invoke({"input":query})
time

'201210'

In [53]:
from qdrant_client import models

# Embed the question once; the same vector is reused at every tree level.
embedding = model.encode(query)
collection_name='EverpixInvestorReport'

# Level 1: search the report summaries, restricted to points whose "time"
# payload equals the YYYYMM string extracted from the query.
result_report=client.search(
    collection_name=f"{collection_name}",
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="time",
                match=models.MatchValue(
                    value=time
                ),
            )
        ]
    ),
    search_params=models.SearchParams(hnsw_ef=128, exact=False),
    query_vector=embedding,
    limit=3,
)

In [54]:
# Only the best-scoring report is followed; indexing [0] raises IndexError if
# the filtered search returned nothing.
collection_name_report=result_report[0].payload['children_collection_name']
print(collection_name_report)
report_summary=result_report[0].payload['text']
print(report_summary)

Everpix_October_2012_Report
Everpix October 2012 Report: Everpix is preparing for a "real launch" by hiring a marketing/user growth consultant and seeking a PR firm. The company has seen an increase in new users and subscriptions, generating revenue from subscriptions. They aim to reduce infrastructure costs and have identified opportunities for improvement. Despite challenges in finding a suitable candidate for user growth and marketing, they have hired a temporary contractor to kickstart these efforts. Key Performance Indicators have been defined, and tracking has begun. The company is finalizing product messaging and selecting a PR firm for the launch. They have introduced new features like searching photos by likeness and are releasing new iPhone and iPad apps. The Windows Uploader has been sent to beta testers, with plans for future improvements. The company is eager to launch but faces delays due to the PR firm selection process. Contingency plans are being developed to ensure a 

In [55]:
# Level 2: search the subsection summaries of the chosen report (no filter).
result_title=client.search(
    collection_name=f"{collection_name_report}",
    search_params=models.SearchParams(hnsw_ef=128, exact=False),
    query_vector=embedding,
    limit=3,
)

In [56]:
# Only the best-scoring subsection is followed; [0] raises IndexError on an empty result.
collection_name_title=result_title[0].payload['children_collection_name']
print(collection_name_title)
title_summary=result_title[0].payload['text']
print(title_summary)

Everpix_October_2012_ReportUsers_and_Subscriptions
Users and Subscriptions: The company has seen a slight increase in new users this month, with 300 virtual users added. They currently have almost 500 subscribed users, with 80% on yearly subscriptions. The subscriptions have generated $6,121.88 in September and $6,351.95 in October. Infrastructure costs for October were $8,438, lower than in September due to not running Everpix Beta. The company has identified opportunities to reduce infrastructure costs further in the future. It is noted that the numbers may not match the Profit & Loss statement due to payment processing delays and do not include iTunes Store subscriptions. The company has not yet done a PR push, so growth is still organic and many people are not aware of the service.


In [57]:
# Level 3: search the raw text chunks of the chosen subsection.
result_text_chunk=client.search(
    collection_name=f"{collection_name_title}",
    search_params=models.SearchParams(hnsw_ef=128, exact=False),
    query_vector=embedding,
    limit=3,
)

In [58]:
# The best-scoring chunk is the retrieved passage for the question.
text_chunk=result_text_chunk[0].payload['text']
print(text_chunk)

Not many new users this month (+700) but keep in mind growth is still organic as we have not done our PR push (very few people still know about us). In terms of early virality metrics, there were 300 virtual users added (i.e. recipients of Photo Mails who are not Everpix users yet).


In [59]:
# Second example question, for the same report month.
query="What's the progress of Everpix on Back End in October, 2012?"

In [60]:
# Full three-level retrieval for the current `query`, repeating the steps of
# the preceding cells in one place. Requires `get_time_chain`, `model`,
# `client` and `models` from earlier cells.
time=get_time_chain.invoke({"input":query})

embedding = model.encode(query)
collection_name='EverpixInvestorReport'

# Level 1: report summaries, filtered by exact match on the "time" payload.
result_report=client.search(
    collection_name=f"{collection_name}",
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="time",
                match=models.MatchValue(
                    value=time
                ),
            )
        ]
    ),
    search_params=models.SearchParams(hnsw_ef=128, exact=False),
    query_vector=embedding,
    limit=3,
)
print(result_report)
# Only the top hit is followed at each level; [0] raises IndexError if a
# search returns nothing.
collection_name_report=result_report[0].payload['children_collection_name']
report_summary=result_report[0].payload['text']

# Level 2: subsection summaries of the chosen report.
result_title=client.search(
    collection_name=f"{collection_name_report}",
    search_params=models.SearchParams(hnsw_ef=128, exact=False),
    query_vector=embedding,
    limit=3,
)
print(result_title)
collection_name_title=result_title[0].payload['children_collection_name']
title_summary=result_title[0].payload['text']

# Level 3: raw text chunks of the chosen subsection.
result_text_chunk=client.search(
    collection_name=f"{collection_name_title}",
    search_params=models.SearchParams(hnsw_ef=128, exact=False),
    query_vector=embedding,
    limit=3,
)

text_chunk=result_text_chunk[0].payload['text']
print(text_chunk)

[ScoredPoint(id=1, version=1, score=0.60898745, payload={'children_collection_name': 'Everpix_October_2012_Report', 'text': 'Everpix October 2012 Report: Everpix is preparing for a "real launch" by hiring a marketing/user growth consultant and seeking a PR firm. The company has seen an increase in new users and subscriptions, generating revenue from subscriptions. They aim to reduce infrastructure costs and have identified opportunities for improvement. Despite challenges in finding a suitable candidate for user growth and marketing, they have hired a temporary contractor to kickstart these efforts. Key Performance Indicators have been defined, and tracking has begun. The company is finalizing product messaging and selecting a PR firm for the launch. They have introduced new features like searching photos by likeness and are releasing new iPhone and iPad apps. The Windows Uploader has been sent to beta testers, with plans for future improvements. The company is eager to launch but face

In [None]:
[
    ScoredPoint(
        id=1,
        version=1,
        score=0.60898745,
        payload={
            "children_collection_name": "Everpix_October_2012_Report",
            "text": 'Everpix October 2012 Report: Everpix is preparing for a "real launch" by hiring a marketing/user growth consultant and seeking a PR firm. The company has seen an increase in new users and subscriptions, generating revenue from subscriptions. They aim to reduce infrastructure costs and have identified opportunities for improvement. Despite challenges in finding a suitable candidate for user growth and marketing, they have hired a temporary contractor to kickstart these efforts. Key Performance Indicators have been defined, and tracking has begun. The company is finalizing product messaging and selecting a PR firm for the launch. They have introduced new features like searching photos by likeness and are releasing new iPhone and iPad apps. The Windows Uploader has been sent to beta testers, with plans for future improvements. The company is eager to launch but faces delays due to the PR firm selection process. Contingency plans are being developed to ensure a successful launch.',
            "time": "201210",
        },
        vector=None,
        shard_key=None,
        order_value=None,
    )
]
[
    ScoredPoint(
        id=0,
        version=0,
        score=0.656795,
        payload={
            "children_collection_name": "Everpix_October_2012_ReportHigh_Level",
            "text": 'High Level: The company has hired a full-time on-site marketing/user growth consultant and is actively looking for a PR firm for their upcoming "real launch." Everpix 1.1 is in good shape for the launch, and the Windows Uploader has been sent to the first batch of external testers. Key Performance Indicators (KPIs) have been defined, and tracking has begun as of November 1st.',
        },
        vector=None,
        shard_key=None,
        order_value=None,
    ),
    ScoredPoint(
        id=4,
        version=4,
        score=0.6349735,
        payload={
            "children_collection_name": "Everpix_October_2012_ReportReal_Launch_Update",
            "text": "Real Launch Update: The text discusses the readiness of Everpix for launch, highlighting the unique features that differentiate it from other platforms like Flickr or SmugMug. The main challenge faced currently is finalizing the product messaging and selecting a suitable PR firm for the launch. Despite considering multiple PR agencies, finding one that truly understands the needs of early-stage startups like Everpix has proven difficult. The company is eager to launch soon as they offer a comprehensive photo solution not available elsewhere. However, the lack of a chosen PR agency is delaying the decision on an official launch date. Contingency plans are being developed to ensure that the launch opportunity is not missed.",
        },
        vector=None,
        shard_key=None,
        order_value=None,
    ),
    ScoredPoint(
        id=2,
        version=2,
        score=0.51912594,
        payload={
            "children_collection_name": "Everpix_October_2012_ReportUser_Growth_&_Marketing",
            "text": "User Growth & Marketing: After an extensive 45-day search using multiple recruiters and online job boards, Everpix was unable to find an ideal candidate with the required experience in early-stage startups and the photo space industry for driving user growth and marketing. Despite reviewing over 100 resumes and conducting multiple on-site interviews, the search was unsuccessful. The company, which has been primarily focused on product building, has decided to shift its focus to user growth and marketing. Recognizing the need to make progress in this area, Everpix has opted to hire a full-time marketing/PR/user growth contractor on-site for a temporary period to kickstart the necessary work, including establishing product messaging and KPIs. This decision was made to avoid further delays in moving forward with their growth and marketing strategies.",
        },
        vector=None,
        shard_key=None,
        order_value=None,
    ),
]

In [None]:
"""* Hired a full-time and on-site marketing / user growth consultant. 
* Actively searching for PR firm to handle our “real launch”. 
* Everpix 1.1 looking good for our “real launch”. 
* Windows Uploader just sent to first batch of external testers. 
* KPIs are defined and tracking started as of November 1st
"""