In [3]:
from twspace_dl.api import API
from twspace_dl.cookies import load_cookies
from twspace_dl.twspace import Twspace
from twspace_dl.twspace_dl import TwspaceDL
import json
import time
import random
import os
import subprocess
import openai
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
def download_twitter_space_direct(space_url, cookie_file, output_format = "/tmp/spaces/%(creator_name)s-%(creator_id)s-%(title)s-%(id)s"):
    """Download a Twitter Space and return the path of the saved audio file.

    :param space_url: URL of the Space, resolved through ``Twspace.from_space_url``.
    :param cookie_file: Path of the cookie file handed to ``load_cookies``.
    :param output_format: Filename template passed to ``TwspaceDL``.
    :return: The downloader's filename with ``.m4a`` appended if that file
        exists once the download attempt is over, otherwise ``None``.
    """
    API.init_apis(load_cookies(cookie_file))

    downloader = TwspaceDL(Twspace.from_space_url(space_url), output_format)

    # The target path is derived up front from the downloader's own filename.
    audio_path = downloader.filename + ".m4a"

    try:
        downloader.download()
        downloader.embed_cover()
    except KeyboardInterrupt:
        # An interrupt is reported, not re-raised; whatever was written so far
        # is still returned below if the file exists.
        print("Download Interrupted by user")
    finally:
        downloader.cleanup()

    return audio_path if os.path.exists(audio_path) else None


In [None]:
def monitor_twitter_spaces(user_ids, cookies_path, interval=10, variance=5):
    """Poll a list of users forever, attempting a Space download for each one.

    :param user_ids: Iterable of user identifiers; each is appended to
        ``https://twitter.com/`` to build the URL handed to the downloader.
    :param cookies_path: Path of the cookie file passed to the downloader.
    :param interval: Base number of seconds to sleep after each attempt.
    :param variance: Maximum random jitter (in seconds) added to or subtracted
        from ``interval``; its sign is ignored.
    :return: ``None`` immediately when ``user_ids`` is empty; otherwise the
        function loops until an exception (e.g. ``KeyboardInterrupt``) escapes.
    """
    # Materialize once so a generator can be re-iterated on every pass.
    user_ids = list(user_ids)
    if not user_ids:
        # Nothing to poll: the loop below would spin without ever sleeping.
        return

    jitter_bound = abs(variance)

    while True:
        for user_id in user_ids:
            log_prefix = f"[{time.strftime('%m/%d/%y %H:%M:%S')}] [tw_space@{user_id}] "

            print(f"{log_prefix} [VRB] Start trying with cookies...")
            # NOTE(review): this is a profile URL, while the other cells pass
            # ``/i/spaces/<id>`` URLs — confirm the downloader accepts it.
            space_url = f"https://twitter.com/{user_id}"
            download_twitter_space_direct(space_url, cookies_path)

            # Clamp at zero: time.sleep() raises ValueError on negative values,
            # which happens whenever the jitter exceeds the base interval.
            sleep_time = max(0, interval + random.randint(-jitter_bound, jitter_bound))
            print(f"{log_prefix} [VRB] Sleep {sleep_time} sec.")
            time.sleep(sleep_time)

In [6]:
# One-off download of a single Space by URL, authenticating with cookies.txt.
download_twitter_space_direct("https://twitter.com/i/spaces/1mrxmyoNBmWxy?s=20", "cookies.txt")

[hls @ 0x12961aeb0] Changing ID3 metadata in HLS audio elementary stream is not implemented. Update your FFmpeg version to the newest one from Git. If the problem still occurs, it means that your file has a feature which has not been implemented.
size=   19968kB time=00:29:15.17 bitrate=  93.2kbits/s speed=11.3x    

Download Interrupted by user


In [7]:
def chunk_file_if_needed(file_path, max_size_mb=25):
    """Split an audio file into segments when it is larger than ``max_size_mb``.

    Files at or below the limit are returned untouched. Larger files are split
    with ffmpeg's segment muxer (stream copy, no re-encoding) into
    ``/tmp/spaces/<base name>/segments``.

    :param file_path: Path of the audio file to inspect.
    :param max_size_mb: Size threshold in MiB above which the file is split.
    :return: ``[file_path]`` if no split was needed, otherwise the segment
        paths sorted by segment number (i.e. playback order).
    :raises subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    if file_size_mb <= max_size_mb:
        return [file_path]

    # Scale the total duration by the size ratio. This assumes a roughly
    # constant bitrate, so an individual segment may still land slightly above
    # the limit. Never go below one second: a zero segment time is meaningless.
    duration = get_audio_duration(file_path)
    estimated_segment_time = max(1, int(duration * (max_size_mb / file_size_mb)))

    base_name = os.path.basename(file_path).rsplit('.', 1)[0]
    segments_dir = f"/tmp/spaces/{base_name}/segments"
    os.makedirs(segments_dir, exist_ok=True)

    # Drop segments left by an earlier run so they cannot leak into the result.
    for stale_name in os.listdir(segments_dir):
        stale_path = os.path.join(segments_dir, stale_name)
        if stale_name.startswith("segment") and os.path.isfile(stale_path):
            os.remove(stale_path)

    # With "-c copy" the codec is unchanged, so segments keep the source's
    # extension; forcing ".mp3" would put e.g. AAC audio into an MP3 container.
    # ".mp3" remains only as the fallback for extension-less inputs.
    extension = os.path.splitext(file_path)[1] or ".mp3"
    segment_pattern = os.path.join(segments_dir, f"segment%09d{extension}")

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact. Only audio streams are mapped: cover art is not
    # needed in the segments.
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", file_path,
            "-map", "0:a",
            "-f", "segment",
            "-segment_time", str(estimated_segment_time),
            "-c", "copy",
            segment_pattern,
        ],
        check=True,
    )

    # os.listdir() order is arbitrary; the zero-padded counter makes a plain
    # lexicographic sort equal to playback order.
    return [
        os.path.join(segments_dir, name)
        for name in sorted(os.listdir(segments_dir))
        if name.startswith("segment")
    ]

def get_audio_duration(file_path):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    :param file_path: Path of the media file to probe.
    :return: The container-level duration as a float.
    :raises subprocess.CalledProcessError: If ffprobe exits with a non-zero status.
    :raises ValueError: If ffprobe's output cannot be parsed as a number.
    """
    # An argument list (no shell) passes the path to ffprobe verbatim, so
    # spaces or shell metacharacters in the filename cannot break or hijack
    # the command.
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        file_path,
    ]
    result = subprocess.check_output(cmd)
    return float(result)

In [36]:
def transcribe_segments(segments, prompt):
    """
    Transcribe the audio segments using OpenAI's Whisper API.

    :param segments: List of paths to audio segment files, in playback order.
    :param prompt: A prompt to provide context for the transcription. It is sent
        with every request; a falsy value (``None``/``""``) omits it.
    :return: The transcriptions of all segments as one string, joined with
        single spaces in the order given. Empty string if ``segments`` is empty.
    """
    # Only forward the prompt when there is one, so the request is unchanged
    # for callers that pass None or an empty string.
    request_options = {"prompt": prompt} if prompt else {}

    texts = []
    for segment in segments:
        with open(segment, "rb") as audio_file:
            res = openai.Audio.transcribe("whisper-1", audio_file, **request_options)
        texts.append(str(res['text']).strip())

    # Join with a space so the last word of one segment does not fuse with the
    # first word of the next; skip segments that produced no text.
    return " ".join(text for text in texts if text)


In [48]:
from langchain.prompts import PromptTemplate

# Template for the first summarization pass; `{text}` is its only placeholder.
prompt_template = """
    You are an analytics professional at Lido Finance, a Liquid Staking protocol for Ethereum. You are given a transcript of Twitter Spaces in the crypto/web3 space that may or may not be related to Lido.
    Given the transcript, you are writing structured notes in markdown format. Think of your notes as key takeaways, TLDRs, and executive summaries.

    Your notes should be concise, detailed, and structured by topics. You know what information is especially important, and what is less important.
    
    Here is the transcript:
    {text}
    
    YOUR NOTES:"""

# Template for follow-up passes: `{existing_answer}` holds the notes so far and
# `{text}` the additional transcript context to fold into them.
refine_template = """
    You are an analytics professional at Lido Finance, a Liquid Staking protocol for Ethereum. You are given a transcript of Twitter Spaces in the crypto/web3 space that may or may not be related to Lido.
    Given the transcript, you are refining structured notes in markdown format. Think of your notes as key takeaways, TLDRs, and executive summaries.

    Here is the existing note:
    {existing_answer}
    
    We have the opportunity to refine the existing note (only if needed) with some more context below:
    -----
    {text}
    -----
    
    Given the new context, refine the original note to make it more complete.
    If the context isn't useful, return the original summary.

    Your notes should be concise, detailed, and structured by topics. You know what information is especially important, and what is less important.

    Use markdown formatting to its fullest to produce visually appealing, structured notes.
    """

# Module-level prompt objects; summarize_transcript reads both as globals.
prompt = PromptTemplate.from_template(prompt_template)
refine_prompt = PromptTemplate.from_template(refine_template)

In [49]:
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

def summarize_transcript(transcript):
    """Summarize a transcript into notes using the module-level prompts.

    The transcript is split into chunks of at most 40000 characters (500
    characters of overlap). A single chunk is summarized in one call with
    ``prompt``; several chunks are summarized iteratively, starting from
    ``prompt`` and folding in each further chunk with ``refine_prompt``.

    :param transcript: Full transcript text.
    :return: The summary text returned by the chain.
    :raises ValueError: If ``transcript`` is empty or whitespace only.
    """
    # Fail early with a clear message: an empty transcript would otherwise
    # produce zero documents and fail somewhere inside the chain.
    if not transcript or not transcript.strip():
        raise ValueError("Cannot summarize an empty transcript")

    doc = Document(page_content=transcript)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 40000, chunk_overlap = 500, length_function = len, is_separator_regex=False)
    docs = text_splitter.split_documents([doc])

    llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

    # Each chain type has its own keyword arguments: "stuff" takes its template
    # as `prompt`, while `question_prompt`/`refine_prompt` belong to "refine".
    if len(docs) == 1:
        chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
    else:
        chain = load_summarize_chain(llm, chain_type="refine", question_prompt=prompt, refine_prompt=refine_prompt)

    return chain.run(docs)

In [34]:
# Download the Space audio, then split it if it is above the size threshold.
# Despite its name, `transcript_location` holds the path of the audio file.
transcript_location = download_twitter_space_direct("https://twitter.com/i/spaces/1OdKrjPEbLzKX?s=20", "cookies.txt")
# The download helper returns None when no file was written; stop here with a
# clear message instead of failing later on os.path.getsize(None).
assert transcript_location is not None, "Space download produced no audio file"
chunks = chunk_file_if_needed(transcript_location)

[hls @ 0x15611e850] Changing ID3 metadata in HLS audio elementary stream is not implemented. Update your FFmpeg version to the newest one from Git. If the problem still occurs, it means that your file has a feature which has not been implemented.
size=   18241kB time=00:26:09.79 bitrate=  95.2kbits/s speed=62.4x    


In [37]:
# Transcribe the `chunks` produced by the download/chunk cell; the second
# argument is the context prompt for the transcription.
transcript = transcribe_segments(chunks, "Twitter Space about Crypto, Web3, Liquid Staking, and Lido Finance")

In [50]:
# Turn the full transcript into notes; requires the prompt-template cell to have run.
summary = summarize_transcript(transcript)

In [51]:
# Show the generated notes as plain text (the markdown is not rendered).
print(summary)

# Twitter Spaces Transcript Notes

## Introduction
- The Twitter Spaces session is co-hosted by Maverick Protocol and Lido Finance.
- The session aims to discuss building in the current market conditions and the connection between Maverick and Lido.
- The session features Ada Woo, a core contributor at Maverick, and Seraphim, a DeFi expansionist at Lido.

## Maverick Protocol Introduction
- Maverick Protocol is a DeFi infrastructure provider focused on increasing industry efficiency.
- Maverick AMM brings higher capital efficiency to DeFi, resulting in lower slippage and more value stability.
- Maverick DEX has ranked as a top three DEX on all chains by seven-day volume, supporting over 50% of the trading volume on L2 scaling solutions.
- Maverick Boosted Positions maximize incentive efficiency for protocols looking to bootstrap liquidity or defend the peg.
- Maverick AMM offers liquidity movement modes, including bullish, bearish, and both, allowing LPs to deploy strategies based on t