In [None]:
import os
import requests
import pandas as pd
import openai
import numpy as np
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.pdfgen import canvas
from openai.embeddings_utils import get_embedding, cosine_similarity
from scipy import spatial

class WebScraper:
    """Search Google, summarize every hit with OpenAI, and collect the summaries in a PDF.

    ``run`` drives the pipeline: query the Google Custom Search JSON API, save
    the hits to ``search_results.xlsx``, scrape and summarize each linked page
    into ``web_page_<index>.txt``, then write ``summaries.pdf``.
    """

    # Seconds to wait on any HTTP request; without a timeout `requests` can
    # block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key, cx, query, num_results):
        """Store the search settings.

        Args:
            api_key: Google API key sent as the ``key`` request parameter.
            cx: Custom Search engine id sent as the ``cx`` request parameter.
            query: Search string sent as ``q``.
            num_results: Number of hits requested, sent as ``num``.
        """
        self.api_key = api_key
        self.cx = cx
        self.query = query
        self.num_results = num_results
        self.results = []

    def search_google(self):
        """Call the Custom Search endpoint and return the decoded JSON response."""
        url = 'https://www.googleapis.com/customsearch/v1'
        params = {
            'key': self.api_key,
            'cx': self.cx,
            'q': self.query,
            'num': self.num_results,
        }
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        return response.json()

    def save_search_results_to_excel(self, filename='search_results.xlsx'):
        """Write ``self.results`` to ``filename`` as an Excel sheet without the index."""
        df = pd.DataFrame(self.results)
        df.to_excel(filename, index=False)

    def scrape_webpage(self, url):
        """Download ``url`` and return the text content of the parsed HTML."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()

    def save_scraped_data_to_file(self, index, content, file_format='xml'):
        """Write ``content`` to ``web_page_<index>.<file_format>`` as UTF-8."""
        with open(f"web_page_{index}.{file_format}", "w", encoding="utf-8") as f:
            f.write(content)

    def summarize(self, text, max_tokens = 4000):
        """Ask the OpenAI completion API for a brief summary of ``text``.

        Args:
            text: Text to summarize.
            max_tokens: Maximum number of whitespace-separated words of
                ``text`` placed in the prompt (words, not model tokens).

        Returns:
            The stripped text of the first completion choice.
        """
        # NOTE(review): the cut-off counts words, so a long input may still
        # exceed the model's real token limit -- confirm a safe default.
        tokens = text.split()
        truncated_text = " ".join(tokens[:max_tokens])

        prompt = f"Please provide a brief summary of the following text:\n{truncated_text}"
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            max_tokens=300,
            n=1,
            stop=None,
            temperature=0.5,
        )
        return response.choices[0].text.strip()

    def split_and_summarize(self, text, num_parts=5):
        """Summarize ``text`` in at most ``num_parts`` chunks, then summarize the summaries.

        Args:
            text: Text to summarize.
            num_parts: Upper bound on the number of character chunks.

        Returns:
            The summary of the joined chunk summaries, or ``""`` when ``text``
            is empty or whitespace only (no API call is made in that case).

        Raises:
            ValueError: If ``num_parts`` is smaller than 1.
        """
        if num_parts < 1:
            raise ValueError("num_parts must be at least 1")
        if not text.strip():
            return ""

        text_length = len(text)
        # Ceiling division: the chunk size is never 0 (which would make the
        # range step invalid for short texts) and the remainder does not spill
        # into an extra chunk.
        chunk_size = -(-text_length // num_parts)
        parts = [text[i:i + chunk_size] for i in range(0, text_length, chunk_size)]

        summaries = [self.summarize(part) for part in parts]

        aggregated = ' '.join(summaries)
        return self.summarize(aggregated)

    def save_summaries_to_pdf(self, df, filename='summaries.pdf', file_format='txt'):
        """Write one title / link / summary entry per row of ``df`` into a PDF.

        Args:
            df: Frame with ``title`` and ``link`` columns; its index selects
                the ``web_page_<index>.<file_format>`` file read for each row.
            filename: Path of the PDF to create.
            file_format: Extension of the per-page files. Defaults to ``txt``,
                which is the extension ``run`` writes.
        """
        c = canvas.Canvas(filename, pagesize=letter)
        textobject = c.beginText()
        textobject.setTextOrigin(50, 750)
        pending = False  # True while textobject holds lines not yet drawn

        for index, row in df.iterrows():
            with open(f"web_page_{index}.{file_format}", "r", encoding="utf-8") as f:
                content = f.read()
            # NOTE(review): when called from run() the file already holds a
            # summary, so this summarizes it a second time -- confirm intent.
            summary = self.summarize(content)

            # textLine/textLines advance to the next line after writing, so
            # each field starts on its own line.
            textobject.setFont("Helvetica-Bold", 14)
            textobject.setFillColor(colors.red)
            textobject.textLine(f"Title: {row['title']}")
            textobject.setFont("Helvetica", 12)
            textobject.setFillColor(colors.black)
            textobject.textLine(f"Link: {row['link']}")
            textobject.textLines(f"Summary: {summary}")
            textobject.textLine("")
            pending = True

            # Start a new page once the cursor drops into the lower part of the sheet.
            if textobject.getY() < 300:
                c.drawText(textobject)
                c.showPage()
                textobject = c.beginText()
                textobject.setTextOrigin(50, 750)
                pending = False

        # Flush whatever is left and save exactly once, after all rows. The
        # page-number check guarantees at least one page for an empty frame.
        if pending or c.getPageNumber() == 1:
            c.drawText(textobject)
            c.showPage()
        c.save()

    def run(self):
        """Run search, scrape-and-summarize, and PDF export in sequence."""
        # Step 1: search and persist the hits
        data = self.search_google()
        if 'items' not in data:
            print("No items in the response. Please check your query or Google Custom Search setup.")
            # Stop here: continuing would read a stale or missing spreadsheet.
            return

        self.results = [
            {
                'title': item['title'],
                'link': item['link'],
                # Not every hit carries a snippet; keep the row instead of raising KeyError.
                'snippet': item.get('snippet'),
                'date': item.get('pagemap', {}).get('metatags', [{}])[0].get('og:updated_time')
            }
            for item in data['items']
        ]
        self.save_search_results_to_excel()

        # Step 2: scrape every hit and store its summary as web_page_<index>.txt
        df = pd.read_excel('search_results.xlsx')
        for index, row in df.iterrows():
            content = self.scrape_webpage(row['link'])
            summarized_content = self.split_and_summarize(content)
            self.save_scraped_data_to_file(index, summarized_content, 'txt')

        # Step 3: collect the stored summaries in a PDF
        self.save_summaries_to_pdf(df)


if __name__ == "__main__":
    # Credentials are read from the environment so that no secret is stored in
    # the notebook. Set GOOGLE_API_KEY and GOOGLE_CSE_ID before running this
    # cell; a missing variable raises KeyError naming the variable.
    api_key = os.environ['GOOGLE_API_KEY']
    cx = os.environ['GOOGLE_CSE_ID']
    query = 'ChatGPT use cases 2023'
    num_results = 5
    scraper = WebScraper(api_key, cx, query, num_results)
    scraper.run()


In [None]:
    # def scrape_webpage(self, url):
    #     response = requests.get(url)
    #     soup = BeautifulSoup(response.text, 'html.parser')
    #     return soup.get_text()

    # def save_scraped_data_to_file(self, index, content, file_format='xml'):
    #     with open(f"web_page_{index}.{file_format}", "w", encoding="utf-8") as f:
    #         f.write(content)
    
    # def scrape_and_save(self):
    #     def extract_content(url):
    #         response = requests.get(url)
    #         soup = BeautifulSoup(response.content, 'html.parser')
    #         return soup.prettify(formatter='xml')

    #     def save_content_to_pdf(content, filename):
    #         # Create a subfolder named 'pdfs' if it doesn't exist
    #         if not os.path.exists(f'{self.query}_pdfs'):
    #             os.makedirs(f'{self.query}_pdfs')

    #         # Save the file in the 'pdfs' subfolder
    #         with open(os.path.join(f'{self.query}_pdfs', filename), "w", encoding="utf-8") as f:
    #             f.write(content)

    #     # Read the Excel file
    #     df = pd.read_excel(f'{self.query}.xlsx')

    #     # Scrape web pages and save them in pdf format
    #     for index, row in df.iterrows():
    #         content = extract_content(row['link'])
    #         save_content_to_pdf(content, f"web_page_{index}.pdf")
    
    ## Using beautiful soup and pdfkit
    # def scrape_and_save(self):
    #     def extract_content(url):
    #         headers = {
    #             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    #         }
    #         response = requests.get(url, headers=headers)
    #         soup = BeautifulSoup(response.content, 'html.parser')
    #         return soup.prettify(formatter='html')

        
    #     path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'  # replace with your actual path
    #     config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

    #     def save_content_to_pdf(content, filename):
    #         # Create a subfolder named 'pdfs' if it doesn't exist
    #         if not os.path.exists(f'{self.query}_pdfs'):
    #             os.makedirs(f'{self.query}_pdfs')

    #         # # Save the file in the 'pdfs' subfolder
    #         # with open(os.path.join(f'{self.query}_pdfs', filename), "w", encoding="utf-8") as f:
    #         #     f.write(content)
            
    #          # Define options for pdfkit
    #         options = {
    #             'no-stop-slow-scripts': True,
    #             'load-error-handling': 'ignore'
    #         }

    #         # Save the file in the 'pdfs' subfolder
    #         pdfkit.from_string(content, os.path.join(f'{self.query}_pdfs', filename), configuration=config, options=options)


    #     # Read the Excel file
    #     df = pd.read_excel(f'{self.query}.xlsx')

    #     # Scrape web pages and save them in pdf format
    #     for index, row in df.iterrows():
    #         content = extract_content(row['link'])
    #         save_content_to_pdf(content, f"web_page_{index}.pdf")
            
    ## Using beautiful soup and weasyprint        
    # def scrape_and_save(self):
        
    #     def extract_content(url):
    #         headers = {
    #             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    #         }
    #         response = requests.get(url, headers=headers)
    #         html = response.content.decode('utf-8')
    #         content = HTML(string=html)
    #         return content

    #     def save_content_to_pdf(content, filename):
    #         # Create a subfolder named 'pdfs' if it doesn't exist
    #         if not os.path.exists(f'{self.query}_pdfs'):
    #             os.makedirs(f'{self.query}_pdfs')

    #         # Save the file in the 'pdfs' subfolder
    #         HTML(string=content).write_pdf(os.path.join(f'{self.query}_pdfs', filename))

    #     # Read the Excel file
    #     df = pd.read_excel(f'{self.query}.xlsx')

    #     # Scrape web pages and save them in pdf format
    #     for index, row in df.iterrows():
    #         content = extract_content(row['link'])
    #         save_content_to_pdf(content, f"web_page_{index}.pdf")
    
    
    
    
    """
    def scrape_summarize_and_save(self):
        df = pd.read_excel(f'{self.query}.xlsx')
        
        for index, row in df.iterrows():
            url = row['link']
            title = row['title']
            if url not in self.visited_urls:
                content = self.extract_content(url)
                summarized_content = self.summarize_text(content)
                keywords = self.extract_keywords(summarized_content)
                structured_content = f"Article: {content}\n <h2>Keywords:</h2> {keywords}\n <h2>Summarized:</h2> {summarized_content}"
                self.save_content_to_pdf(title, url, structured_content, f"web_page_{index}.pdf")
                self.visited_urls.add(url)
            else:
                print(f"Skipping already visited URL: {url}")
                


    def extract_text_from_pdfs_in_folder(self):
        # ... existing code ...

        for pdf_file in pdf_files:
            # ... existing code ...

            # Extract keywords and add to PDF text
            keywords = self.extract_keywords(text)
            text = f"URL: {pdf_file}\nKeywords: {', '.join(keywords)}\n{text}"

            # Summarize text and add to PDF text
            summary = self.summarize_text(text)
            text = f"{text}\n\nSummarized by LLM:\n{summary}"

            # Calculate embedding and store
            embedding = self.get_embedding(summary)
            self.embeddings[pdf_file] = embedding

            # ... existing code ...

        return pdf_texts

    def visualize_text(self):
        # Convert embeddings to matrix
        matrix = np.vstack(self.embeddings.values())

        # Perform clustering
        kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
        kmeans.fit(matrix)

        # Create scatter plot of clusters
        plt.scatter(matrix[:, 0], matrix[:, 1], c=kmeans.labels_)
        plt.show()
        
        
            
    def extract_text_from_pdfs_in_folder(self):
        # Get a list of all PDF files in the folder
        pdf_files = [f for f in os.listdir(self.folder_path) if f.endswith('.pdf')]

        # Initialize a dictionary to hold the extracted text for each PDF
        pdf_texts = {}

        # Process each PDF file
        for pdf_file in pdf_files:
            # Open the PDF file
            with open(os.path.join(self.folder_path, pdf_file), 'rb') as f:
                # Create a PdfFileReader object
                pdf = PdfReader(f)

                # Initialize a list to hold the text from each page
                text = []

                # Extract the text from each page
                for page in pdf.pages:
                    text.append(page.extract_text())

                # Join the text from all pages into a single string
                pdf_texts[pdf_file] = ' '.join(text)

        return pdf_texts



    def save_summaries(self, summaries):
        pdf = fpdf()
        for title, summary in summaries.items():
            pdf.add_page()
            pdf.set_font("Arial", size = 15)
            pdf.cell(200, 10, txt = title, ln = True, align = 'C')
            pdf.multi_cell(200, 10, txt = summary)
        pdf.output("summarized_webpages.pdf")

# # Usage
# folder_path = 'path_to_your_folder'
# summarizer = PDFSummarizer(folder_path)
# texts = summarizer.extract_text_from_pdfs_in_folder()
# summaries = {title: summarizer.summarize_text(text) for title, text in texts.items()}
# summarizer.save_summaries(summaries)

    # def extract_text_from_pdfs_in_folder(self):
    #     # Get a list of all PDF files in the folder
    #     pdf_files = [f for f in os.listdir(self.folder_path) if f.endswith('.pdf')]

    #     # Initialize a dictionary to hold the extracted text for each PDF
    #     pdf_texts = {}

    #     # Process each PDF file
    #     for pdf_file in pdf_files:
    #         # Open the PDF file
    #         with open(os.path.join(self.folder_path, pdf_file), 'rb') as f:
    #             # Create a PdfFileReader object
    #             pdf = PdfFileReader(f)

    #             # Initialize a list to hold the text from each page
    #             text = []

    #             # Extract the text from each page
    #             for page_num in range(pdf.getNumPages()):
    #                 text.append(pdf.getPage(page_num).extractText())

    #             # Join the text from all pages into a single string
    #             pdf_texts[pdf_file] = ' '.join(text)

    #     return pdf_texts

    # def summarize_text(self, text):
    #     # Split the text into chunks of 3000 tokens each
    #     tokens = text.split()
    #     chunks = [' '.join(tokens[i:i + 3000]) for i in range(0, len(tokens), 3000)]

    #     summaries = []
    #     for chunk in chunks:
    #         # Summarize each chunk using the OpenAI API
    #         prompt = f"Please provide a bit detailed summary of the following text with keywords max 300 words:\n{chunk}"
    #         response = openai.Completion.create(
    #             model="text-davinci-003",
    #             prompt=prompt,
    #             max_tokens=300,
    #             n=1,
    #             stop=None,
    #             temperature=0.5,
    #         )
    #         summaries.append(response.choices[0].text.strip())

    #     # Aggregate the summaries
    #     aggregated_summary = ' '.join(summaries)

    #     # Summarize the aggregated summary
    #     prompt = f"The following text are aggregated summaries of a conference. Aggregate the text and produce an report article like Bloomberg article, max 1000 words with core keywords:\n{aggregated_summary}"
    #     response = openai.Completion.create(
    #         model="text-davinci-003",
    #         prompt=prompt,
    #         max_tokens=500,
    #         n=1,
    #         stop=None,
    #         temperature=0.5,
    #     )
    #     final_summary = response.choices[0].text.strip()

    #     return final_summary

    def extract_keywords(self, text):
        n_gram_range = (3, 3)
        stop_words = "english"

        # Extract candidate words/phrases
        count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
        candidates = count.get_feature_names()

        return candidates

    def get_embedding(self, text, model="text-embedding-ada-002"):
        text = text.replace("\n", " ")
        return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

    def process_pdfs(self):
        summaries = []
        keywords = []
        embeddings = []

        for filename in os.listdir(self.folder_path):
            if filename.endswith(".pdf"):
                pdf_path = os.path.join(self.folder_path, filename)
                text = self.extract_text_from_pdf(pdf_path)
                summary = self.summarize_text(text)
                keyword = self.extract_keywords(text)
                embedding = self.get_embedding(text)

                summaries.append(summary)
                keywords.append(keyword)
                embeddings.append(embedding)

        df = pd.DataFrame({
            'summary': summaries,
            'keywords': keywords,
            'embedding': embeddings
        })

        return df
"""


"""
    # # Audio Processing

    # def download_audio(self, url):
    #     # Download audio from a given URL. This could be a conference call, interview, etc.
    #     pass

    # def trim_audio(self, audio, start_time, end_time):
    #     # Trim the audio to only include content between the start and end times.
    #     pass

    # def transcribe_audio(self, audio):
    #     # Convert the audio to text using a transcription service or API.
    #     pass

    # Text Processing
    def process_transcription(self, transcription):
        # Process the transcription to extract or generate structured data.
        # This could involve parsing, tagging, or other NLP techniques.
        pass

    def summarize_transcription(self, transcription):
        # Extract the actual transcription text from the dictionary
        transcription_text = transcription.get('transcription', '')
        
        # Split the transcription into chunks of 3000 tokens each
        tokens = transcription_text.split()
        chunks = [' '.join(tokens[i:i + 3000]) for i in range(0, len(tokens), 3000)]

        summaries = []
        for chunk in chunks:
            # Summarize each chunk using the OpenAI API
            prompt = f"Please provide a summary of the following text, a part of extracted text by a web page, max 500 words:\n{chunk}"
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=prompt,
                max_tokens=500,
                n=1,
                stop=None,
                temperature=0.5,
            )
            summaries.append(response.choices[0].text.strip())
            

        # Aggregate the summaries
        aggregated_summary = ' '.join(summaries)

        # Summarize the aggregated summary
        prompt = f"Please aggregate summaries of the following texts and then produce final detail summary max 500 words:\n{aggregated_summary}"
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            max_tokens=500,
            n=1,
            stop=None,
            temperature=0.5,
        )
        final_summary = response.choices[0].text.strip()

        # Save the final summary to a text file
        with open(f'final_summary_{self.youtube_video.title}.txt', 'w') as f:
            f.write(final_summary)
            
"""


In [1]:
import os
import io
import time
import PyPDF2
import requests
from bs4 import BeautifulSoup
from datetime import datetime

import pandas as pd
import numpy as np
import nltk
import ast # for string to list conversion
# from scipy import spatial

import openai
# from openai.embeddings_utils import cosine_similarity

# from weasyprint import HTML
# from selenium import webdriver
import fpdf
import pdfkit
from PyPDF2 import PdfReader, PdfWriter
# import csv

from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture

from PIL import Image
import matplotlib.pyplot as plt
# from reportlab.lib.pagesizes import letter
# from reportlab.lib import colors
# from reportlab.pdfgen import canvas

# nltk.download('punkt')

class AutomatedContentGenerator:
    def __init__(self, api_key, cx, query, num_results, openai_api_key):
        # Initialize any necessary variables, data structures, or connections.
        # This could include a web scraper, data parser, text generator, etc.
        self.api_key = api_key
        self.cx = cx
        self.query = query
        self.num_results = num_results
        self.results = []
        openai.api_key = openai_api_key  # replace with your OpenAI API key
        self.folder_path = f'{self.query}_output'
        os.makedirs(self.folder_path, exist_ok=True)
        self.visited_urls = set()

    ## Step 1: Search Google
    # Web Scraping and Parsing
    def search_and_scrape_web(self):
        # Use a web scraper to find and retrieve relevant information from the web based on the query.
        # This could involve a search engine API or scraping specific sites.
        url = 'https://www.googleapis.com/customsearch/v1'
        params = {
            'key': self.api_key,
            'cx': self.cx,
            'q': self.query,
            'num': self.num_results,
        }
        response = requests.get(url, params=params)
        data = response.json()
        if 'items' in data:
            for item in data['items']:
                self.results.append({
                    'title': item['title'],
                    'link': item['link'],
                    'snippet': item['snippet'],
                    'date': item.get('pagemap', {}).get('metatags', [{}])[0].get('og:updated_time')
                })
    
    def save_search_results_to_excel(self, filename=None):
        """Write ``self.results`` to an Excel file without the index column.

        Args:
            filename: Target path; when ``None`` the file is named
                ``<query>.xlsx``.
        """
        target = f'{self.query}.xlsx' if filename is None else filename
        pd.DataFrame(self.results).to_excel(target, index=False)
        

    ## Step 2: Scrape web pages and save them in pdf format
    # Using Beautifulsoup and pdfkit
    def extract_content(self, url):
        """Fetch ``url`` and return its main content as an HTML fragment.

        The fragment is an ``<h1>`` holding the page title followed by the
        ``h2``-``h6`` and ``p`` elements found inside the main content
        container, cut to at most 3000 whitespace-separated words.

        Args:
            url: Address of the page to download.

        Returns:
            The HTML fragment, or ``""`` when no content container is found.
        """
        # Bound the wait so one unresponsive site cannot stall the whole crawl.
        response = requests.get(url, timeout=30)

        # Manually set the encoding to 'UTF-8'
        response.encoding = 'UTF-8'

        # Parse the HTML content of the webpage with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Use the inner markup of the first <h1>: interpolating the Tag itself
        # would nest a second <h1> element inside the heading built below.
        title = f"Title:\n{soup.h1.decode_contents()}" if soup.h1 else "No Title"

        # Prefer the <main> element; otherwise fall back to the first element
        # of any tag type that carries one of the common container classes.
        # (Searching <main> tags again here could never succeed.)
        article = soup.find('main')
        if article is None:
            possible_classes = ['main', 'main-content', 'article_main', 'container', 'head', 'headline', 'panel']
            for class_name in possible_classes:
                article = soup.find(class_=class_name)
                if article is not None:
                    break

        # If the main article content still cannot be found, return an empty string
        if article is None:
            print(f"Could not find main content in {url}")
            return ""

        # Extract the h2, h3, h4, h5, h6, and p tags
        headers_and_paragraphs = article.find_all(['h2', 'h3', 'h4', 'h5', 'h6', 'p'])

        # Join all tags into a single string, keeping their HTML markup
        processed_main_content = ''.join(str(tag) for tag in headers_and_paragraphs)

        # Title and the content
        processed_content = f"<h1>{title}</h1>" + processed_main_content

        # Limit the content to the first 3000 whitespace-separated words
        content = self.limit_content_by_tokens(processed_content, 3000)

        return content
    
    # Limit the content to the first max_tokens whitespace-separated words (a rough stand-in for model tokens)
    def limit_content_by_tokens(self, content, max_tokens):
        """Return ``content`` cut to its first ``max_tokens`` whitespace-separated words.

        Words are re-joined with single spaces, so runs of whitespace are
        collapsed even when nothing is cut.
        """
        words = content.split()
        kept = words[:max_tokens] if len(words) > max_tokens else words
        return ' '.join(kept)

    
    # # limit the content by tokens instead of words
    # def limit_content_by_tokens(self, content, max_tokens):
    #     tokens = []
    #     token_count = 0

    #     for word in content.split():
    #         word_tokens = len(word.split()) + 1  # Add 1 for the space that follows each word
    #         if token_count + word_tokens > max_tokens:
    #             break
    #         tokens.append(word)
    #         token_count += word_tokens

    #     return ' '.join(tokens)

    # save the content to pdf
    def save_content_to_pdf(self, url, content, filename):
        # Add the title and the link at the beginning of the content
        content_with_link = f"{content}\n\n <h3>URL:</h3>\n<p></p>\n<p><a href='{url}'>{url}</a></p>"

        # # Split the content into words
        # words = content_with_title_and_link.split()

        # # Limit the content to the first 4000 words
        # limited_content = ' '.join(words[:4000])

        if not os.path.exists(self.folder_path):
            os.makedirs(self.folder_path)
        options = {
            'no-stop-slow-scripts': True,
            'load-error-handling': 'ignore',
            'encoding': "UTF-8", ### This is important to solve quatation mark problem
            
        }
        path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
        config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
        
        # # Encode the HTML content as UTF-8 before passing it to pdfkit
        # content_with_keywords_link = content_with_keywords_link.encode('utf-8')
    
        pdfkit.from_string(content_with_link, os.path.join(self.folder_path, filename), configuration=config, options=options) 

    # Loop through the URLs and scrape the content    
    def scrape_and_save(self):
        df = pd.read_excel(f'{self.query}.xlsx')
           
        for index, row in df.iterrows():
            url = row['link']
            # title = row['title']
            if url not in self.visited_urls:
                content = self.extract_content(url)
                pdf_filename = f"web_page_{index}.pdf"
                self.save_content_to_pdf(url, content, pdf_filename)
                self.visited_urls.add(url)
            else:
                print(f"Skipping already visited URL: {url}")
                
            # Sleep for 1 second
            time.sleep(1)
                
                
    def extract_text_from_pdf(self, pdf_path):
        #pdf_file_obj = open(pdf_path, 'rb')
        pdf_reader = PdfReader(pdf_path)
        pdf_content = ''
        
        # for page_num in range(pdf_reader.numPages):
        #     page_obj = pdf_reader.getPage(page_num)
        #     pdf_content += page_obj.extractText()
        # pdf_file_obj.close()
        for page in pdf_reader.pages:
            pdf_content += page.extract_text()
        
        return pdf_content

    def save_sentences_to_csv(self, pdf_content):
        sentences = nltk.sent_tokenize(pdf_content)
        df_sentences = pd.DataFrame(sentences, columns=['text'])
        return df_sentences
    
    def get_embedding(self, text, model):
        text = text.replace("\n", " ")
        if model == 'default_model':
            # use default model to get embedding
            pass
        elif model == 'text-embedding-ada-002':
            # use 'text-embedding-ada-002' model to get embedding
            pass
        else:
            raise ValueError(f"Unknown model: {model}")
        return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']



    def get_embeddings(self, df_sentences):
        df_sentences['embeddings'] = df_sentences['text'].apply(lambda x: self.get_embedding(x, model='text-embedding-ada-002'))
        return df_sentences

    # def get_similarities(self, df_sentences):
    #     df_sentences['similarities'] = df_sentences['embeddings'].apply(lambda x: cosine_similarity(x, df_sentences['embeddings'].tolist()))
    #     return df_sentences
    
    def get_similarities(self, df_sentences):
        embeddings = np.stack(df_sentences['embeddings'].values)
        similarities = cs(embeddings)
        df_sentences['similarities'] = list(similarities)
        return df_sentences

    # # What algorithm should we use to cluster the sentences?
    # def create_clustering_image(self, df_sentences):
    #     matrix = np.vstack(df_sentences['embeddings'].values)
    #     n_clusters = 3
    #     kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    #     kmeans.fit(matrix)
    #     df_sentences['Cluster'] = kmeans.labels_
        
    #     # Ensure perplexity is less than the number of samples
    #     perplexity = min(15, len(df_sentences) - 1)
        
    #     tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, init="random", learning_rate=200)
    #     vis_dims2 = tsne.fit_transform(matrix)
    #     x = [x for x, y in vis_dims2]
    #     y = [y for x, y in vis_dims2]
    #     fig, ax = plt.subplots()
    #     for category, color in enumerate(["purple", "green", "red", "blue"]):
    #         xs = np.array(x)[df_sentences.Cluster == category]
    #         ys = np.array(y)[df_sentences.Cluster == category]
    #         ax.scatter(xs, ys, color=color, alpha=0.3)
    #         avg_x = xs.mean()
    #         avg_y = ys.mean()
    #         ax.scatter(avg_x, avg_y, marker="x", color=color, s=100)
    #     ax.set_title("Clusters identified visualized in language 2d using t-SNE")
    #     fig.canvas.draw()
        
    #     # Create a BytesIO object and save the figure to it in PNG format
    #     buf = io.BytesIO()
    #     fig.savefig(buf, format='png')
    #     buf.seek(0)

    #     # Close the figure
    #     plt.close(fig)

    #     return buf
    
    def create_clustering_image(self, df_sentences):
        if len(df_sentences) < 2:
            print("Not enough sentences to cluster")
            pass
        else:
            matrix = np.vstack(df_sentences['embeddings'].values)
            n_clusters = 3
            gmm = GaussianMixture(n_components=n_clusters, random_state=42)
            gmm.fit(matrix)
            df_sentences['Cluster'] = gmm.predict(matrix)
            
            # Ensure perplexity is less than the number of samples
            perplexity = min(15, len(df_sentences) - 1)
            
            tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, init="random", learning_rate=200)
            vis_dims2 = tsne.fit_transform(matrix)
            x = [x for x, y in vis_dims2]
            y = [y for x, y in vis_dims2]
            fig, ax = plt.subplots()
            for category, color in enumerate(["purple", "green", "red"]):
                xs = np.array(x)[df_sentences.Cluster == category]
                ys = np.array(y)[df_sentences.Cluster == category]
                if len(xs) > 0 and len(ys) > 0:  # Check that the arrays are not empty
                    ax.scatter(xs, ys, color=color, alpha=0.3)
                    avg_x = xs.mean()
                    avg_y = ys.mean()
                    ax.scatter(avg_x, avg_y, marker="x", color=color, s=100)
            ax.set_title("Clusters identified visualized in language 2d using t-SNE")
            fig.canvas.draw()
            
            # Create a BytesIO object and save the figure to it in PNG format
            buf = io.BytesIO()
            fig.savefig(buf, format='png')
            buf.seek(0)

            # Close the figure
            plt.close(fig)

            return buf

    # def add_image_to_pdf(self, pdf_path, image_buf):
    #     pdf_files = [f for f in os.listdir(self.folder_path) if f.endswith('.pdf')]
        
    #     pass
    
    # def process_openai_embeddings_csv(self):
    #     # Get a list of all PDF files in the folder
    #     pdf_files = [f for f in os.listdir(self.folder_path) if f.endswith('.pdf')]
    #     for pdf_file in pdf_files:
    #         pdf_path = os.path.join(self.folder_path, pdf_file)
    #         pdf_content = self.extract_text_from_pdf(pdf_path)
    #         df_sentences = self.save_sentences_to_csv(pdf_content)
    #         df_sentences = self.get_embeddings(df_sentences)
    #         df_sentences = self.get_similarities(df_sentences)
    #         df_sentences.to_csv(os.path.join(self.folder_path, f'{self.query}_openai.csv'), index=False)

    #         # Sleep for 1 second
    #         time.sleep(1)
    
    def process_pdfs_and_save_embeddings(self):
        """Embed the sentences of every PDF in ``self.folder_path`` and save them as CSV.

        For each ``*.pdf`` entry the text is extracted and passed through
        ``save_sentences_to_csv``, ``get_embeddings`` and ``get_similarities``
        in that order. The resulting frame is written next to the PDF as
        ``<pdf file name>_embeddings.csv`` (the ``.pdf`` suffix stays in the
        name). The loop pauses for one second after each file.
        """
        # os.listdir returns a snapshot, so the CSVs written below (into the
        # same folder) do not affect the iteration.
        for entry in os.listdir(self.folder_path):
            if not entry.endswith('.pdf'):
                continue

            text = self.extract_text_from_pdf(os.path.join(self.folder_path, entry))
            frame = self.save_sentences_to_csv(text)
            frame = self.get_embeddings(frame)
            frame = self.get_similarities(frame)

            target = os.path.join(self.folder_path, f'{entry}_embeddings.csv')
            frame.to_csv(target, index=False)

            # Pause for one second before the next file
            time.sleep(1)
            
    
    def create_clustering_images_from_saved_embeddings(self):
        """Render a cluster plot for each saved ``*_embeddings.csv`` file.

        Every matching CSV in ``self.folder_path`` is loaded, its
        ``embeddings`` column is parsed from text back into lists of floats,
        and the frame is handed to ``create_clustering_image``. When that
        returns a buffer, the image is written beside the CSV as
        ``<csv file name>_cluster.jpeg`` and the loop pauses for one second;
        a ``None`` result is skipped without writing anything.
        """
        def parse_embedding(raw):
            # Each embedding is stored in the CSV as its Python-literal string form.
            return [float(value) for value in ast.literal_eval(raw)]

        for name in os.listdir(self.folder_path):
            if not name.endswith('_embeddings.csv'):
                continue

            frame = pd.read_csv(os.path.join(self.folder_path, name))
            frame['embeddings'] = frame['embeddings'].apply(parse_embedding)

            buffer = self.create_clustering_image(frame)
            if buffer is None:
                continue

            picture = Image.open(buffer)
            # JPEG has no alpha channel, so RGBA images are converted to RGB first
            if picture.mode == 'RGBA':
                picture = picture.convert('RGB')
            picture.save(os.path.join(self.folder_path, f'{name}_cluster.jpeg'))

            # Pause for one second before the next file
            time.sleep(1)
            
    # def execute_create_clustering_image(self):
    #     # Load the sentences from the CSV file
    #     df_sentences = pd.read_csv(os.path.join(self.folder_path, f'{self.query}_openai.csv'))
        
    #     # Convert the embeddings from strings back to lists of floats
    #     df_sentences['embeddings'] = df_sentences['embeddings'].apply(ast.literal_eval).apply(lambda x: [float(i) for i in x])
        
    #     # Create the clustering image
    #     image_buf = self.create_clustering_image(df_sentences)
        
    #     # Convert the image buffer to a PIL image
    #     image = Image.open(image_buf)
            
    #     # Convert image to RGB if it's RGBA
    #     if image.mode == 'RGBA':
    #         image = image.convert('RGB')
            
    #     # Save the image as a JPEG file
    #     image.save(os.path.join(self.folder_path, f'{self.query}_clustering.jpeg'))

                        
    # def execute_create_clustering_image(self):
    #     # Load the sentences from the CSV file
    #     df_sentences = pd.read_csv(os.path.join(self.folder_path, f'{self.query}_openai.csv'))
        
    #     # Create the clustering image
    #     image_buf = self.create_clustering_image(df_sentences)
        
    #     # Convert the image buffer to a PIL image
    #     image = Image.open(image_buf)
            
    #     # Convert image to RGB if it's RGBA
    #     if image.mode == 'RGBA':
    #         image = image.convert('RGB')
            
    #     # Save the image as a JPEG file
    #     image.save(os.path.join(self.folder_path, f'{self.query}_clustering.jpeg'))
        
        
    # def process_saved_pdfs(self):
    #     # Get a list of all PDF files in the folder
    #     pdf_files = [f for f in os.listdir(self.folder_path) if f.endswith('.pdf')]

    #     for pdf_file in pdf_files:
    #         # Extract text from the PDF file
    #         pdf_content = self.extract_text_from_pdf(os.path.join(self.folder_path, pdf_file))

    #         # Split the content into sentences
    #         sentences = nltk.sent_tokenize(pdf_content)

    #         # Get embeddings for each sentence
    #         embeddings = [self.get_embedding(sentence) for sentence in sentences]

    #         # Create a DataFrame with the sentences and their embeddings
    #         df_sentences = pd.DataFrame({
    #             'index': range(len(sentences)),
    #             'text': sentences,
    #             'embeddings': embeddings
    #         })

    #         # Calculate similarities between the sentences
    #         df_sentences['similarities'] = df_sentences['embeddings'].apply(lambda x: cosine_similarity(x, embeddings))

    #         # Save the DataFrame to a CSV file
    #         df_sentences.to_csv(os.path.join(self.folder_path, f'{self.query}_openai.csv'), index=False)
            
    #         keywords = self.extract_keywords(pdf_content)
    #         summary = self.summarize_text(pdf_content)
    #         self.save_summary_and_keywords_to_pdf(url, row['title'], keywords, summary)


    
    # def get_embedding(self, pdf_content, filename):
        
    #     if not os.path.exists(f'{self.query}_pdfs'):
    #         os.makedirs(f'{self.query}_pdfs')
        
    #     # Split the content into sentences
    #     sentences = nltk.sent_tokenize(pdf_content)
    #     embeddings = []
    #     for sentence in sentences:
    #         sentence = sentence.replace("\n", " ")
    #         result = openai.Embedding.create(input=[sentence], model="text-embedding-ada-002")['data'][0]['embedding']
    #         embeddings.append(result)
    #     df = pd.DataFrame(embeddings)
    #     df['embedding'] = df.combined.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    #     df.to_excel(filename, index=False)
        
    #            
    def extract_keywords(self, pdf_content, max_words=2000):
        """Ask the OpenAI completion model for the main keywords of a text.

        Args:
            pdf_content: Text to extract keywords from.
            max_words: Maximum number of whitespace-separated words of
                ``pdf_content`` placed in the prompt. Longer texts are cut to
                this many words so that a whole document is no longer sent in
                a single request (word count only approximates model tokens).

        Returns:
            list[str]: The completion text, stripped and split on ``', '``.
        """
        words = pdf_content.split()
        if len(words) > max_words:
            # Rebuild the text only when truncation is needed, so short inputs
            # reach the model exactly as given.
            pdf_content = " ".join(words[:max_words])

        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=f"Extract main 10 keywords from the following text as #Keywords form:\n\n{pdf_content}\n\n tl;dr:",
            temperature=0.5,
            max_tokens=200,
            top_p=1.0,
            frequency_penalty=0.8,
            presence_penalty=0.0
        )
        keywords = response.choices[0].text.strip().split(', ')
        return keywords
    
    def summarize_text(self, pdf_content, chunk_size=2000, max_tokens=2000):
        """Summarize a text chunk by chunk with the OpenAI completion model.

        The text is split on whitespace into pieces of at most ``chunk_size``
        words. Each piece is summarized with its own request and the partial
        summaries are joined with single spaces.

        Args:
            pdf_content: Text to summarize.
            chunk_size: Number of whitespace-separated words sent per request.
            max_tokens: Completion-token limit passed to each request.

        Returns:
            str: The concatenated chunk summaries; ``""`` when the text
            contains no words (no request is made in that case).

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        # NOTE(review): chunks are measured in words, not model tokens. A full
        # 2000-word chunk plus a 2000-token completion may exceed the model's
        # context window; lower chunk_size / max_tokens if requests are rejected.
        tokens = pdf_content.split()
        chunks = [' '.join(tokens[i:i + chunk_size]) for i in range(0, len(tokens), chunk_size)]

        summaries = []
        for chunk in chunks:
            # Summarize each chunk using the OpenAI API
            prompt = f"Please provide a detailed summary of the following text with keywords max 500 words:\n{chunk}\n\n tl;dr:"
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=prompt,
                max_tokens=max_tokens,
                n=1,
                stop=None,
                temperature=0.5,
            )
            summaries.append(response.choices[0].text.strip())

        # Aggregate the per-chunk summaries into one string
        return ' '.join(summaries)
           
    # def summarize_text(self, pdf_content):
    #     # Split the text into chunks of 2000 tokens each
    #     tokens = pdf_content.split()
    #     # chunks = [' '.join(tokens[i:i + 1000]) for i in range(0, len(tokens), 1000)]

    #     # summaries = []
    #     # for chunk in chunks:
    #     #     # Summarize each chunk using the OpenAI API
    #     #     prompt = f"Please provide a bit detailed summary of the following text with keywords max 300 words:\n{chunk}\n\n tl;dr:"
    #     #     response = openai.Completion.create(
    #     #         model="text-davinci-003",
    #     #         prompt=prompt,
    #     #         max_tokens=1000,
    #     #         n=1,
    #     #         stop=None,
    #     #         temperature=0.5,
    #     #     )
    #     #     summaries.append(response.choices[0].text.strip())

    #     # # Aggregate the summaries
    #     # aggregated_summary = ' '.join(summaries)

    #     # Summarize the aggregated summary
    #     prompt = f"The following text is scrapped text of a webpage. Summarize the text and produce an summary ariticle like Bloomberg or Harvard Business Review, max 600 words with core keywords:\n{tokens}\n\n tl;dr:"
    #     response = openai.Completion.create(
    #         model="text-davinci-003",
    #         prompt=prompt,
    #         max_tokens=3000,
    #         n=1,
    #         stop=None,
    #         temperature=0.5,
    #     )
    #     summary = response.choices[0].text.strip()

    #     return summary
    
    # def save_summary_and_keywords_to_pdf(self, url, title, keywords, summary):
    #     content = f"<h1>{title}</h1>\n\n<h2>Keywords:</h2>\n<p>{', '.join(keywords)}</p>\n\n<h2>Summary:</h2>\n<p>{summary}</p>\n\n<h3>URL:</h3>\n<p><a href='{url}'>{url}</a></p>"
    #     if not os.path.exists(self.folder_path):
    #         os.makedirs(self.folder_path)
    #     options = {
    #         'no-stop-slow-scripts': True,
    #         'load-error-handling': 'ignore',
    #         'encoding': "UTF-8",
    #     }
    #     path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
    #     config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
    #     pdfkit.from_string(content, os.path.join(self.folder_path, 'output.pdf'), configuration=config, options=options) 


    def save_summary_and_keywords_to_pdf(self, url, title, keywords, summary):
        """Render one page summary to PDF and append it to the query's summary PDF.

        The summary is rendered with wkhtmltopdf (via pdfkit) to
        ``output.pdf`` inside ``self.folder_path``. If
        ``<query>_summary.pdf`` does not exist yet, ``output.pdf`` is renamed
        to it. Otherwise all pages of ``output.pdf`` are appended to the
        existing summary PDF and ``output.pdf`` is removed.

        Args:
            url: Source URL, shown as a link at the bottom of the page.
            title: Heading of the rendered page.
            keywords: Iterable of keyword strings, joined with ``', '``.
            summary: Summary text placed under the "Summary:" heading.

        NOTE(review): the wkhtmltopdf path is hard-coded for a Windows install.
        """
        content = f"<h1>{title}</h1>\n\n<h2>Keywords:</h2>\n<p>{', '.join(keywords)}</p>\n\n<h2>Summary:</h2>\n<p>{summary}</p>\n\n<h3>URL:</h3>\n<p><a href='{url}'>{url}</a></p>"
        os.makedirs(self.folder_path, exist_ok=True)
        options = {
            'no-stop-slow-scripts': True,
            'load-error-handling': 'ignore',
            'encoding': "UTF-8",
        }
        path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
        config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
        output_path = os.path.join(self.folder_path, 'output.pdf')
        pdfkit.from_string(content, output_path, configuration=config, options=options)

        pdf_filename = f"{self.query}_summary.pdf"
        pdf_path = os.path.join(self.folder_path, pdf_filename)
        if not os.path.exists(pdf_path):
            # First summary for this query: the rendered file becomes the summary PDF
            os.rename(output_path, pdf_path)
            return

        # Append to the existing summary PDF. The merged document goes to a
        # temporary file first, so the summary PDF is never truncated while it
        # is still being read, and both inputs are closed before it is replaced.
        writer = PdfWriter()
        merged_path = f"{pdf_path}.tmp"
        with open(pdf_path, "rb") as existing_file, open(output_path, "rb") as new_file:
            for source in (existing_file, new_file):
                # Copy every page: a long summary can span more than one page
                for page in PdfReader(source).pages:
                    writer.add_page(page)
            with open(merged_path, "wb") as merged_file:
                writer.write(merged_file)

        os.replace(merged_path, pdf_path)
        # Remove the intermediate file so both branches leave the folder in the
        # same state and the file is not mistaken for a scraped page PDF.
        os.remove(output_path)
    

    def process_saved_pdfs(self):
        """Summarize every scraped page PDF and add it to the query's summary PDF.

        Reads ``<query>.xlsx`` (columns ``link`` and ``title``). For each row
        the text of ``web_page_<row index>.pdf`` in ``self.folder_path`` is
        extracted, keywords are extracted from it, and the text is summarized
        in two passes: per chunk, then once more over the joined chunk
        summaries. The result is passed to
        ``save_summary_and_keywords_to_pdf``. Rows whose PDF file does not
        exist are reported and skipped. The loop pauses for one second after
        each processed row.
        """
        df = pd.read_excel(f'{self.query}.xlsx')
        for index, row in df.iterrows():
            url = row['link']
            pdf_path = os.path.join(self.folder_path, f"web_page_{index}.pdf")
            if not os.path.exists(pdf_path):
                # A page that could not be scraped has no PDF; skip it rather
                # than abort the remaining rows.
                print(f"No saved PDF for {url} ({pdf_path}), skipping")
                continue

            pdf_content = self.extract_text_from_pdf(pdf_path)
            keywords = self.extract_keywords(pdf_content)

            # First pass: summarize_text already splits the text into chunks and
            # joins the per-chunk summaries, so no extra chunking is needed here.
            aggregated_summary = self.summarize_text(pdf_content)

            # Second pass: condense the joined chunk summaries
            final_summary = self.summarize_text(aggregated_summary)

            # Save the summary and keywords to a PDF
            self.save_summary_and_keywords_to_pdf(url, row['title'], keywords, final_summary)

            # Sleep for 1 second
            time.sleep(1)
            
    def generate_article(self):
        """Write ``<query>_final.pdf``, an overall summary of the query's summary PDF.

        The text of ``<query>_summary.pdf`` in ``self.folder_path`` is sent to
        the OpenAI completion model in a single prompt. The reply is wrapped
        in a small HTML page (title, today's date, body) and rendered to PDF
        with wkhtmltopdf via pdfkit. Nothing happens when the summary PDF
        does not exist.

        NOTE(review): the whole summary text goes into one prompt with a
        2000-token completion limit; long inputs may exceed the model's
        context window. TODO confirm and handle if needed.
        NOTE(review): the wkhtmltopdf path is hard-coded for a Windows install.
        """
        pdf_path = os.path.join(self.folder_path, f"{self.query}_summary.pdf")
        if not os.path.exists(pdf_path):
            # No summary PDF yet, so there is nothing to build the article from
            return

        pdf_content = self.extract_text_from_pdf(pdf_path)

        prompt = f"The following text is scrapped text from multiple webpages as the results of {self.query}. \n Produce a total professional summary from the text below around 1000 words:\n{pdf_content}\n"
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            max_tokens=2000,
            n=1,
            stop=None,
            temperature=0.5,
        )
        generated_article = response.choices[0].text.strip()

        # Date shown under the title
        today = datetime.now().date()

        # The paragraph is closed with </p>; the previous markup opened a second
        # <p> instead of closing the first one.
        news_article = f"<h1>Title: The Total Summary of {self.query} </h1> \n <h4> Date: {today} </h4> \n\n <p>{generated_article}</p>"

        options = {
            'no-stop-slow-scripts': True,
            'load-error-handling': 'ignore',
            'encoding': "UTF-8",
        }
        path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
        config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
        pdfkit.from_string(news_article, os.path.join(self.folder_path, f"{self.query}_final.pdf"), configuration=config, options=options)


    # def process_saved_pdfs(self):
    #     df = pd.read_excel(f'{self.query}.xlsx')
    #     for index, row in df.iterrows():
    #         url = row['link']
    #         pdf_filename = f"web_page_{index}.pdf"
    #         pdf_path = os.path.join(self.folder_path, pdf_filename)
    #         pdf_content = self.extract_text_from_pdf(pdf_path)
    #         keywords = self.extract_keywords(pdf_content)
    #         summary = self.summarize_text(pdf_content)

    #         # Create a PDF object
    #         pdf = fpdf()

    #         # Add a page
    #         pdf.add_page()

    #         # Set font
    #         pdf.set_font("Arial", size = 12)

    #         # Add a cell
    #         pdf.cell(200, 10, txt = f"URL: {url}", ln = True, align = 'L')
    #         pdf.cell(200, 10, txt = f"Title: {title}", ln = True, align = 'L')
    #         pdf.cell(200, 10, txt = f"Keywords: {', '.join(keywords)}", ln = True, align = 'L')
    #         pdf.cell(200, 10, txt = f"Summary: {summary}", ln = True, align = 'L')

    #         # Save the pdf with name .pdf
    #         pdf_filename = f"{self.query}_summary.pdf"
    #         pdf_path = os.path.join(self.folder_path, pdf_filename)
            
    #         if os.path.exists(pdf_path):
    #             # If the summary PDF already exists, append the new summary to it
    #             writer = PdfFileWriter()

    #             # Existing PDF
    #             existing_pdf = PdfFileReader(open(pdf_path, "rb"))
    #             writer.addPage(existing_pdf.getPage(0))

    #             # New PDF
    #             new_pdf = PdfFileReader(pdf)
    #             writer.addPage(new_pdf.getPage(0))

    #             with open(pdf_path, "wb") as f:
    #                 writer.write(f)
    #         else:
    #             # If the summary PDF doesn't exist, create a new one
    #             pdf.output(pdf_path)
                
    #         # Sleep for 1 second
    #         time.sleep(1)
    
    # def process_saved_pdfs(self):
    #     df = pd.read_excel(f'{self.query}.xlsx')
    #     for index, row in df.iterrows():
    #         url = row['link']
    #         pdf_filename = f"web_page_{index}.pdf"
    #         pdf_path = os.path.join(self.folder_path, pdf_filename)
    #         pdf_content = self.extract_text_from_pdf(pdf_path)
    #         keywords = self.extract_keywords(pdf_content)
    #         summary = self.summarize_text(pdf_content)
    #         self.save_summary_and_keywords_to_pdf(url, row['title'], keywords, summary)
            
    #         # Sleep for 1 second
    #         time.sleep(1)

                



    def handle_structured_data(self, data):
        """Placeholder for turning scraped data into structured data.

        Not implemented yet: ``data`` is ignored and ``None`` is returned.
        Intended scope, per the original notes: parsing HTML, handling
        databases or spreadsheets, and similar.
        """
        return None

    # Content Generation

    def personalize_content(self, user_profile, content):
        """Placeholder for adapting content to a user's profile or preferences.

        Not implemented yet: both arguments are ignored and ``None`` is
        returned. Intended scope, per the original notes: filtering, sorting,
        or generating new content.
        """
        return None

    def fact_check(self, content):
        """Placeholder for verifying the accuracy of content.

        Not implemented yet: ``content`` is ignored and ``None`` is returned.
        Intended scope, per the original notes: cross-referencing sources,
        using fact-checking APIs, or similar techniques.
        """
        return None

    def produce_article(self, structured_data):
        """Placeholder for generating an article or report from structured data.

        Not implemented yet: ``structured_data`` is ignored and ``None`` is
        returned. Intended scope, per the original notes: natural language
        generation techniques.
        """
        return None

    # Testing and Feedback

    def test_article(self, article):
        # Test the generated article for readability, accuracy, relevance, etc.
        # This could involve NLP metrics, user feedback, or other tests.
        pass



In [2]:
# Create an instance of the AutomatedContentGenerator class.
# The API keys are read from the environment so that no secret is stored in
# the notebook. Set GOOGLE_API_KEY and OPENAI_API_KEY before running this cell.
# NOTE: the keys that were previously hard-coded here are exposed in the
# notebook's history and should be revoked.
acg = AutomatedContentGenerator(
    api_key=os.environ['GOOGLE_API_KEY'],
    cx='d4c72f123415549d9',
    query='chatgpt use cases 2023 news',
    num_results=5,
    openai_api_key=os.environ['OPENAI_API_KEY'],
)


In [5]:
acg.search_and_scrape_web()

In [6]:
acg.save_search_results_to_excel()

In [7]:
acg.scrape_and_save()

Could not find main content in https://research.aimultiple.com/chatgpt-use-cases/
Could not find main content in https://www.bloomberg.com/news/articles/2023-04-18/ai-therapy-becomes-new-use-case-for-chatgpt


In [137]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaden\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# For every PDF in acg.folder_path: extract the text, compute sentence embeddings
# and similarities, and write them to "<pdf file name>_embeddings.csv".
acg.process_pdfs_and_save_embeddings()

In [26]:
# For every "*_embeddings.csv" in acg.folder_path: build the cluster plot and save
# it as "<csv file name>_cluster.jpeg" (files that yield no image are skipped).
acg.create_clustering_images_from_saved_embeddings()

Not enough sentences to cluster


AttributeError: 'NoneType' object has no attribute 'read'

In [14]:
# Summarize each scraped page PDF listed in "<query>.xlsx" and collect the
# summaries in "<query>_summary.pdf".
acg.process_saved_pdfs()

In [3]:
# Condense "<query>_summary.pdf" into one overall summary saved as "<query>_final.pdf".
acg.generate_article()

In [None]:
len('Title:')

6

In [26]:
dir(PyPDF2)

['DocumentInformation',
 'PageObject',
 'PageRange',
 'PaperSize',
 'PasswordType',
 'PdfFileMerger',
 'PdfFileReader',
 'PdfFileWriter',
 'PdfMerger',
 'PdfReader',
 'PdfWriter',
 'Transformation',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_cmap',
 '_codecs',
 '_encryption',
 '_merger',
 '_page',
 '_protocols',
 '_reader',
 '_security',
 '_utils',
 '_version',
 '_writer',
 'constants',
 'errors',
 'filters',
 'generic',
 'pagerange',
 'papersizes',
 'parse_filename_page_ranges',
 'types',
 'xmp']

In [36]:
import requests

url = 'https://www.salesforce.com/news/press-releases/2023/03/07/einstein-generative-ai/'
response = requests.get(url)

print(response.headers['Content-Type'])


text/html; charset=UTF-8


In [34]:
word = "word"
len(word)



4

In [12]:
print(acg.scrape_summarize_and_save_beta)

<bound method AutomatedContentGenerator.scrape_summarize_and_save_beta of <__main__.AutomatedContentGenerator object at 0x0000029DF1D0A810>>


In [25]:
# Scrape web pages and save them as PDFs
acg.scrape_summarize_and_save()

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 4418 tokens (3818 in your prompt; 600 for the completion). Please reduce your prompt; or completion length.

In [7]:
# Extract text from the scraped PDFs
pdf_texts = acg.extract_text_from_pdfs_in_folder()

In [8]:
# Generate summaries of the extracted text
summaries = {title: acg.summarize_text(text) for title, text in pdf_texts.items()}


Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)

In [None]:
# Save the summaries to a PDF file
acg.save_summaries(summaries)

In [12]:
import openai
import pandas as pd
from pytube import YouTube
from moviepy.editor import AudioFileClip
import whisper

class ConferenceSummarizer:
    """Download a YouTube video's audio, transcribe it with Whisper and summarize it with OpenAI.

    The methods build on each other and are meant to be called in this order:
    ``download_youtube_audio``, ``trim_audio``, ``transcribe_audio``,
    ``summarize_transcription``. All file names are derived from the video
    title and are relative to the current working directory.
    """

    def __init__(self, api_key, cx, query, num_results, openai_api_key):
        """Store the search settings and configure the OpenAI client.

        Args:
            api_key: Stored as ``self.api_key``; not used by the methods of this class.
            cx: Stored as ``self.cx``; not used by the methods of this class.
            query: Stored as ``self.query``; not used by the methods of this class.
            num_results: Stored as ``self.num_results``; not used by the methods of this class.
            openai_api_key: Assigned to the module-wide ``openai.api_key``.
        """
        self.api_key = api_key
        self.cx = cx
        self.query = query
        self.num_results = num_results
        self.results = []
        openai.api_key = openai_api_key

    def download_youtube_audio(self, youtube_video_url):
        """Download the first audio-only stream of a video as ``<title>.mp4``.

        The ``YouTube`` object is kept on ``self.youtube_video`` because the
        other methods derive their file names from its title.

        Returns:
            str: The file name that was requested for the download.
        """
        self.youtube_video = YouTube(youtube_video_url)
        streams = self.youtube_video.streams.filter(only_audio=True)
        stream = streams.first()
        stream.download(filename=f'{self.youtube_video.title}.mp4')
        return f'{self.youtube_video.title}.mp4'

    def trim_audio(self, start_time, end_time):
        """Cut ``<title>.mp4`` to ``[start_time, end_time]`` and save it as ``trimmed_<title>.mp3``.

        Args:
            start_time: Start of the excerpt, passed to ``subclip``.
            end_time: End of the excerpt, passed to ``subclip``.
        """
        # Load the audio file with moviepy
        audio_clip = AudioFileClip(f'{self.youtube_video.title}.mp4')
        try:
            # Trim the audio and write the result to a file
            trimmed_clip = audio_clip.subclip(start_time, end_time)
            trimmed_clip.write_audiofile(f'trimmed_{self.youtube_video.title}.mp3')
        finally:
            # Release the underlying reader even when trimming or writing fails
            audio_clip.close()

    def transcribe_audio(self):
        """Transcribe ``trimmed_<title>.mp3`` with the Whisper ``base`` model.

        Returns:
            The object returned by ``model.transcribe``. In this notebook it
            prints as a dict whose ``'text'`` entry holds the transcript.
        """
        model = whisper.load_model('base')
        output = model.transcribe(f'trimmed_{self.youtube_video.title}.mp3')
        return output

    def summarize_transcription(self, transcription):
        """Summarize a transcript and save the report to ``final_summary_<title>.txt``.

        The transcript is split into pieces of at most 3000 whitespace-
        separated words. Each piece is summarized on its own, and the joined
        summaries are then condensed into one report, which is written to
        disk as UTF-8.

        Args:
            transcription: The dict returned by ``transcribe_audio`` (the
                transcript is read from its ``'text'`` entry, falling back to
                a ``'transcription'`` entry), or the transcript as a plain
                string.

        Raises:
            ValueError: If the transcript contains no words.
        """
        if isinstance(transcription, str):
            transcription_text = transcription
        else:
            # Whisper stores the transcript under 'text'; the previously used
            # 'transcription' key is kept as a fallback for hand-built dicts.
            transcription_text = transcription.get('text') or transcription.get('transcription', '')

        tokens = transcription_text.split()
        if not tokens:
            # Without this check the model would be asked to summarize nothing
            # and an invented report would be written to disk.
            raise ValueError("transcription contains no text to summarize")

        # Split the transcription into chunks of 3000 words each
        chunks = [' '.join(tokens[i:i + 3000]) for i in range(0, len(tokens), 3000)]

        summaries = []
        for chunk in chunks:
            # Summarize each chunk using the OpenAI API
            prompt = f"Please provide a bit detailed summary of the following text with keywords max 500 words:\n{chunk}"
            response = openai.Completion.create(
                model="text-davinci-003",
                prompt=prompt,
                max_tokens=1000,
                n=1,
                stop=None,
                temperature=0.5,
            )
            summaries.append(response.choices[0].text.strip())

        # Aggregate the summaries
        aggregated_summary = ' '.join(summaries)

        # Summarize the aggregated summary
        prompt = f"The following text are aggregated summaries of a conference. Aggregate the text and produce an report article like Bloomberg article, max 1000 words with core keywords:\n{aggregated_summary}"
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            max_tokens=2000,
            n=1,
            stop=None,
            temperature=0.5,
        )
        final_summary = response.choices[0].text.strip()

        # Save the final summary to a text file. An explicit encoding keeps
        # non-ASCII characters from failing on platforms with a narrow default.
        with open(f'final_summary_{self.youtube_video.title}.txt', 'w', encoding='utf-8') as f:
            f.write(final_summary)


In [None]:
dir(AudioFileClip.write_audiofile)

In [13]:
# The API keys are read from the environment so that no secret is stored in
# the notebook. Set GOOGLE_API_KEY and OPENAI_API_KEY before running this cell.
# NOTE: the keys that were previously hard-coded here are exposed in the
# notebook's history and should be revoked.
summarizer = ConferenceSummarizer(
    api_key=os.environ['GOOGLE_API_KEY'],
    cx='d4c72f123415549d9',
    query='fomc conference 2023',
    num_results=5,
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

In [15]:
youtube_video_url = 'https://www.youtube.com/watch?v=ifqyTQ0Ifrw&t=855s'
summarizer.download_youtube_audio(youtube_video_url)

'FOMC Press Conference, May 3, 2023.mp4'

In [16]:
# Trim the audio (optional)
start_time = 1  # replace with the start time in seconds
end_time = 440  # replace with the end time in seconds
summarizer.trim_audio(start_time, end_time)

MoviePy - Writing audio in trimmed_FOMC Press Conference, May 3, 2023.mp3


                                                                      

MoviePy - Done.




In [17]:
# Transcribe the audio
transcription = summarizer.transcribe_audio()

In [22]:
print(transcription)

{'text': " Good afternoon. Before discussing today's meeting, let me comment briefly on recent developments in the banking sector. Conditions in that sector have broadly improved since early March, and the U.S. banking system is sound and resilient. We will continue to monitor conditions in the sector. We are committed to learning the right lessons from this episode, and will work to prevent events like these from happening again. As a first step in that process, last week we released Vice Chair for Supervision and Bars review of the Federal Reserve's supervision and regulation of Silicon Valley Bank. The reviews findings underscore the need to address our rules and supervisory practices to make for a stronger and more resilient banking system, and I'm confident that we will do so. From the perspective of monetary policy, our focus remains squarely on our dual mandate to promote maximum employment and stable prices for the American people. My colleagues and I understand the hardship th

In [23]:
# Summarize the transcription
summarizer.summarize_transcription(transcription)

In [20]:
import pandas as pd
df = pd.DataFrame(transcription)
df.to_excel('transcription.xlsx')

In [10]:
# Create an instance of the ConferenceSummarizer class.
# The API keys are read from the environment so that no secret is stored in
# the notebook. Set GOOGLE_API_KEY and OPENAI_API_KEY before running this cell.
# NOTE: the keys that were previously hard-coded here are exposed in the
# notebook's history and should be revoked.
summarizer = ConferenceSummarizer(
    api_key=os.environ['GOOGLE_API_KEY'],
    cx='d4c72f123415549d9',
    query='fomc conference 2023',
    num_results=5,
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

# Download the audio from a YouTube video
youtube_video_url = 'https://www.youtube.com/watch?v=ifqyTQ0Ifrw&t=855s'  # replace with your YouTube video URL
summarizer.download_youtube_audio(youtube_video_url)

# Trim the audio (transcribe_audio reads the trimmed file, so this step is required)
start_time = 1  # replace with the start time in seconds
end_time = 440  # replace with the end time in seconds
summarizer.trim_audio(start_time, end_time)

# Transcribe the audio
transcription = summarizer.transcribe_audio()

# Summarize the transcription
summarizer.summarize_transcription(transcription)


MoviePy - Writing audio in trimmed_FOMC Press Conference, May 3, 2023.mp3


                                                                      

MoviePy - Done.
