1. **Install required packages**

In [None]:
!pip install openai
!pip install pdf2image pytesseract
!brew install poppler
!apt-get install -y poppler-utils
!apt install tesseract-ocr

In [None]:
# Pin openai to 0.28: the summarizer class below calls openai.ChatCompletion.create,
# presumably the interface this version provides -- confirm before upgrading.
!pip install openai==0.28

2. **Define the summarizer class -- (A) file input, (B) string input**

In [None]:
import os
from tkinter import filedialog

import openai
import pytesseract
from pdf2image import convert_from_path

class GPTSummarizer:
    """Summarize long text with the OpenAI chat API, one fixed-size chunk at a time.

    The input is cut into pieces of at most ``max_input_length`` characters and
    each piece is summarized by ``gpt-3.5-turbo`` independently. The class also
    bundles a few file helpers (plain-text reading, PDF OCR, PDF detection) that
    do not depend on instance state and are therefore static methods.
    """

    def __init__(self, api_key, max_input_length=4096):
        """Store the settings and register the API key with the ``openai`` module.

        Args:
            api_key: OpenAI API key.
            max_input_length: Maximum chunk size, measured in characters
                (``len`` / slicing), not in model tokens.

        Raises:
            ValueError: If ``max_input_length`` is not positive; chunking could
                never make progress with such a value.
        """
        if max_input_length <= 0:
            raise ValueError("max_input_length must be a positive integer")
        self.api_key = api_key
        self.max_input_length = max_input_length
        # The key is stored on the openai module itself, so it is shared by every instance.
        openai.api_key = self.api_key

    def _cut_input(self, text):
        """Split ``text`` into consecutive chunks of at most ``max_input_length`` characters.

        Text that already fits (including the empty string) is returned as a
        single-element list.
        """
        limit = self.max_input_length
        if len(text) <= limit:
            return [text]
        return [text[start:start + limit] for start in range(0, len(text), limit)]

    def summarize(self, text):
        """Summarize ``text`` chunk by chunk.

        Returns:
            A list with one summary string per chunk, in document order.
        """
        return [self._summarize_chunk(chunk) for chunk in self._cut_input(text)]

    def _summarize_chunk(self, text):
        """Send one chunk to the chat completion endpoint and return the reply text."""
        messages = [{"role": "system", "content": "You are a helpful summarizer"},
                    {"role": "user", "content": f"Summarize the following: {text}"}]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )

        return response.choices[0].message['content']

    @staticmethod
    def read_from_file(filename):
        """Return the full contents of ``filename`` decoded as UTF-8 text."""
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read()

    @staticmethod
    def extract_text_from_pdf(pdf_path):
        """Render each page of the PDF to an image and OCR it.

        Returns:
            The recognized text of all pages concatenated in page order.
        """
        images = convert_from_path(pdf_path)
        return "".join(pytesseract.image_to_string(image) for image in images)

    @staticmethod
    def is_pdf(file_path):
        """Return True only if the path ends in ``.pdf`` AND the file starts with ``%PDF``.

        The extension is checked first (case-insensitively); the file is opened
        only when the extension matches.
        """
        if not file_path.lower().endswith('.pdf'):
            return False

        with open(file_path, 'rb') as file:
            header = file.read(4)
        return header == b'%PDF'

    @staticmethod
    def combine_strings(string_list, separator=""):
        """Join ``string_list`` into one string.

        Args:
            string_list: Iterable of strings, e.g. the result of ``summarize``.
            separator: Text inserted between items. Defaults to the empty
                string, which concatenates the items directly.
        """
        return separator.join(string_list)

# Test

# Method returns True if the given file path is a PDF
# is_pdf_file = GPTSummarizer.is_pdf('path_to_your_file')
# print(is_pdf_file)  # True if it's a PDF, False otherwise



# Method to read a .txt file
# file_content = GPTSummarizer.read_from_file('path_to_your_file.txt')
# print(file_content)


# Method to read a pdf file
# pdf_content = GPTSummarizer.extract_text_from_pdf('path_to_your_file.pdf')
# print(pdf_content)




# For testing: put the input string here
# text_to_summarize = ("The solar system consists of the Sun and the objects that orbit it."
#                     "These objects include planets, moons, asteroids, comets, and more. "
#                     "The solar system is vast, with the Sun containing over 99.8% of its total mass.")


# text_to_summarize is where you put the input string for the summarizer
# summaries = summarizer.summarize(text_to_summarize)
# print(summaries)


# To select a file from the computer
#file_path = filedialog.askopenfilename(title="Select a document", filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")])
#print(file_path)

# Read the API key from the environment so the secret is never stored in the notebook.
# NOTE(review): any key that was previously committed in this cell should be revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
summarizer = GPTSummarizer(api_key)


#is_pdf_file = GPTSummarizer.is_pdf(file_path)
file_path = "/content/1981GrahamFourier.pdf"
is_pdf_file = GPTSummarizer.is_pdf(file_path)


def _summarize_with_retry(source_text):
    """Summarize ``source_text``, retrying exactly once if the first attempt raises."""
    try:
        return summarizer.summarize(source_text)
    except Exception as exc:  # a bare ``except`` would also trap KeyboardInterrupt
        print("gpt error retrying")
        print(f"cause: {exc!r}")
        return summarizer.summarize(source_text)


# Set to True to type the text to summarize instead of reading it from file_path.
input_text = False

if input_text:
    text = input("Input string: ")
    source_text = text
elif is_pdf_file:
    pdf_content = GPTSummarizer.extract_text_from_pdf(file_path)

    print("PDF to text complete")
    print("-"*20)
    print(pdf_content)
    print("-"*20)
    source_text = pdf_content
else:
    file_content = GPTSummarizer.read_from_file(file_path)
    print(file_content)
    source_text = file_content

# One shared summarize/print path for all three input sources.
summaries = _summarize_with_retry(source_text)
cs = GPTSummarizer.combine_strings(summaries)

print(cs)




In [None]:
# List the current working directory (presumably to check that the input files are in place).
!ls

In [5]:
from numpy.lib.npyio import savez_compressed
import openai
from pdf2image import convert_from_path
import pytesseract
from tkinter import filedialog
import os

# -- A) Input file

class GPTSummarizer:
    """Produce bullet-point summaries of long text with the OpenAI chat API.

    The input is cut into pieces of at most ``max_input_length`` characters and
    each piece is summarized by ``gpt-3.5-turbo`` independently. The class also
    bundles a few file helpers (plain-text reading, PDF OCR, PDF detection) that
    do not depend on instance state and are therefore static methods.
    """

    def __init__(self, api_key, max_input_length=4096):
        """Store the settings and register the API key with the ``openai`` module.

        Args:
            api_key: OpenAI API key.
            max_input_length: Maximum chunk size, measured in characters
                (``len`` / slicing), not in model tokens.

        Raises:
            ValueError: If ``max_input_length`` is not positive; chunking could
                never make progress with such a value.
        """
        if max_input_length <= 0:
            raise ValueError("max_input_length must be a positive integer")
        self.api_key = api_key
        self.max_input_length = max_input_length
        # The key is stored on the openai module itself, so it is shared by every instance.
        openai.api_key = self.api_key

    def _cut_input(self, text):
        """Split ``text`` into consecutive chunks of at most ``max_input_length`` characters.

        Text that already fits (including the empty string) is returned as a
        single-element list.
        """
        limit = self.max_input_length
        if len(text) <= limit:
            return [text]
        return [text[start:start + limit] for start in range(0, len(text), limit)]

    def summarize(self, text):
        """Summarize ``text`` chunk by chunk.

        Returns:
            A list with one summary string per chunk, in document order.
        """
        return [self._summarize_chunk(chunk) for chunk in self._cut_input(text)]

    def _summarize_chunk(self, text):
        """Send one chunk to the chat completion endpoint and return the reply text."""
        messages = [{"role": "system", "content": "You are a helpful summarizer. Give me bullet point summarization and keypoints to know"},
                    {"role": "user", "content": f"Summarize the following: {text}"}]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )

        return response.choices[0].message['content']

    @staticmethod
    def read_from_file(filename):
        """Return the full contents of ``filename`` decoded as UTF-8 text."""
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read()

    @staticmethod
    def extract_text_from_pdf(pdf_path):
        """Render each page of the PDF to an image and OCR it.

        Returns:
            The recognized text of all pages concatenated in page order.
        """
        images = convert_from_path(pdf_path)
        return "".join(pytesseract.image_to_string(image) for image in images)

    @staticmethod
    def is_pdf(file_path):
        """Return True only if the path ends in ``.pdf`` AND the file starts with ``%PDF``.

        The extension is checked first (case-insensitively); the file is opened
        only when the extension matches.
        """
        if not file_path.lower().endswith('.pdf'):
            return False

        with open(file_path, 'rb') as file:
            header = file.read(4)
        return header == b'%PDF'

    @staticmethod
    def combine_strings(string_list, separator=""):
        """Join ``string_list`` into one string.

        Args:
            string_list: Iterable of strings, e.g. the result of ``summarize``.
            separator: Text inserted between items. Defaults to the empty
                string, which concatenates the items directly.
        """
        return separator.join(string_list)

# Test

# Method returns True if the given file path is a PDF
# is_pdf_file = GPTSummarizer.is_pdf('path_to_your_file')
# print(is_pdf_file)  # True if it's a PDF, False otherwise



# Method to read a .txt file
# file_content = GPTSummarizer.read_from_file('path_to_your_file.txt')
# print(file_content)


# Method to read a pdf file
# pdf_content = GPTSummarizer.extract_text_from_pdf('path_to_your_file.pdf')
# print(pdf_content)




# For testing: put the input string here
# text_to_summarize = ("The solar system consists of the Sun and the objects that orbit it."
#                     "These objects include planets, moons, asteroids, comets, and more. "
#                     "The solar system is vast, with the Sun containing over 99.8% of its total mass.")


# text_to_summarize is where you put the input string for the summarizer
# summaries = summarizer.summarize(text_to_summarize)
# print(summaries)


# To select a file from the computer
#file_path = filedialog.askopenfilename(title="Select a document", filetypes=[("PDF files", "*.pdf"), ("All files", "*.*")])
#print(file_path)

# Read the API key from the environment so the secret is never stored in the notebook.
# NOTE(review): any key that was previously committed in this cell should be revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
summarizer = GPTSummarizer(api_key)


#is_pdf_file = GPTSummarizer.is_pdf(file_path)


# -- B) Flag for typing the text to summarize instead of supplying a file.
# It is not consulted anywhere in this cell.
input_text = False

save = []
folder_path = '/content/mytest'
# Sorted so the summaries in `save` come out in a reproducible order
# (os.listdir makes no ordering guarantee).
filenames = sorted(
    f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))
)

for filename in filenames:
    file_path = os.path.join(folder_path, filename)

    # NOTE(review): every regular file in the folder is run through PDF OCR;
    # nothing here checks that it actually is a PDF.
    pdf_content = GPTSummarizer.extract_text_from_pdf(file_path)

    try:
        summaries = summarizer.summarize(pdf_content)
    except Exception as exc:  # a bare ``except`` would also trap KeyboardInterrupt
        print("GPT error, retrying...")
        print(f"cause: {exc!r}")
        # Single retry; a second failure propagates and stops the loop.
        summaries = summarizer.summarize(pdf_content)

    cs = GPTSummarizer.combine_strings(summaries)
    save.append(cs)


print(save)



['- The human visual system contains multiple spatial-frequency channels, each sensitive to a different range of spatial frequencies in visual patterns.\n- The existence of visual neurons with different sizes of receptive fields has important implications for pattern vision.\n- There are multiple size-selective channels in the human visual system.\n- The visual system does a Fourier analysis of the visual scene.\n- The history of the multiple-channels model and its current status is reviewed.\n- Mathematics has played a role in the development of the multiple-channels model and could be further utilized.\n- The single-channel model of pattern vision developed into the multiple spatial-frequency channels model.\n- Vertebrate retinal ganglion cells respond to light in a broad area called the "receptive field."\n- Neurons respond differently depending on where in the receptive field the light falls.\n- The response of these neurons to light patterns is approximately linear.\n- The respons

In [6]:
# Print each document's combined summary on its own, so newlines render as line breaks.
for document_summary in save:
    print(document_summary)

- The human visual system contains multiple spatial-frequency channels, each sensitive to a different range of spatial frequencies in visual patterns.
- The existence of visual neurons with different sizes of receptive fields has important implications for pattern vision.
- There are multiple size-selective channels in the human visual system.
- The visual system does a Fourier analysis of the visual scene.
- The history of the multiple-channels model and its current status is reviewed.
- Mathematics has played a role in the development of the multiple-channels model and could be further utilized.
- The single-channel model of pattern vision developed into the multiple spatial-frequency channels model.
- Vertebrate retinal ganglion cells respond to light in a broad area called the "receptive field."
- Neurons respond differently depending on where in the receptive field the light falls.
- The response of these neurons to light patterns is approximately linear.
- The response to two spo