* Click on 'Files' on the side menu and upload input.csv with the links inserted in column A.
* To avoid errors, try not to have more than 50-60 links in the input file.
* If you have more links, you can run the code multiple times by updating the input.csv file each time.

**This is the only block of code in which you may need to make changes (depending on the use case):**

In [None]:
# Instruction text sent to the model ahead of each document's extracted text.
# To modify the prompt, change the text between the triple quotation marks below.
# The model is asked for JSON because the reply is parsed with json.loads later on.
prompt = """
Go through the document and share with me the following in JSON format (This is important. I want to avoid errors later when I try to parse the output):
      1. Language
      2. Summary in a paragraph with five sentences. While writing the summary, please assume that I already know about the Spotlight initiative. Do not define it for me again.
      3. Target Stakeholders (For this, use one or more of the following: 'General audience', 'Policymakers', 'Funders', 'CSOs', 'Communities', 'Private Sector', 'Coalitions', 'Spotlight teams', 'Service providers (e.g., care centers)', 'Multilateral/ bilateral organizations', 'Others'). Please know that service providers here would also include law enforcement/ police/ courts etc.
      4. Target Geography (choose a country or a region that includes multiple countries like South East Asia. If it is not restricted to a single region, 'Global' will be the correct option). Do not provide an explanation.
      5. Name of the Funder (Name of organizations like UNDP, UNFPA etc.). Do not provide any explanation. Leave blank if 'Funder' is not known or not applicable. Also, spotlight initiative is funded by the EU. If the funder is Spotlight, write Spotlight (and not EU)
      6. Research Organizations (Specify the name of the organizations like Spotlight, UNDP etc.). Do not provide any explanation. Leave blank if 'Research Organization' is not known or not applicable.
"""

**Do not make any changes in the sections below**

In [None]:
!pip install -U -q google-generativeai # Install the Python SDK

In [None]:
!pip install PyMuPDF

In [None]:
import google.generativeai as genai

import requests
from bs4 import BeautifulSoup
import csv
import os
import sys
import fitz
import json
import re
import time

from google.api_core.exceptions import TooManyRequests
from urllib.parse import urlparse
from google.colab import userdata
# Import the API key from config.py
from config import GOOGLE_API_KEY

# Configure genai with the imported API key
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
def download_pdf(url, filename):
    """Download the resource at *url* and save its raw bytes to *filename*.

    Args:
        url: Address of the file to download.
        filename: Path of the local file to (over)write.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    # A timeout keeps one unresponsive server from hanging the whole batch.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an HTML error page as a "PDF".
    response.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(response.content)

In [None]:
def pdf_to_text(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The concatenated page text, or None if the file could not be opened
        or read (the error is printed rather than raised).
    """
    try:
        # The context manager closes the document even when extracting a page
        # fails part-way through, so the file handle is never leaked.
        with fitz.open(pdf_file) as document:
            return "".join(page.get_text() for page in document)

    except FileNotFoundError:
        print(f"Error: File '{pdf_file}' not found.")
        return None

    except Exception as e:
        print(f"Error: An unexpected error occurred: {str(e)}")
        return None

In [None]:
def extract_file_id(url):
    """Return the file ID that follows '/d/' in *url*, or None if there is none.

    The ID is the longest run of letters, digits, underscores and hyphens
    immediately after the first '/d/' segment of the URL.
    """
    found = re.search(r'/d/([a-zA-Z0-9_-]+)', url)
    return found.group(1) if found else None

In [None]:
def obtainfromDocs(file_id):
    """Download a Google Docs document as plain text.

    Args:
        file_id: The document ID taken from the Google Docs URL.

    Returns:
        The exported text, or None if no ID was supplied or the download
        failed (the error is printed rather than raised).
    """
    # Without an ID the export URL would point at ".../d/None/export";
    # skip the pointless request and report the problem directly.
    if not file_id:
        print('Error downloading from Docs: no file ID found in the link')
        return None

    url = f'https://docs.google.com/document/d/{file_id}/export?format=txt'
    try:
        # A timeout keeps one unresponsive request from hanging the batch.
        response = requests.get(url, timeout=60)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        return response.text

    except requests.exceptions.RequestException as e:
        # Handle any request exceptions (including 400 errors and timeouts)
        print(f'Error downloading from Docs: {e}')
        return None

In [None]:
def obtainfromDrive(drive_link, save_path):
    """Download a PDF shared through Google Drive and return its text.

    Args:
        drive_link: Google Drive sharing link of the file.
        save_path: Local path where the downloaded PDF is written.

    Returns:
        The text extracted by pdf_to_text, or None if the link could not be
        interpreted or the download failed (the error is printed).
    """
    # Imported locally so this function does not rely on names beyond the
    # notebook's existing top-level imports.
    from urllib.parse import parse_qs, urlparse

    try:
        # Work out the file ID. Prefer the explicit ".../d/<id>" form, then a
        # "?id=<id>" query parameter, and only then fall back to the
        # second-to-last path segment (the original heuristic).
        id_match = re.search(r'/d/([a-zA-Z0-9_-]+)', drive_link)
        if id_match:
            file_id = id_match.group(1)
        else:
            query_ids = parse_qs(urlparse(drive_link).query).get('id')
            file_id = query_ids[0] if query_ids else drive_link.split("/")[-2]

        # Constructing direct download link for the file
        download_link = f"https://drive.google.com/uc?id={file_id}"

        # A timeout keeps one unresponsive request from hanging the batch;
        # raise_for_status stops an HTTP error page being saved as the PDF.
        response = requests.get(download_link, timeout=60)
        response.raise_for_status()

        # Saving the PDF file
        with open(save_path, 'wb') as f:
            f.write(response.content)

        print(f"Downloaded {drive_link} to {save_path}")
        return pdf_to_text(save_path)
    except Exception as e:
        print(f"Failed to download {drive_link}: {str(e)}")
        return None


In [None]:
def obtainfromWebpage(url):
    """Fetch the document behind *url* and return its text.

    The link is handled according to its form:
      * a URL containing '.pdf' is downloaded and converted directly;
      * a 'docs.google' URL is exported as text through obtainfromDocs;
      * any other page is scanned for its first link ending in '.pdf',
        which is downloaded and converted; if there is no such link, or it
        cannot be downloaded, the URL is tried as a Google Drive link.

    Args:
        url: The link read from the input file. A leading BOM, surrounding
            whitespace and a missing scheme are tolerated.

    Returns:
        The extracted text, or None on any failure (the error is printed).
    """
    # Imported locally so this function does not rely on names beyond the
    # notebook's existing top-level imports.
    from urllib.parse import urljoin

    temp_pdf = 'temppdf.pdf'

    try:
        # Remove leading BOM character if present, then stray whitespace
        # that spreadsheet exports often leave around a cell value.
        if url.startswith('\ufeff'):
            url = url[1:]
        url = url.strip()

        # Prepend https:// as a default scheme if missing
        if not urlparse(url).scheme:
            url = 'https://' + url

        if '.pdf' in url:
            download_pdf(url, temp_pdf)
            print(f"File '{temp_pdf}' downloaded successfully")
            return pdf_to_text(temp_pdf)

        if 'docs.google' in url:
            return obtainfromDocs(extract_file_id(url))

        # A timeout keeps one unresponsive server from hanging the batch.
        response = requests.get(url, timeout=60)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the first PDF link on the webpage
        link = soup.find('a', href=lambda href: href and href.endswith('.pdf'))
        if not link:
            return obtainfromDrive(url, temp_pdf)

        # urljoin resolves relative links ('/files/a.pdf', 'a.pdf', '../a.pdf')
        # against the page address; plain concatenation produced wrong URLs.
        pdf_url = urljoin(url, link.get('href'))
        print(f"Downloading file: {pdf_url}")

        response_pdf = requests.get(pdf_url, timeout=60)
        if response_pdf.status_code != 200:
            return obtainfromDrive(url, temp_pdf)

        with open(temp_pdf, 'wb') as pdf_file:
            pdf_file.write(response_pdf.content)

        print(f"File '{temp_pdf}' downloaded successfully")
        return pdf_to_text(temp_pdf)

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None


In [None]:
def datatoCSV(newtext):
    """Append one JSON object, flattened to a single row, to 'output.csv'.

    List values are joined into one comma-separated cell. A header row made
    of the object's keys is written first when the file is still empty.
    If *newtext* is not a string holding a JSON object, a single 'Error' row
    is appended instead so the output keeps one row per input.

    Args:
        newtext: Text expected to contain a JSON object.
    """
    existing_csv_file = 'output.csv'

    json_object = None
    # Tolerate leading whitespace before the opening brace.
    if isinstance(newtext, str) and newtext.lstrip().startswith('{'):
        try:
            json_object = json.loads(newtext)
        except json.JSONDecodeError as e:
            # Text that merely starts with '{' is not necessarily valid JSON.
            print(f"Error parsing JSON: {str(e)}")

    if not isinstance(json_object, dict):
        print("Invalid JSON format. Skipping this entry.")
        with open(existing_csv_file, 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(['Error'])
        return

    # Flatten list values; str() keeps numbers and other non-string items
    # from breaking the join.
    flattened_data = {
        key: ', '.join(str(item) for item in value) if isinstance(value, list) else value
        for key, value in json_object.items()
    }

    with open(existing_csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the header only if the file is empty (position 0 at its end).
        file.seek(0, 2)
        if file.tell() == 0:
            writer.writerow(flattened_data.keys())

        writer.writerow(flattened_data.values())

    print("Data has been successfully appended to", existing_csv_file)

In [None]:
def append_to_csv(json_text, csv_filename):
    """Append one JSON object as a row of *csv_filename*.

    The object's keys become the header, written only when the file does not
    exist yet or is empty. If *json_text* cannot be parsed, or does not hold
    a JSON object, a single 'Error(JSON Decoding Issue)' row is appended to
    the same file so the output keeps one row per input.

    Args:
        json_text: Text expected to contain a JSON object.
        csv_filename: Path of the CSV file to append to.
    """
    try:
        data = json.loads(json_text)
    except (json.JSONDecodeError, TypeError) as e:
        print(f"Error parsing JSON: {str(e)}")
        data = None
    else:
        # Valid JSON that is not an object (a list, a number, ...) has no
        # keys to use as columns, so it is reported like a decoding failure.
        if not isinstance(data, dict):
            print(f"Error parsing JSON: expected an object, got {type(data).__name__}")
            data = None

    if data is None:
        # Record the failure in the file the caller asked for, not in a
        # hard-coded 'output.csv'.
        with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile).writerow(['Error(JSON Decoding Issue)'])
        return

    # Headers are needed only when the file is missing or has no content.
    needs_header = not os.path.exists(csv_filename) or os.path.getsize(csv_filename) == 0

    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(data.keys()))
        if needs_header:
            writer.writeheader()
        writer.writerow(data)


In [None]:
# Batch driver: read links from input.csv, fetch each document's text, send it
# to the model together with the prompt, and write one row per link to
# output.csv (either the parsed answer or an error marker).

MAX_GENERATION_ATTEMPTS = 3   # total tries per link when the API rate-limits
RATE_LIMIT_WAIT_SECONDS = 20  # pause between those tries


def _append_error_row(message, csv_filename='output.csv'):
    """Append a single-cell row holding *message* to the output CSV."""
    with open(csv_filename, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow([message])


# Start every run with a fresh output file.
try:
    os.remove('output.csv')
except FileNotFoundError:
    print("Files not found, skipping deletion.")
except Exception as e:
    print(f"Error deleting files: {str(e)}")

# The model configuration is identical for every link, so build it once.
model = genai.GenerativeModel('gemini-1.5-flash', generation_config={"response_mime_type": "application/json", "temperature": 0.3})

# utf-8-sig drops a leading byte-order mark that spreadsheet exports may add.
with open('input.csv', mode='r', newline='', encoding='utf-8-sig') as input_file:
    for row in csv.reader(input_file):
        # Remove leftovers from the previous link. Each file gets its own
        # try so a missing first file does not skip the second one.
        for temp_file in ('temppdf.pdf', 'temptext.txt'):
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                pass  # nothing left over from the previous link
            except Exception as e:
                print(f"Error deleting files: {str(e)}")

        # The URL is expected in the first column. A blank line yields an
        # empty row, which is treated like a placeholder entry.
        fetch_url = row[0].strip() if row else ''

        # Placeholders such as '-' are too short to be a link.
        if len(fetch_url) < 5:
            print("Error encountered")
            _append_error_row('Error (because of -)')
            continue

        # Download the document and convert it to text
        text = obtainfromWebpage(fetch_url)
        if not text:
            _append_error_row('Error (text file empty)')
            print("Failed to convert PDF to text.")
            continue

        print("PDF converted to text successfully!")

        # Build the request in a separate variable. Reassigning `prompt`
        # itself would carry every earlier document into later requests.
        full_prompt = prompt + text

        response = None
        for attempt in range(1, MAX_GENERATION_ATTEMPTS + 1):
            try:
                response = model.generate_content(full_prompt)
                break
            except TooManyRequests:
                if attempt == MAX_GENERATION_ATTEMPTS:
                    print("Rate limit exceeded. Giving up on this link.")
                else:
                    print(f"Rate limit exceeded. Retrying in {RATE_LIMIT_WAIT_SECONDS} seconds...")
                    time.sleep(RATE_LIMIT_WAIT_SECONDS)
            except Exception as e:
                # One failing link must not abort the rest of the batch.
                print(f"Error generating content: {str(e)}")
                break

        if response is None:
            _append_error_row('Error (generation failed)')
            continue

        if response.candidates and response.candidates[0].content.parts:
            newtext = response.candidates[0].content.parts[0].text
            append_to_csv(newtext, "output.csv")
        else:
            _append_error_row('Error (because of no content parts)')


