## Importing Dependencies

In [1]:
import requests
from htmldate import find_date
from datetime import datetime
import requests
import re
from bs4 import BeautifulSoup
import ssl
from tqdm import tqdm
from textblob import TextBlob
import openai
import multiprocessing
from time import time

from html_extractor import *
from get_suburls import *
from openai_func import *
from get_date import *
from parallel import *

from time import sleep

from keyword_extraction import keyword_extractor_paragraph as kep

## Extracting sub urls

In [2]:
urls_list = ["https://www.khaleejtimes.com" , "https://www.indiatoday.in"]
urls_list_str = ",".join(urls_list)

keywords = "gaza,israel,hamas,idf"

scraper = WebScraper2(sub_url_size = 3 , keywords = keywords)
                        # Integration with DB will make it faster in future, as fetching is much faster than scrapping.
inside_urls, failed_fetch, sub_url_size, total_size = scraper.get_suburls2(urls_list_str)

# print("Inside URLs:", inside_urls)
print("Failed Fetch:", failed_fetch)
print("Splits:", len(inside_urls))
print("Tree size:", total_size)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:01<00:00,  1.68it/s]
100%|██████████| 5/5 [00:06<00:00,  1.29s/it]
100%|██████████| 66/66 [00:23<00:00,  2.79it/s]


Failed Fetch: 0
Splits: 4
Tree size: 181


## Joining sub urls into one single list

In [3]:
website_urls = [item for sublist in list(inside_urls.values()) for item in sublist]
print(len(website_urls))


181


## Testing Date Fetching

Need to integrate Mongo 

In [4]:
fetch_date_from_url(website_urls[3])

('https://www.indiatoday.in/topic/israel', '12-09-2019')

## Creating {url : html content} dictionary

In [4]:
url_html_extracted = get_html(website_urls)
# url_html_extracted

100%|██████████| 181/181 [00:59<00:00,  3.03it/s]


## Keyword extraction performed on above dictionary

In [5]:
url_extracted_html = kep(website_content = url_html_extracted[0], keywords = keywords, filter_by_amount = 60)

# url_extracted_html

100%|██████████| 118/118 [00:00<00:00, 8663.04it/s]


## Converting dictionary to list of tuple pairs, for implementation of batches

In [6]:
content_list = [(key,value[:2000]) for key, value in url_extracted_html.items()] # 1000 is temporary until tokenier function is not set up
# content_list

## Cutting the above list fo batches of batch size MAX_CONTENT

In [7]:
MAX_CONTENT = 5

content_list_complete = []

iterations = len(content_list) // MAX_CONTENT


for i in range(iterations):
    sub_content_list = content_list[MAX_CONTENT * i: MAX_CONTENT * (i + 1)]
    content_list_complete.append(sub_content_list)

# Handle remaining elements after the loop
remaining_elements = content_list[MAX_CONTENT * iterations:]
if remaining_elements:
    iterations += 1
    content_list_complete.append(remaining_elements)

len(content_list_complete)

24

## Openai's api

### Non parallel execution of 1 api

In [21]:
from time import sleep
start = time()

question = "Summary of situation in gaza"

response_complete = ''
for data_idx in range(10):

    prompt = f""" 
        Data is in the form of tuples inside list: {content_list_complete[data_idx]} \n\n\n 
        Question: {question} \n\n\n
        Method of reply: 100 - 200 word sentences, clear reply,
        provide url if neccessary.
        """
    
    if data_idx % 6 == 0:
        sleep(20)

    response = get_completion(prompt)
    response_complete += response + "\n\n"
    print(f"Batch {data_idx + 1} out of {iterations} completed ")

end = time()

print(f"Executed in {end-start:.2f}s")
    

Batch 1 out of 24 completed 
Batch 2 out of 24 completed 
Batch 3 out of 24 completed 
Batch 4 out of 24 completed 
Batch 5 out of 24 completed 
Batch 6 out of 24 completed 
Batch 7 out of 24 completed 
Batch 8 out of 24 completed 
Batch 9 out of 24 completed 
Batch 10 out of 24 completed 
Executed in 216.86s


### Parallel Execution for 2 api

In [18]:
start = time()

complete_result_of_openai = ""
question_to_pass = "status of war in gaza"

count = 0

NUM_OF_API = 2

for i in range(0, 10, NUM_OF_API):

    if count % 6 == 0:
        sleep(20)
    

    result = ''

    result_queue1 = multiprocessing.Queue()
    result_queue2 = multiprocessing.Queue()


    process1 = multiprocessing.Process(target=gpt1, args=(question_to_pass, content_list_complete, i, result_queue1))
    process2 = multiprocessing.Process(target=gpt2, args=(question_to_pass, content_list_complete, i+1, result_queue2))


    # Start processes
    process1.start()
    process2.start()

    # Wait for processes to finish
    process1.join()
    process2.join()


    result1 = result_queue1.get()
    result2 = result_queue2.get()

    # Rest of your code remains unchanged
    result = result1 + "\n\n" + result2 + "\n\n"
    complete_result_of_openai += result

    print(f"Batch {i+1} - {i+NUM_OF_API} executed out of {len(content_list_complete)}")
    count += 1
complete_result_of_openai

end = time()

print(f"Executed in {end-start:.2f}s")


Batch 1 - 2 executed out of 24
Batch 3 - 4 executed out of 24
Batch 5 - 6 executed out of 24
Batch 7 - 8 executed out of 24
Batch 9 - 10 executed out of 24
Executed in 98.97s


In [22]:
216/98

2.204081632653061

In [17]:
response2 = get_completion(f"Provide Detailed Summary of {complete_result_of_openai}")
# response3

In [23]:
with open("Output_gaza_parallel.txt" , "w") as f:
    f.write(complete_result_of_openai)

In [18]:
with open("Output_gaza_parallel_summary.txt" , "w") as f:
    f.write(complete_result_of_openai)

In [63]:
with open("Output_gaza.txt" , "w") as f:
    f.write(response_complete)

In [66]:
with open("Output_gaza_summary.txt" , "w") as f:
    f.write(response2)

In [39]:
params = {
    "q": "cat",
    "api_key": 'd7555dcb6698cf4004b757eeade1ac86a425b101cb972691dc757282f353d9f6',
    "num": 1
}
search = GoogleSearch(params)
search.get_dict()

{'search_metadata': {'id': '65e3f179ca968fce70b728f5',
  'status': 'Success',
  'json_endpoint': 'https://serpapi.com/searches/a1f806d995d2e248/65e3f179ca968fce70b728f5.json',
  'created_at': '2024-03-03 03:41:45 UTC',
  'processed_at': '2024-03-03 03:41:45 UTC',
  'google_url': 'https://www.google.com/search?q=cat&oq=cat&num=1&sourceid=chrome&ie=UTF-8',
  'raw_html_file': 'https://serpapi.com/searches/a1f806d995d2e248/65e3f179ca968fce70b728f5.html',
  'total_time_taken': 1.17},
 'search_parameters': {'engine': 'google',
  'q': 'cat',
  'google_domain': 'google.com',
  'num': '1',
  'device': 'desktop'},
 'search_information': {'query_displayed': 'cat',
  'organic_results_state': 'Results for exact spelling'},
 'knowledge_graph': {'title': 'About Cat',
  'type': 'Animal',
  'kgmid': '/m/01yrx',
  'knowledge_graph_search_link': 'https://www.google.com/search?kgmid=/m/01yrx&hl=en-US&hl=Cat',
  'serpapi_knowledge_graph_search_link': 'https://serpapi.com/search.json?device=desktop&engine=g

In [2]:
from serpapi import GoogleSearch

from_date = '2019-02-22'
to_date = '2024-03-03'

params = {
  "q": 'Ambani Anant Ambani marriage site:http://indiatoday.in OR site:http://timesofindia.indiatimes.com',
  "tbm": "nws",
  "api_key": "d7555dcb6698cf4004b757eeade1ac86a425b101cb972691dc757282f353d9f6",
  "tbs": f"cdr:1,cd_min:{from_date.replace('-', '/')},cd_max:{to_date.replace('-', '/')}",
  'num' : 100
}

search = GoogleSearch(params)
results = search.get_dict()
news_results = results["news_results"]
news_results

KeyError: 'news_results'

In [3]:
results

{'search_metadata': {'id': '65e3fcbf477c0e45c476fabf',
  'status': 'Success',
  'json_endpoint': 'https://serpapi.com/searches/1aaeaac6d67b6acc/65e3fcbf477c0e45c476fabf.json',
  'created_at': '2024-03-03 04:29:51 UTC',
  'processed_at': '2024-03-03 04:29:51 UTC',
  'google_url': 'https://www.google.com/search?q=Ambani+Anant+Ambani+marriage+site%3Ahttp%3A%2F%2Findiatoday.in+OR+site%3Ahttp%3A%2F%2Ftimesofindia.indiatimes.com&oq=Ambani+Anant+Ambani+marriage+site%3Ahttp%3A%2F%2Findiatoday.in+OR+site%3Ahttp%3A%2F%2Ftimesofindia.indiatimes.com&num=100&tbm=nws&tbs=cdr:1,cd_min:2019/02/22,cd_max:2024/03/03&sourceid=chrome&ie=UTF-8',
  'raw_html_file': 'https://serpapi.com/searches/1aaeaac6d67b6acc/65e3fcbf477c0e45c476fabf.html',
  'total_time_taken': 0.83},
 'search_parameters': {'engine': 'google',
  'q': 'Ambani Anant Ambani marriage site:http://indiatoday.in OR site:http://timesofindia.indiatimes.com',
  'google_domain': 'google.com',
  'num': '100',
  'device': 'desktop',
  'tbm': 'nws',
 