### Importing all Needed Libraries

In [1]:
# Importing Selenium Related Dependencies
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# For Editing various components of URLs
import time
import re
import random
import os
from datetime import datetime, timedelta
# For Accessing Gemini
import google.generativeai as genai
# For Reminder as task was long
import winsound

# Creating Functions

## Extract Article URLS from TOI Archives

 Article URLs are listed in the archive on a day-by-day basis, and each day's list can be reached through a URL that refers to that particular date.
 Eg URL- https://timesofindia.indiatimes.com/2011/2/3/archivelist/year-2011,month-2,starttime-40577.cms
 The basic structure is consistent; the only parts that change are "/2011/2/3/" and "year-2011,month-2,starttime-40577". Both can be calculated in Python for a specific date: starttime is 37622 on 1 Jan 2003 and increments by 1 for each day. These properties can be used to build the date-wise URL of the page that lists the articles published on that day.

In [2]:
# Here I have created a function that generates the article links archive for a random date between a range
def generate_random_url(reference_date_str="01/01/2003", base_start_time=37622,
                        start_date_str="01/01/2003", end_date_str="31/12/2023"):
    """Build the TOI archive-list URL for a random date inside a date range.

    Args:
        reference_date_str: Date ("dd/mm/YYYY") whose archive ``starttime``
            value is ``base_start_time``.
        base_start_time: ``starttime`` value of the reference date; the value
            grows by one for every following day.
        start_date_str: First date ("dd/mm/YYYY") that may be picked (inclusive).
        end_date_str: Last date ("dd/mm/YYYY") that may be picked (inclusive).

    Returns:
        The archive-list URL (str) of the randomly chosen date.

    Raises:
        ValueError: If a date string does not match "dd/mm/YYYY" or the end
            date is earlier than the start date.
    """
    date_format = "%d/%m/%Y"
    start_date = datetime.strptime(start_date_str, date_format)
    end_date = datetime.strptime(end_date_str, date_format)  # Range of dates
    if end_date < start_date:
        raise ValueError("end_date_str must not be earlier than start_date_str")

    # Pick a day offset uniformly; "+ 1" makes the end date itself selectable.
    span_days = (end_date - start_date).days
    random_date = start_date + timedelta(days=random.randrange(span_days + 1))

    # starttime is the reference value shifted by the distance (in days)
    # between the picked date and the reference date.
    reference_date = datetime.strptime(reference_date_str, date_format)
    start_time = base_start_time + (random_date - reference_date).days

    url = (
        f"https://timesofindia.indiatimes.com/{random_date.year}/{random_date.month}/{random_date.day}"
        f"/archivelist/year-{random_date.year},month-{random_date.month},starttime-{start_time}.cms"
    )
    return url  # Returning the generated URL

Now that we have the URL, we can extract the article links from it. The structure of each archive page is similar, but the page contains a lot of unrelated fluff, such as links to ads, which can distract our program. To handle this we use the XPath of the table where the relevant links begin and extract only the article links for that date.
The article page also contains a lot of ads that could cause errors in our extraction. When doing this manually we would click the print icon on the page, which redirects us to a cleaner page that only has the headline, source, date and article. This can also be done programmatically by changing "articleshow" to "articleshowprint" in the article's URL.
Eg URL- https://timesofindia.indiatimes.com//life-style/spotlight/when-flamenco-meets-bharatanatyam/articleshow/7354258.cms
to https://timesofindia.indiatimes.com//life-style/spotlight/when-flamenco-meets-bharatanatyam/articleshowprint/7354258.cms
We extract the top 5 articles from that table to save time; this can be changed if needed.

In [3]:
def scrape_and_modify_urls(initial_url,driver,max_links=5):
    """Collect print-view URLs of the first articles listed on an archive page.

    Args:
        initial_url: Archive-list URL for one date (see ``generate_random_url``).
        driver: Selenium WebDriver used to load the page.
        max_links: Maximum number of article URLs to return (default 5).

    Returns:
        A list with at most ``max_links`` article URLs in which the
        ``articleshow`` path segment is replaced by ``articleshowprint``.
        The list is empty when the table holds no matching link.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the table does not show
        up within 10 seconds.
    """
    driver.get(initial_url) # Initial URL is the date URL generated above
    table_section_xpath = '/html/body/div[1]/table[2]/tbody/tr[2]/td[1]/div[3]/table/tbody'
    table_section = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, table_section_xpath))
    ) # explicit wait so slow connections do not break the lookup

    # The table should only contain article links; the pattern is a safety check.
    article_pattern = re.compile(r'http[s]?://.*articleshow/\d+.*')
    # Only rewrite the path segment that precedes the numeric article id, so a
    # headline slug that happens to contain "articleshow" is left untouched.
    segment_pattern = re.compile(r'articleshow(?=/\d)')

    modified_urls = [] # list to store the article print URLs
    for link in table_section.find_elements(By.TAG_NAME, 'a'):
        if len(modified_urls) >= max_links:
            break
        href = link.get_attribute('href')
        if not href or not article_pattern.match(href):
            continue
        modified_urls.append(segment_pattern.sub('articleshowprint', href))
    return modified_urls

Now that we have the URLs of the actual articles in the correct format, we can perform extraction on them. Since the format of the print pages has been observed to be consistent, we can use XPaths to navigate to each section and extract its content. We return the extracted content in the form of a dictionary.
In the function below we call two other functions, defined later in the code, to put the date into the right format and to generate a summary of the extracted article. We also return a flag that tells us the status of the summary; it is used to accept the result or repeat the process.

In [4]:
def extract_content(url,flag,driver):
    """Load one print-view article and gather everything the form needs.

    Args:
        url: Print-view article URL (one element of the scraped URL list).
        flag: Error indicator handed through ``format_date`` and
            ``generate_summary_and_topic``; either of them may change it.
        driver: Selenium WebDriver used to load the page.

    Returns:
        A tuple ``(record, flag)`` where ``record`` is a dict with the keys
        'headline', 'date', 'article', 'summary' and 'topic', and ``flag`` is
        the value returned by ``generate_summary_and_topic``.
    """
    def text_at(xpath):
        # Explicit wait (10 s) before reading the element's text, to avoid
        # failing while the page is still loading.
        located = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        return located.text

    driver.get(url)
    headline = text_at('/html/body/div[1]/div[2]/section/h1')
    raw_date = text_at('/html/body/div[1]/div[2]/section/div')

    # format_date (defined further down) reorders the date for the form and
    # reports a parsing problem through the flag.
    formatted_date, flag = format_date(raw_date, flag)

    # The article body is spread over several "section" containers; every
    # "Normal" div inside them contributes one cleaned piece of text.
    sections = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, '//div[contains(@class, "section")]'))
    )
    pieces = [
        clean_text(block.text)
        for section in sections
        for block in section.find_elements(By.XPATH, './/div[contains(@class, "Normal")]')
    ]
    article_text = ' '.join(pieces)

    # generate_summary_and_topic (defined further down) produces the summary
    # and topic and reports an unusable answer through the flag.
    summary, topic, flag = generate_summary_and_topic(article_text, flag)

    record = {
        'headline': headline,
        'date': formatted_date,
        'article': article_text,
        'summary': summary,
        'topic': topic
    }
    return record, flag

Here we define the summary generation function, which uses the free API of Google's LLM Gemini. The article is provided to the model as a prompt and a summary is requested. From this summary, a list of broad topics is then requested. LLMs generally do better when a given problem is broken into several parts and are less likely to give an erroneous response. Given the generated list of broad topics, the LLM is asked which specific topic from a fixed list (the options present in the Google Form) matches the most; this is asked with a very specific prompt. If the response doesn't match (the response is not in the topic list), we ask again so it can rectify the answer (to save us time). If the response is still erroneous, the flag is set to one; the main function detects this and restarts the entire iteration, to prevent submitting an erroneous response to the form. Your margin of safety may vary.

In [5]:
def generate_summary_and_topic(article_text,flag):
    """Summarise an article with Gemini and map it to one of the form topics.

    The model is queried in steps: summary of the article, broad topics of
    that summary, then the single best match from the fixed topic list. If
    the answer is not in the list the question is asked once more; if it is
    still not in the list, ``flag`` is set to 1.

    Args:
        article_text: Cleaned article text.
        flag: Incoming error indicator; returned unchanged unless no valid
            topic could be obtained.

    Returns:
        A tuple ``(summary, topic, flag)``: the summary passed through
        ``clean_text``, the matched topic (or the model's last answer when
        nothing matched) and the possibly updated flag.
    """
    model = genai.GenerativeModel("gemini-pro") # selecting model from api

    # Same settings for every request: no blocking in any of the four categories.
    safety_settings = [
        {'category': category, 'threshold': 'block_none'}
        for category in (
            'HARM_CATEGORY_HARASSMENT',
            'HARM_CATEGORY_SEXUALLY_EXPLICIT',
            'HARM_CATEGORY_DANGEROUS_CONTENT',
            'HARM_CATEGORY_HATE_SPEECH',
        )
    ]
    topics = ['Politics', 'International News', 'National News', 'Local News', 'Business and Finance', 'Science and Technology', 'Health and Wellness', 'Entertainment', 'Sports', 'Lifestyle and Features', 'Opinion and Editorial', 'Environment', 'Education', 'Crime and Justice', 'Human Interest', 'Obituaries', 'Weather', 'Religion and Spirituality', 'Technology and Gadgets', 'Automotive']
    canonical = {name.lower(): name for name in topics}

    def ask(prompt):
        # One round-trip to the model; returns the plain text of the answer.
        return model.generate_content(prompt, safety_settings=safety_settings).text

    def match_topic(answer):
        # The model often wraps the answer in whitespace, quotes or markdown
        # emphasis; compare without that decoration and ignore letter case so
        # a correct answer is not rejected for formatting reasons.
        cleaned = answer.strip().strip('\'"`*.').strip()
        return canonical.get(cleaned.lower(), cleaned)

    # Summary of article (separator keeps the instruction apart from the text)
    summary_ = ask("generate summary of article in strictly 50-200 words: " + article_text)

    # Broad topics in article
    broad_topics = ask("Identify the broad topics in this summary: " + summary_)

    # Actual topic from the list using those broad topics
    topic = match_topic(ask(
        "which single word from the list " + str(topics) + " does thw word : "
        + broad_topics + " relate to the most if the word exists in list return it back"
    ))

    if topic not in topics:
        # One retry: if it succeeds we save the 10-15 seconds a fresh extraction would cost.
        topic = match_topic(ask(
            "which word from the list " + str(topics) + " does thw word : "
            + topic + " relate to the most if the word exists in list return it back"
        ))

        if topic not in topics:
            flag=1 # tells the main loop the generated answer is unusable and the iteration must be discarded

    return clean_text(summary_), topic, flag

Now that we have our data in the form of a dictionary, we can submit it to our Google Form. We will again use the Selenium library, with the help of XPaths, to identify the input boxes, send keys to each input box and submit our response. Modify this code according to your needs. I have provided screenshots of the form I use, and also a template you can copy to test it yourself. I have closed the form, so don't spam me.

In [6]:
def fill_google_form(d,driver):
    """Open the Google Form and type one extracted article into its fields.

    Args:
        d: Dict with the keys 'date', 'headline', 'article', 'summary' and
            'topic' (the record built by ``extract_content``).
        driver: Selenium WebDriver used to operate the form.

    The form is filled but not submitted here; the topic radio button is
    handled by ``radiobutton``.
    """
    def focus(xpath):
        # Locate an input by XPath and click it so it is ready for typing.
        box = driver.find_element(By.XPATH, xpath)
        box.click()
        return box

    # Open the form and wait (up to 10 s) until an input element is present.
    driver.get("YOUR GOOGLE FORM")
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "input")))

    # Question 1: submitter name
    focus('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[1]/div/div[1]/input').send_keys("YOUR NAME")

    # Question 2: newspaper name
    focus('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/input').send_keys("The Times of India")

    # Question 3: date (found by CSS selector and typed into without a click)
    driver.find_element(By.CSS_SELECTOR, 'input[type="date"]').send_keys(d['date'])

    # Question 4: extracted headline
    focus('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[4]/div/div/div[2]/div/div[1]/div/div[1]/input').send_keys(d['headline'])

    # Question 5: article content
    focus('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[5]/div/div/div[2]/div/div[1]/div[2]/textarea').send_keys(d['article'])

    # Question 6: generated summary
    focus('//*[@id="mG61Hd"]/div[2]/div/div[2]/div[6]/div/div/div[2]/div/div[1]/div[2]/textarea').send_keys(d['summary'])

    # Question 7: topic radio button, selected by the separate radiobutton function
    radiobutton(d['topic'],driver)

    

In [7]:
def radiobutton(topic,driver):
    """Select the topic radio button, falling back to the "other" option.

    Args:
        topic: Topic text to select.
        driver: Selenium WebDriver with the form already loaded.

    When ``topic`` is one of the known options its radio button is clicked.
    Otherwise the element with id 'i93' is clicked and ``topic`` is typed
    into the accompanying text input.
    """
    # Element IDs of the radio buttons, keyed by the topic they stand for.
    radiobuttons = {
        'Politics': 'i30', 'International News': 'i33', 'National News': 'i36', 'Local News': 'i39',
        'Business and Finance': 'i45', 'Science and Technology': 'i48', 'Health and Wellness': 'i51',
        'Entertainment': 'i54', 'Sports': 'i57', 'Lifestyle and Features': 'i60', 'Opinion and Editorial': 'i63',
        'Environment': 'i66', 'Education': 'i69', 'Crime and Justice': 'i72', 'Human Interest': 'i75',
        'Obituaries': 'i78', 'Weather': 'i81', 'Religion and Spirituality': 'i84',
        'Technology and Gadgets': 'i87', 'Automotive': 'i90'
    }

    button_id = radiobuttons.get(topic)
    if button_id is not None:
        # Known topic: a single click is all that is needed.
        driver.find_element(By.ID, button_id).click()
        return

    # Unknown topic: pick the fallback option and type the topic next to it.
    driver.find_element(By.ID, 'i93').click()
    free_text = driver.find_element(By.XPATH, '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[7]/div/div/div[2]/div/div/span/div/div[22]/div/span/div/div/div[1]/input')
    free_text.send_keys(topic)

Below are some auxiliary functions, implemented so that the main functions above can call and use them as needed.

In [8]:
def clean_text(text):
    """Reduce scraped text to a single line of plain ASCII characters.

    Newlines and tabs become spaces, typographic quotes and dashes are mapped
    to their ASCII counterparts, and every character outside letters, digits,
    space and ``. , ; : ' " ! ? ( ) -`` is removed.

    Args:
        text: Raw text taken from the page or the model.

    Returns:
        The cleaned text (str).
    """
    # Map look-alike characters first; deleting them outright would turn
    # "don’t" into "dont" and glue together words separated by a dash.
    ascii_equivalents = str.maketrans({
        '\n': ' ', '\t': ' ', '\u00a0': ' ',
        '\u2018': "'", '\u2019': "'",
        '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-',
    })
    text = text.translate(ascii_equivalents)
    # Drop whatever is still outside the allowed character set.
    return re.sub(r'[^a-zA-Z0-9.,;:\'"!?() -]', '', text)

In [9]:
def format_date(date_string, flag ):
    """Convert the article's date line into the "mm/dd/YYYY" form-input format.

    Everything up to and including the last ' | ' separator (e.g. the news
    agency name) and any surrounding whitespace is dropped before parsing
    with the pattern "%b %d, %Y, %I.%M %p IST".

    Args:
        date_string: Date line as shown on the page.
        flag: Incoming error indicator.

    Returns:
        ``(formatted_date, flag)`` on success. When the text cannot be
        parsed, the text with its prefix removed is returned together with
        a flag of 1.
    """
    date_format_in = "%b %d, %Y, %I.%M %p IST"
    date_format_out = "%m/%d/%Y"

    # Keep only what follows the last ' | ' and trim stray whitespace or
    # newlines, which would otherwise make strptime fail.
    date_string = date_string.rsplit(' | ', 1)[-1].strip()

    try:
        date_obj = datetime.strptime(date_string, date_format_in)
    except ValueError:
        flag=1 # tell the caller the date could not be understood
        return date_string , flag

    return date_obj.strftime(date_format_out), flag

With this we have defined all the functions needed to run the program. Now we will write the main function, which limits the scope of the variables so that they are not global, to avoid errors.

In [10]:
def main():
    """Scrape random articles and submit them to the form until max_count is reached.

    Each iteration picks a random archive date, scrapes its article links,
    extracts and summarises one of them and submits the form. Iterations
    whose date or summary was flagged as unusable are discarded; any other
    exception is recorded and the browser is restarted. After the loop the
    totals are printed and the machine is shut down.
    """
    # setup gemini and driver
    # The key can be supplied through the GOOGLE_AI_KEY environment variable
    # so it does not have to be written into the notebook.
    GOOGLE_AI_KEY = os.environ.get('GOOGLE_AI_KEY', 'YOUR API KEY')
    genai.configure(api_key=GOOGLE_AI_KEY)

    def start_driver():
        # Fresh Chrome session; also used to recover after an error.
        return webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    driver = start_driver()

    # count successful submits
    count=0
    # Count errors encountered in generated content or date formatting
    hallucinate=0
    # count any other error encountered
    p=0
    # detect errors and restart the loop
    flag=0
    # Store URLs that have caused any form of error for future diagnostics
    error_urls={}
    # number of entries on server
    max_count=1000

    try:
        while True: # Loop till a stopping condition is reached.
            # Reset per iteration so the error handler never reads a missing
            # or stale URL from an earlier iteration.
            initial_url = None
            url_select = None
            try:
                initial_url = generate_random_url() # Generating date archive URL

                urls_to_scrape = scrape_and_modify_urls(initial_url,driver) # List of articles from that URL

                url_select=random.choice(urls_to_scrape) # Selecting 1 random article from the list.

                content,flag = extract_content(url_select,flag,driver) # extracting and adding generated content

                if flag==1: # Error detected
                    flag=0 # set flag to zero for next iteration
                    hallucinate+=1
                    error_urls[url_select]="Function Error at Count :"+ str(count) # storing the erroneous URL
                    driver.quit() # restart the driver for being error free just in case
                    driver = start_driver()
                    continue

                fill_google_form(content,driver) # filling up the google form with details

                submit=driver.find_element(By.XPATH,'//*[@id="mG61Hd"]/div[2]/div/div[3]/div[1]/div[1]/div')
                submit.click() # submitting the google form

                count+=1 # adding in count of successful submissions

                if count % 50 == 0: # Notify user at every 50 successful submissions
                    print("Number of submissions done till now:",count)
                    winsound.Beep(300,300)
                if count % 100 ==0: # Dump error dictionary into a file every 100 submissions
                    # file.write accepts a single string; the with-block closes the file.
                    with open("Dictionary_dump.txt","a") as file:
                        file.write("Errors till Count: " + str(count) + "\n")
                        file.write(str(error_urls)+"\n\n")
                    error_urls.clear()
                if count==max_count: # exit loop when number of submissions reached
                    break
            except Exception as e:
                # Fall back to the archive URL when no article was chosen yet.
                failed_url = url_select or initial_url or "unknown"
                error_urls[failed_url]="Error Code:"+str(e)
                p+=1
                driver.quit() # restart the driver for being error free just in case
                driver = start_driver()
                continue
    finally:
        # Close the browser exactly once, when the loop is left for any reason
        # (quitting inside the loop would break every following iteration).
        try:
            driver.quit()
        except Exception:
            pass # best-effort cleanup: the session may already be gone

    winsound.Beep(2400,500)
    print("You are Finished")
    print("Number of submit:",count)
    print("Number of General Errors:",p)
    print("Number of Errors due to Undesired output:",hallucinate)
    os.system('shutdown /s /t 1') # shutdown after running overnight
            
                
        

Our code is done. Since this code was designed to be run overnight, a lot of focus has been put on error management and safe handling of the generated content. Now we can reap the fruits of our labour. Ensure you replace placeholders like "Doc URL" and "GOOgle_api key" with the correct values according to your personal needs.

In [11]:
# Entry point: start the scrape-and-submit loop only when this code is run
# directly. Note that main() ends by shutting the machine down.
if __name__ == "__main__": #calling the main function
    main()

KeyboardInterrupt: 