Import the required packages

In [1]:
import requests
import re
import io
import os
import PyPDF2
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

Function to get the download link of the transcript

In [2]:
def download_transcript(course_url):
    '''Opens a course page in Chrome and returns the transcript's Google Drive link.

    Navigates to the downloads tab, opens the "Transcripts" section, picks the
    "English-Verified" language from the dropdown and reads the href of the
    first anchor that points at drive.google.com.

    Arguments:
    course_url -- the URL of the course page to open

    Returns:
    The href of the Google Drive download link

    Raises:
    selenium.common.exceptions.TimeoutException -- if any expected element
    does not show up within 10 seconds
    '''
    driver = webdriver.Chrome()
    try:
        driver.get(course_url)
        wait = WebDriverWait(driver, 10)
        # Go to Downloads pane
        downloads_tab = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//span[@class='tab'][contains(text(), 'downloads')]")))
        downloads_tab.click()
        # Click on Transcripts
        transcript_option = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//div[@class='assignments download-type']//h3[text()='Transcripts']")))
        transcript_option.click()
        # CSS selectors to select language
        language_dropdown = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "app-nptel-dropdown .pseudo-input")))
        language_dropdown.click()
        language_options = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, "app-nptel-dropdown .pseudo-options li")))
        # Select the "English-Verified" option. If it is not offered, nothing
        # is clicked and whatever link the page shows is used.
        for option in language_options:
            if option.text == "English-Verified":
                option.click()
                break
        # Wait for the download link instead of looking it up immediately, so a
        # link rendered shortly after the language click is not missed.
        download_link = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//a[contains(@href, 'drive.google.com')]")))
        # Read the URL before the browser is closed; the element is unusable afterwards.
        download_url = download_link.get_attribute("href")
        return download_url
    finally:
        # Always shut the browser down, even on a timeout, so repeated runs
        # do not leave Chrome/chromedriver processes behind.
        driver.quit()

Function to obtain the file ID to process the PDF

In [3]:
def extract_file_id(download_url):
    '''Extracts the file id from a Google Drive URL

    Handles share links of the form ".../d/<id>/..." as well as query-style
    links such as ".../uc?id=<id>" or ".../open?id=<id>".

    Arguments:
    download_url -- the URL of the Google Drive file

    Returns:
    The unique identifier of the file
    '''
    # Share/view links carry the id right after a "/d/" path segment.
    path_match = re.search(r"/d/([A-Za-z0-9_-]+)", download_url)
    if path_match:
        return path_match.group(1)

    # Direct-download and "open" links carry the id as an "id" query parameter.
    query_match = re.search(r"[?&]id=([A-Za-z0-9_-]+)", download_url)
    if query_match:
        return query_match.group(1)

    # Unknown shape: fall back to the second-to-last path segment.
    file_id = download_url.split("/")[-2]
    return file_id

Function to scrape the text of the PDF from Google Drive and save it to a .txt file.

In [4]:
def scrape_pdf_from_google_drive(file_id, save_path=None):
    '''Scrapes text from a PDF in Google Drive without saving the PDF itself.

    The PDF is fetched into memory, parsed with PyPDF2, and the text of all
    pages is joined with newlines.

    Arguments:
    file_id -- the unique identifier of the PDF file in Google Drive
    save_path -- optional directory; when given, the extracted text is also
                 written to "<save_path>/<file_id>.txt" (the directory is
                 created if it does not exist)

    Returns:
    The text content of the PDF file

    Raises:
    requests.HTTPError -- if Google Drive answers with an error status
    '''
    # Replace file_id in the URL to obtain the direct download URL of the PDF file
    pdf_url = f"https://drive.google.com/u/0/uc?id={file_id}"

    # Get the content of the PDF file. The timeout keeps a stalled connection
    # from hanging the notebook; the status check surfaces HTTP errors here
    # rather than as an obscure PDF parsing failure further down.
    response = requests.get(pdf_url, timeout=30)
    response.raise_for_status()
    content = response.content

    # Wrap the content in a BytesIO object
    pdf_file = io.BytesIO(content)

    # Read the content into a PyPDF2 PDF object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Extract the text from every page; `or ""` guards against a page for
    # which the extractor yields nothing, so the join cannot fail.
    pages = [page.extract_text() or "" for page in pdf_reader.pages]
    pdf_content = "\n".join(pages)

    if save_path:
        os.makedirs(save_path, exist_ok=True)
        file_name = file_id
        save_path = os.path.join(save_path, file_name + ".txt")
        # NOTE: written with the platform default encoding so that readers
        # which open the file with the default encoding get the same text back.
        with open(save_path, "w") as f:
            f.write(pdf_content)

    return pdf_content

Calling the Function

In [5]:
# Run the pipeline end to end: ask for the course page, locate the transcript
# link, derive the Drive file id, then extract the text and save it.
course_url = input("Course URL: ")
download_url = download_transcript(course_url)
file_id = extract_file_id(download_url)
print("File Name : ", file_id)
pdf_content = scrape_pdf_from_google_drive(
    file_id,
    save_path="C:/Users/MANOJ/Desktop/AI4B/",
)

Course URL: https://nptel.ac.in/courses/106106184
File Name :  1wuZcBU6ZkRI9QEOFIbDJjjEPcmiNBPZU


In [6]:
import string
from num2words import num2words

# Define the path to the txt file
file_path = r"C:/Users/MANOJ/Desktop/AI4B/1wuZcBU6ZkRI9QEOFIbDJjjEPcmiNBPZU.txt"

# The first 5 lines of the transcript file do not contain any spoken words,
# so they are removed.
HEADER_LINES = 5

# Read the text from the file
with open(file_path, "r") as file:
    lines = file.readlines()

# This cell overwrites its own input. A file that has already been processed
# is a single line, so running the cell again would drop everything and write
# an empty file. Refuse instead of silently destroying the transcript.
if len(lines) <= HEADER_LINES:
    raise ValueError(
        f"{file_path} has only {len(lines)} line(s); it looks empty or already "
        "processed, refusing to overwrite it."
    )

text = ''.join(lines[HEADER_LINES:])

# Convert all text to lowercase
text = text.lower()

# Remove all punctuations
punctuation_table = str.maketrans("", "", string.punctuation)
text = text.translate(punctuation_table)

# Split the text into words
words = text.split()

# Replace all numbers with their spoken form.
# - isdecimal() rather than isdigit(): isdigit() is also true for characters
#   such as superscript digits, which int() cannot parse.
# - num2words output can contain hyphens and commas (e.g. "twenty-one"); turn
#   hyphens into spaces and drop the rest so the text stays punctuation-free.
words = [
    num2words(int(word)).replace("-", " ").translate(punctuation_table)
    if word.isdecimal()
    else word
    for word in words
]

# Join the words back into a sentence
text = " ".join(words)

# Write the modified text back to the file
with open(file_path, "w") as file:
    file.write(text)

print("Text processing complete!")

Text processing complete!
