In [71]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import string
from duckduckgo_search import DDGS
import certifi
from sklearn.feature_extraction.text import TfidfVectorizer
from requests.exceptions import Timeout, RequestException
import numpy as np

In [2]:
# Download the NLTK resources the functions below rely on: the tokenizer
# models used by word_tokenize, the English stop-word list, and WordNet for
# the lemmatizer.
# Recent NLTK releases load 'punkt_tab' instead of 'punkt' inside
# word_tokenize, so both are fetched to keep a fresh-kernel run working on
# either an older or a current NLTK install.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [58]:
def fetch_search_results(query, count=10):
    """Run a DuckDuckGo text search and return the URLs of the hits.

    Args:
        query: Search string handed to ``DDGS().text``.
        count: Value passed as ``max_results`` to the search call.

    Returns:
        A list with the ``'href'`` entry of every result, in the order the
        search returned them.
    """
    hits = DDGS().text(query, max_results=count)
    links = []
    for hit in hits:
        links.append(hit['href'])
    return links

def fetch_webpage(url, timeout_value = 5, headers=None):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout_value: Timeout in seconds passed to ``requests.get``.
        headers: Optional dict of HTTP headers. When omitted, a descriptive
            ``User-Agent`` is sent instead of the python-requests default.

    Returns:
        The decoded response body for any successful (non-4xx/5xx) response,
        or ``None`` when the request timed out or failed in any other way.
        Failures are reported with ``print`` rather than raised.
    """
    if headers is None:
        # NOTE(review): some of the 403/406 responses seen when running this
        # notebook may come from servers rejecting the default
        # "python-requests/x.y" agent string -- confirm against the target sites.
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; common-terms-notebook/1.0)"
        }

    try:
        response = requests.get(
            url,
            timeout=timeout_value,
            headers=headers,
            # Verify TLS certificates against the certifi CA bundle instead of
            # disabling verification; a bad certificate now surfaces as an
            # SSLError, which is a RequestException and is handled below.
            verify=certifi.where(),
        )
        # Raises HTTPError (a RequestException) for 4xx/5xx responses, so no
        # separate status-code check is needed afterwards.
        response.raise_for_status()
        return response.text
    except Timeout:
        print(f"The request timed out after {timeout_value} seconds")
        return None
    except RequestException as e:
        print(f"An error occurred: {e}")
        return None

def extract_text_from_webpage(html):
    """Return the text of every ``<p>`` element in ``html``, joined by spaces.

    Args:
        html: HTML markup, parsed with BeautifulSoup's ``'html.parser'``.

    Returns:
        A single string; empty when the markup contains no ``<p>`` elements.
    """
    parsed = BeautifulSoup(html, 'html.parser')
    return ' '.join(node.get_text() for node in parsed.find_all('p'))

def preprocess_text(text, tfidf=False):
    """Tokenize ``text`` and keep only lemmatized, non-stop-word tokens.

    Tokens that are not purely alphabetic, or whose lowercase form is an
    NLTK English stop word, are dropped. The remaining tokens are lowercased
    and lemmatized with ``WordNetLemmatizer``.

    Args:
        text: Raw text to process.
        tfidf: When truthy, return the tokens joined into one space-separated
            string; otherwise return them as a list.

    Returns:
        A ``str`` or a ``list`` of ``str``, depending on ``tfidf``.
    """
    english_stops = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    cleaned = []
    for raw in word_tokenize(text):
        # Skip punctuation, numbers and mixed tokens.
        if not raw.isalpha():
            continue
        lowered = raw.lower()
        if lowered in english_stops:
            continue
        cleaned.append(lemmatize(lowered))

    return ' '.join(cleaned) if tfidf else cleaned


def get_term_frequencies(tokens):
    """Count how often each token occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A ``collections.Counter`` mapping each token to its occurrence count.
    """
    frequencies = Counter()
    frequencies.update(tokens)
    return frequencies

def fetch_and_extract_text(urls):
    """Fetch each URL and collect the paragraph text of the pages that load.

    Args:
        urls: Iterable of URLs passed one by one to ``fetch_webpage``.

    Returns:
        A list with one extracted text per page that was fetched. URLs for
        which ``fetch_webpage`` returns a falsy value (``None`` or an empty
        body) are skipped, so the list may be shorter than ``urls``.
    """
    collected = []
    for link in urls:
        page = fetch_webpage(link)
        if not page:
            continue
        collected.append(extract_text_from_webpage(page))
    return collected

In [54]:
def extract_common_terms(query, nb_pages = 20, max_features = 100, file_name='common_terms.txt'):
    """Write the most frequent terms found on the search results for ``query``.

    The pages returned by the search are downloaded, their paragraph text is
    concatenated and preprocessed, and the ``max_features`` most frequent
    tokens are written to ``file_name``, one term per line, most frequent
    first.

    Args:
        query: Search string passed to ``fetch_search_results``.
        nb_pages: Number of search results to request.
        max_features: Number of top terms to keep.
        file_name: Path of the output file. Defaults to the previously
            hard-coded ``'common_terms.txt'``.

    Returns:
        None. The result is written to ``file_name``.
    """
    urls = fetch_search_results(query, nb_pages)
    # Reuse the shared fetch/extract helper instead of repeating its loop.
    texts = fetch_and_extract_text(urls)

    tokens = preprocess_text(" ".join(texts))
    common_terms = get_term_frequencies(tokens).most_common(max_features)

    # Explicit UTF-8: isalpha() accepts non-ASCII letters, which the platform
    # default encoding (e.g. cp1252 on Windows) may be unable to write.
    # The context manager closes the file even if the write fails.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write("\n".join(term for term, _ in common_terms))

In [75]:
def extract_common_terms_tfidf(query, file_name='common_terms_tfidf.txt', nb_pages = 20, max_features = 100):
    """Rank terms from the search results for ``query`` by summed TF-IDF.

    Each fetched page is treated as one document. The documents are
    preprocessed, vectorized with ``TfidfVectorizer`` (limited to
    ``max_features`` terms), and every term's TF-IDF weights are summed over
    all documents. The terms are written to ``file_name`` as ``term,value``
    lines in descending order of that sum.

    Args:
        query: Search string passed to ``fetch_search_results``.
        file_name: Path of the output file.
        nb_pages: Number of search results to request.
        max_features: Maximum vocabulary size kept by the vectorizer.

    Returns:
        None. The result is written to ``file_name``.

    Raises:
        ValueError: If no page yielded any usable text, so there is nothing
            to vectorize.
    """
    urls = fetch_search_results(query, nb_pages)
    texts = fetch_and_extract_text(urls)
    documents = [preprocess_text(text, True) for text in texts]

    # Fail early with a clear message; otherwise the vectorizer raises a
    # less helpful "empty vocabulary" ValueError when every fetch failed or
    # no page contained paragraph text.
    if not any(documents):
        raise ValueError(
            f"No usable text could be extracted from the search results for {query!r}"
        )

    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    X = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Sum each term's TF-IDF weight over all documents (column-wise).
    aggregated_tfidf = np.sum(X.toarray(), axis=0)

    # Pair every term with its aggregated weight and sort by weight, highest first.
    sorted_term_tfidf_pairs = sorted(
        zip(feature_names, aggregated_tfidf),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Explicit UTF-8 so non-ASCII terms cannot break the write on platforms
    # with a narrower default encoding; the context manager closes the file
    # even if a write fails.
    with open(file_name, 'w', encoding='utf-8') as file:
        for term, tfidf_value in sorted_term_tfidf_pairs:
            file.write(f"{term},{tfidf_value}\n")

In [77]:
# Search phrase used for this run; also reused by later cells.
query = "certification process car brake"

# Request 50 search results and keep at most 500 TF-IDF features.
extract_common_terms_tfidf(
    query,
    file_name='common_terms_tfidf.txt',
    nb_pages=50,
    max_features=500,
)



An error occurred: 403 Client Error: Forbidden for url: https://www.indeed.com/career-advice/career-development/mechanic-certification




An error occurred: 406 Client Error: Not Acceptable for url: https://www.smartautotraining.com/brake-and-lamp-training/




An error occurred: 406 Client Error: Not Acceptable for url: https://mymechanic.net/2021/02/02/the-real-truth-behind-ase-certified-mechanics-and-why-it-matters/




An error occurred: 403 Client Error: Forbidden for url: https://www.brakeandfrontend.com/brakes-101/




An error occurred: 403 Client Error: Forbidden for url: https://www.tripadvisor.co.uk/ShowTopic-g187191-i1308-k7750539-Car_hire-Rouen_Seine_Maritime_Haute_Normandie_Normandy.html
An error occurred: 403 Client Error: Forbidden for url: https://www.tripadvisor.com/ShowTopic-g187179-i607-k7644978-Caen_or_Rouen_for_car_rental-Normandy.html




An error occurred: 403 Client Error: Forbidden for url: https://www.nytimes.com/2024/07/19/technology/microsoft-crowdstrike-outage-what-happened.html




An error occurred: 406 Client Error: Not Acceptable for url: http://all-car.com/full-service-brake-light-inspection-center/




The request timed out after 5 seconds




An error occurred: 403 Client Error: Forbidden for url: https://parmaccs.com/blog/the-definitive-guide-to-proper-brake-repair-and-maintenance/




An error occurred: 403 Client Error: Forbidden for url: https://brakestogo.com/how-brakes-to-go-works/learn-about-brakes/




An error occurred: 403 Client Error: Forbidden for url: https://quizlet.com/815991207/brakes-certification-flash-cards/




An error occurred: 403 Client Error: Forbidden for url: https://www.caranddriver.com/shopping-advice/a15102312/what-you-need-to-know-about-certified-pre-owned-cpo-car-programs-feature/




An error occurred: 403 Client Error: Forbidden for url: https://www.miramesaauto.com/brake-lamp-certification-san-diego/




An error occurred: 403 Client Error: Forbidden for url: https://quizlet.com/610798328/brakes-certification-flash-cards/




An error occurred: 403 Client Error: Forbidden for url: https://www.michigan.gov/sos/all-services/mechanic-testing


In [78]:
# Run the search again and list the URLs it returns, one per line.
# This is a fresh query, so the list may differ from the one used by the
# extraction run above.
urls = fetch_search_results(query, count=50)
for url in urls:
    print(url)

https://www.technologyed.com/courses/automotive-brakes-ase-a5-online-training-certification-course/
https://work.chron.com/certified-brake-technician-14798.html
https://www.ase.com/test-series
https://obdforcar.com/get-ase-certified-in-brakes/
https://www.tiresplus.com/auto-repair-services/brake-service/technicians-and-process/
https://www.traininthefastlane.com/training/brake-system-training/
https://www.indeed.com/career-advice/career-development/mechanic-certification
https://www.ase.com/dist/docs/Auto-Study-Guide_.pdf
https://www.smartautotraining.com/brake-and-lamp-training/
https://www.baker.edu/academics/affiliated-institutions/auto-diesel-institute-of-michigan/programs-at-adi/automotive-services-technology-certifcate/
https://www.wagnerbrake.com/technical/garage-gurus.html
https://www.brakeandfrontend.com/brakes-101/
https://www.youtube.com/watch?v=ZB6faF8uiTQ
https://www.caranddriver.com/features/a15089224/certified-pre-owned-cpo-vehicle-inspections-explained-feature/
https://