In [7]:
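# Chrome pop-up settings page: chrome://settings/content/popups
# NOTE(review): presumably pop-ups must be allowed because the scraper opens
# product images in new tabs via window.open() - confirm.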

In [8]:
import undetected_chromedriver as uc 
import requests
import os
import re
import uuid
import openai
import random
from zenrows import ZenRowsClient
from bs4 import BeautifulSoup
from flask import render_template, Flask
from selenium.webdriver.common.by import By 
from datetime import datetime, timedelta
import time

# Flask app used only to provide an application context for render_template.
app = Flask(__name__)

# Credentials are read from the environment when set, so real keys never have
# to be committed with the notebook; the original placeholders remain the
# fallback so behaviour is unchanged when the variables are absent.
ZEN_APIKEY = os.environ.get("ZEN_APIKEY", 'xxxxx')
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", 'xxxxxxx')
openai.api_key = OPENAI_KEY

# User agent string passed to Chrome via --user-agent.
my_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"

options = uc.ChromeOptions()
options.add_argument(f"--user-agent={my_user_agent}")
options.add_argument("--disable-notifications")
# Module-level browser shared by every AMSOIL method below.
# NOTE(review): recent Selenium releases dropped `desired_capabilities`;
# confirm this keyword is still honoured by the installed driver version.
driver = uc.Chrome(options=options, desired_capabilities={"page_load_strategy": "none"})

In [9]:
class AMSOIL:
    def __init__(self):
        self.client = ZenRowsClient(ZEN_APIKEY)
        self.api = "https://api-1.amsoil.com/api"
        self.website_endpoint = "https://www.amsoil.com/lookup/auto-and-light-truck"
        self.headers = {
            'authority': 'api-1.amsoil.com',
            'accept': '*/*',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
            'origin': 'https://www.amsoil.com',
            'referer': 'https://www.amsoil.com/',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.34 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.34',
        }

    def vehicles(self, year):
        params = {
            'lookupCode': 'autoandlighttruck',
            'year': year,
        }

        response = requests.get(os.path.join(self.api, 'Fitment/GetMakes'), params=params, headers=self.headers)

        if response.status_code == 200:
            _vehicles = response.json()
        else:
            _vehicles = None
        return _vehicles 
    
    def models(self, year, vehicle_id):
        params = {
            'lookupCode': 'autoandlighttruck',
            'year': year,
            'equipmentMakeId': vehicle_id,
        }
        response = requests.get(os.path.join(self.api, 'Fitment/GetModels'), params=params, headers=self.headers)

        if response.status_code == 200:
            _models = response.json()
        else:
            _models = None

        return _models
    
    def engines(self, year, vehicle_id, model_id):
        params = {
            'lookupCode': 'autoandlighttruck',
            'year': year,
            'equipmentMakeId': vehicle_id,
            'equipmentModelId': model_id,
        }

        response = requests.get(os.path.join(self.api, 'Fitment/GetEngines'), params=params, headers=self.headers)
        if response.status_code == 200:
            _engines = response.json()
        else:
            _engines = None
        
        return _engines
    
    def vehicle_details(self, equipment_id):
        params = {
            'equipmentUnitId': equipment_id,
        }

        response = requests.get(os.path.join(self.api, 'Fitment/GetVehicleDetails'), params=params, headers=self.headers)

        if response.status_code == 200:
            url = os.path.join(self.website_endpoint, response.json()['url'])
        else:
            url = None
        
        return url
    
    def __get_vis_cap(self, soup):
        data = {}
        div_element = soup.find(lambda tag: tag.name == 'div' and tag.get('id', '').startswith('note'))
        table = div_element.find('table')
        keys = ['viscosity', 'capacity']
        vis_checks = [box.find_element(By.CLASS_NAME, "plp_list_product_title").find_element(By.TAG_NAME, "a").text for box in driver.find_element(By.ID, "engine-oila1").find_elements(By.CLASS_NAME, "box")]
        for row in table.find_all('tr'):
            key = row.small.text.replace(":", "").strip()
            if key.lower() in keys:
                value = row.find("td").text.strip()

                # Use regex to extract values
                if key.lower() == 'viscosity':
                    matches = re.findall(r'\b(\d+\s?W-\d+)\b', value)
                    if matches:
                        for _match in list(set(matches)):
                            if len(vis_checks) > 0:
                                for vis_check in vis_checks:
                                    if _match in vis_check:
                                        data['viscosity'] = _match
                            else:
                                data['viscosity'] = _match
                elif key.lower() == 'capacity':
                    match = re.search(r'(\d+\.\d+)\s+quarts', value)
                    match_2 = re.search(r'(\d+)\s+quarts', value)
                    if match:
                        data['capacity'] = match.group(1)+" quarts"
                    elif match_2:
                        data['capacity'] = match_2.group(1)+" quarts"
                    else:
                        data['capacity'] = value
        if 'viscosity' not in data:
            data['viscosity'] = ''
        if 'capacity' not in data:
            data['capacity'] = ''
        return data
    
    def accurate(self, text):
        return text.replace("wear.1", "wear").replace("Mobil 1.3", "Mobile1").replace("changes2", "changes").replace("microns1", "microns")
    
    def _screenshot_image(self, driver, image_url, image_path):
        """Save a PNG screenshot of ``image_url`` by rendering it in a new tab.

        The tab is always closed and the driver switched back to the tab it
        started on, even when taking or writing the screenshot fails.

        Raises:
            RuntimeError: If no new tab appeared after ``window.open``.
        """
        origin = driver.current_window_handle
        known_handles = set(driver.window_handles)
        # Pass the URL as a script argument rather than splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script("window.open(arguments[0], '_blank');", image_url)
        opened = [handle for handle in driver.window_handles if handle not in known_handles]
        if not opened:
            raise RuntimeError(f"Could not open a new tab for image '{image_url}'")

        driver.switch_to.window(opened[-1])
        try:
            # Take the screenshot before opening the file so a failure does
            # not leave an empty image behind.
            png = driver.find_element(By.TAG_NAME, "img").screenshot_as_png
            with open(image_path, "wb") as file:
                file.write(png)
        finally:
            driver.close()
            driver.switch_to.window(origin)

    def _collect_products(self, driver, url, section_id, limit=None):
        """Scrape the product boxes of one section of the loaded vehicle page.

        Args:
            driver: Selenium driver with the vehicle page loaded.
            url: Vehicle page URL; used to derive the image file names.
            section_id: DOM id of the section holding the ``box`` elements.
            limit: Maximum number of boxes to process; ``None`` means all.

        Returns:
            A list of dicts with the keys ``title``, ``link``, ``image`` (path
            relative to the ``articles`` folder) and ``description`` (a list
            holding one randomly chosen summary bullet, or an empty list when
            the product has no bullets).
        """
        boxes = driver.find_element(By.ID, section_id).find_elements(By.CLASS_NAME, "box")
        if limit is not None:
            boxes = boxes[:limit]
        if not boxes:
            return []

        # These do not change between products, so compute them once.
        image_folder = os.path.join("articles", "images")
        os.makedirs(image_folder, exist_ok=True)
        name = url[len("https://www.amsoil.com/lookup/auto-and-light-truck")+1:-1].replace("/", "-")
        name = self.rewrite_name(name=name)

        products = []
        for box in boxes:
            anchor = box.find_element(By.CLASS_NAME, "plp_list_product_title").find_element(By.TAG_NAME, "a")
            title = anchor.text
            link = anchor.get_attribute("href")
            image_url = box.find_element(By.TAG_NAME, "img").get_attribute("src")

            bullets = [
                self.accurate(li.text.strip())
                for li in box.find_element(By.CLASS_NAME, "lguideSummary").find_elements(By.TAG_NAME, "li")
            ]
            # random.choice raises IndexError on an empty list.
            description = [random.choice(bullets)] if bullets else []

            # The last UUID segment keeps image names unique per product.
            image_name = name + "-" + str(uuid.uuid4()).split("-")[-1] + ".png"
            self._screenshot_image(driver, image_url, os.path.join(image_folder, image_name))

            products.append(
                {
                    "title": title,
                    "link": link,
                    "image": f"images/{image_name}",
                    "description": description
                }
            )
        return products

    def __oils(self, driver, url):
        """Return the first two engine oils of the ``engine-oila1`` section."""
        return self._collect_products(driver, url, "engine-oila1", limit=2)

    def __oil_filters(self, driver, url):
        """Return every oil filter of the ``oil-filtera1`` section."""
        return self._collect_products(driver, url, "oil-filtera1")

    def __air_filters(self, driver, url):
        """Return every air filter of the ``air-filtera1`` section."""
        return self._collect_products(driver, url, "air-filtera1")
    
    def __load_page(self, url):
        # response = self.client.get(url=url)
        # if response.status_code == 200:
        #     soup = BeautifulSoup(response.content, "html.parser")
        #     return soup
        # return None
        driver.get(url)
        return BeautifulSoup(driver.page_source, "html.parser")
    
    def __title(self, soup):
        return soup.find("ol", attrs={"class": "breadcrumb"}).find_all("li")[-1].text.strip()
    
    def __extract_engine_code(self, title):
        pattern = r"Engine Code (.+)"
        match = re.search(pattern, title)

        if match:
            engine_code = match.group(1)
            return engine_code.upper()
        else:
            return None
    
    def rewrite_name(self, name):
        return name.split("-code")[0].replace("cyl", "cylinder").replace("l-", "-")+"-oil-type-and-capacity"

    def __make_html(self, url, entities):
        name = url[len("https://www.amsoil.com/lookup/auto-and-light-truck")+1:-1].replace("/", "-")
        name = self.rewrite_name(name=name)
        with app.app_context():
            template = render_template('article.html', **entities)
            with open(f'articles/{name}.html', 'w') as f:
                f.write(template)
                self.save_to_sitemap(
                    url=os.path.join("https://www.xxxxxxxxx.com", "articles", name)
                )

    def get_content(self, title, yvm, question4):
        response = openai.ChatCompletion.create(
            model='gpt-3.5-turbo',
            messages=[
                {'role': 'system', 'content': 'You are a AI Assistant Model that helps in writing article related to vehicles. Give paragraphs only nothing else.'},
                {'role': 'user', 'content': 
                f"""
                Question 1: what are the good things about a {yvm['year']} {yvm['vehicle']}?
                Question 2: {question4}

                Data : {title}

                Answer 1: ....
                Answer 2: ....

                Note: Give both Answers in strictly 80-100 words only in simple english. Always provide answers in the required format.
                """
                }
            ]
        )

        return response['choices'][0]['message']['content']

    def extract_liter_from_title(self, title):
        # Define a regex pattern to match the liter value
        pattern = r'(\d+\.\d+)L'

        # Use re.search to find the pattern in the title
        match = re.search(pattern, title)

        if match:
            # Extract the matched liter value
            liter_value = match.group(1)
            return liter_value+"L"
        else:
            return None
    
    def extract_info_from_url(self, url):
        # Define a regex pattern to match the desired parts of the URL
        pattern = r'https://www\.amsoil\.com/lookup/auto-and-light-truck/(\d+)/([\w-]+)/([\w-]+)/([\w-]+)/'

        # Use re.match to search for the pattern in the URL
        match = re.match(pattern, url)

        if match:
            # Extract the matched groups
            year = match.group(1)
            vehicle = match.group(2)
            model = match.group(3)

            # Create and return a dictionary with the extracted information
            info_dict = {
                "year": year,
                "vehicle": vehicle.title(),
                "model": model.upper()
            }
            return info_dict
        else:
            return None
    
    def extract_cylinders_from_title(self, title):
        # Define a regex pattern to match the number of cylinders
        pattern = r'(\d+)\s*-cyl'

        # Use re.search to find the pattern in the title
        match = re.search(pattern, title)

        if match:
            # Extract the matched number of cylinders
            cylinders = match.group(1)
            return cylinders
        else:
            return None
    
    def make_questions(self, yvm, title, engine_code, cylinders, liter, viscosity, capacity):
        questions = []
        questions.append({
            "question": f"What {random.choice(['type', 'kind'])} of oil does a {yvm['year']} {yvm['vehicle']} {yvm['model']} {liter} engine (Engine Code {engine_code}) {random.choice(['take', 'need', 'require'])} and what is the capacity?",
            "answer": f"The {yvm['year']} {yvm['vehicle']} {yvm['model']} has a {liter} {cylinders}-cylinder engine (Engine Code {engine_code}). The oil type the {yvm['year']} {yvm['vehicle']} {yvm['model']} {liter} engine {random.choice(['requires', 'needs'])} is {viscosity}."
        })
        questions.append({
            "question": f"How much oil does a {yvm['year']} {yvm['vehicle']} take with a {liter} Engine?",
            "answer": f"You will need {capacity} with filter."
        })
        questions.append({
            "question": f"What kind of filter does a {yvm['year']} {yvm['vehicle']} {liter} engine take?",
            "answer": f"Below, you will find the proper filter for your {yvm['year']} {yvm['vehicle']} {yvm['model']}."
        })
        questions.append({
            "question": "What engine is this information for?",
            "answer": f"It is for the {liter} engine (Engine Code {engine_code})."
        })
        return questions
    
    def rewrite_title(self, title):
        return title.split("-cyl")[0].strip()+"-cylinder Oil Type and Capacity"
    
    def format_date(self, date_obj):
        # Format the date as "Month Day, Year"
        formatted_date = date_obj.strftime("%B %d, %Y")
        return formatted_date
    
    def save_to_sitemap(self, url, file_path="sitemap.txt"):
        try:
            with open(file_path, "a", encoding="utf-8") as sitemap_file:
                sitemap_file.write(url + "\n")
            print(f"URL '{url}' saved to '{file_path}' successfully.")
        except Exception as e:
            print(f"Error saving URL to '{file_path}': {e}")

    def extract_answers(self, text):
        answer1_pattern = r"Answer 1:(.*?)Answer 2:"
        answer2_pattern = r"Answer 2:(.*?)$"

        answer1_match = re.search(answer1_pattern, text, re.DOTALL)
        answer2_match = re.search(answer2_pattern, text, re.DOTALL)

        if answer1_match and answer2_match:
            answer1_content = answer1_match.group(1).strip()
            answer2_content = answer2_match.group(1).strip()

            result = {
                "Answer1": answer1_content,
                "Answer2": answer2_content
            }
            return result
        return None

    def get_metadata(self, url):
        soup = self.__load_page(url=url)
        if soup != None:
            print("Soup is ready!")
            try:
                title = self.__title(soup=soup)
            except:
                soup = self.__load_page(url=url)
                title = self.__title(soup=soup)
            vis_cap = self.__get_vis_cap(soup=soup)
            engine_code = self.__extract_engine_code(title=title)
            oils = self.__oils(driver=driver, url=url)
            oil_filters = self.__oil_filters(driver=driver, url=url)
            air_filters = self.__air_filters(driver=driver, url=url)
            cylinders = self.extract_cylinders_from_title(title=title)
            liter = self.extract_liter_from_title(title=title)
            yvm = self.extract_info_from_url(url=url)
            try:
                capacity = driver.find_element(By.XPATH, '//img[contains(@alt, "Vehicle Capacity")]').find_element(By.XPATH, "..").text.split(":")[-1].strip().lower()
            except:
                capacity = vis_cap['capacity']
            print("Using ChatGPT")
            q1 = f"What else can i do to maintain my {yvm['year']} {yvm['vehicle']} {yvm['model']}?"
            q2 = f"What maintenance tips are there for my {yvm['year']} {yvm['vehicle']}?"
            q3 = 'What should I do if my vehicle is consuming oil?'
            question4 = random.choice([q1, q2, q3])
            while True:
                try:
                    content = self.get_content(title=title, yvm=yvm, question4=question4)
                    answers = self.extract_answers(content)
                    break
                except Exception as e:
                    print(e)
                    time.sleep(10)
                    print("Waiting for 10 seconds ......")
            questions = self.make_questions(
                    yvm=yvm,
                    title=title,
                    engine_code=engine_code,
                    cylinders=cylinders,
                    liter=liter,
                    viscosity=vis_cap['viscosity'],
                    capacity=capacity
            )
            questions.append(
                {
                    "question": question4,
                    "answer": answers['Answer2']
                }
            )
            question_one = questions[0]
            print("Making HTML")
            self.__make_html(url=url, entities={
                "date": self.format_date(date_obj=datetime.now() + timedelta(days=1)),
                "oils": oils,
                "oil_type": vis_cap['viscosity'],
                "htitle": self.rewrite_title(title=title),
                "filters": oil_filters + air_filters,
                "questions": questions[1:],
                "question_one": question_one,
                "content": answers['Answer1'],
                "vehicle": f"{yvm['year']} {yvm['vehicle']}",
                "veh_lit": f"{yvm['year']} {yvm['vehicle']} {liter} Engine",
                "veh_enj": f"{yvm['year']} {yvm['vehicle']} {cylinders} cylinders Engine"
            })
            return True
        return None

In [10]:
# Scraper instance used by the processing loop; constructing it creates the
# ZenRows client from ZEN_APIKEY.
amsoil = AMSOIL()

In [11]:
# Load the vehicle page URLs, one per line.
with open("2023.txt", "r") as file:
    # Blank lines are skipped (they would otherwise become the URL "/"), and
    # every URL is normalised to exactly one trailing slash, which the
    # name-building code and the URL regex in AMSOIL rely on.
    urls = [line.strip().rstrip("/") + "/" for line in file if line.strip()]

In [None]:
import time

# Index of the first URL to process.
# NOTE(review): presumably earlier URLs were handled in a previous run - confirm.
START_INDEX = 600
# Pause between two vehicles, in seconds.
PAUSE_SECONDS = 15

for url in urls[START_INDEX:]:
    start = time.time()
    amsoil.get_metadata(url=url)
    print(f"Time Taken: {time.time() - start}")
    time.sleep(PAUSE_SECONDS)
# driver.quit()