In [42]:
# Python packages imported anywhere in this notebook (isodate and nltk are
# imported by later cells). %pip installs into the running kernel's environment.
%pip install requests beautifulsoup4 selenium pandas spacy groq youtube_transcript_api isodate nltk
!python -m spacy download en_core_web_sm

# Headless Chromium and its driver, used by Selenium's webdriver.Chrome.
!apt-get update
!apt-get install -y chromium-chromedriver



# Scraper Module

In [111]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy's small English pipeline once at module level;
# TextProcessor.clean_text calls it to tokenize and POS-tag job descriptions.
nlp = spacy.load("en_core_web_sm")

class Scraper:
    """Fetch the description of the first job that shine.com lists for a query.

    A headless Chrome session is started in ``__init__`` and shut down at the
    end of :meth:`shine_job_search`, so each instance supports a single search.
    """

    def __init__(self, query):
        """Store the query, build the search URL and start the browser.

        Args:
            query (str): Free-text job title, e.g. ``"UI UX designer"``.
        """
        self.query = query
        self.search_url = self._build_search_url(query)
        self.driver = self.setup_selenium()

    @staticmethod
    def _build_search_url(query):
        """Return the shine.com search URL for ``query``."""
        # Imported locally so this helper does not rely on the notebook's import cell.
        from urllib.parse import quote

        # str.split() drops leading/trailing whitespace and collapses runs of
        # spaces, so " designer" becomes "designer" instead of "-designer".
        # quote() then escapes characters such as "#" or "&" that would
        # otherwise be interpreted as URL syntax.
        slug = quote("-".join(query.split()), safe="")
        return f"https://www.shine.com/job-search/{slug}-jobs?q={slug}"

    def setup_selenium(self):
        """Create and return a headless Chrome WebDriver."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Configure WebDriver
        driver = webdriver.Chrome(options=chrome_options)
        return driver

    def shine_job_search(self):
        """Open the search page, click the first job card and return its description.

        Returns:
            str | None: The description text, ``"No description available"``
            when the detail pane contains no description element, or ``None``
            when the search or the detail lookup fails (the error is printed).
        """
        try:
            # Navigation happens inside the try block so that a failed page
            # load still reaches ``finally`` and the browser is not leaked.
            self.driver.get(self.search_url)
            time.sleep(2)

            # Wait until job cards are loaded
            job_cards = WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "jobCard_jobCard__jjUmu"))
            )
            if not job_cards:
                print("No job cards found.")
                return None

            return self._read_description(job_cards[0])

        except Exception as e:
            print(f"Error during search: {str(e)}")
            return None

        finally:
            # The driver is closed after every search, successful or not.
            self.driver.quit()

    def _read_description(self, job_card):
        """Click ``job_card`` and return the text of the job-detail pane, or ``None`` on error."""
        detail_class = "jobDetail_jsrpRightDetail_text__jqs8a"
        try:
            # Scroll job card into view and wait for it to be clickable
            self.driver.execute_script("arguments[0].scrollIntoView(true);", job_card)
            WebDriverWait(self.driver, 5).until(EC.element_to_be_clickable(job_card))

            # Click through JavaScript to view the details
            self.driver.execute_script("arguments[0].click();", job_card)

            # Wait for job description to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, detail_class))
            )

            # Parse the updated page source
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            description_element = soup.find("div", class_=detail_class)
            if description_element is None:
                return "No description available"

            # NOTE(review): get_text() without a separator joins adjacent text
            # nodes directly; consider get_text(" ", strip=True) if words from
            # neighbouring tags turn out to be glued together.
            return description_element.get_text(strip=True)

        except Exception as e:
            print(f"Error while fetching job description: {str(e)}")
            return None

class TextProcessor:
    """Keyword extraction for scraped job descriptions."""

    @staticmethod
    def clean_text(description):
        """Return the content words of ``description`` as a list of lowercase strings.

        The text is lowercased, run through the module-level spaCy pipeline,
        and only tokens tagged NOUN, ADJ or VERB that are not in STOP_WORDS
        are kept, in their original order.
        """
        kept_tags = ('NOUN', 'ADJ', 'VERB')
        parsed = nlp(description.lower())
        return [
            tok.text
            for tok in parsed
            if tok.pos_ in kept_tags and tok.text not in STOP_WORDS
        ]

class JobSearchModule:
    """Ties together scraping a job description and reducing it to keywords."""

    def __init__(self, query):
        """Remember the job search query."""
        self.query = query

    def get_job_description(self):
        """Scrape the first matching job's description (``None`` if scraping fails)."""
        return Scraper(self.query).shine_job_search()

    def process_job_description(self, description):
        """Return the description's keywords as one comma-separated string.

        A falsy description (``None`` or ``""``) yields the placeholder
        ``"No description available"``.
        """
        if not description:
            return "No description available"

        keywords = TextProcessor().clean_text(description)
        return ", ".join(keywords)

# Example usage:
if __name__ == "__main__":
    # Plain query with no stray whitespace; it is embedded in the search URL.
    job_query = "designer"
    job_module = JobSearchModule(job_query)

    # Scrape the first matching job description (None when scraping fails)
    description = job_module.get_job_description()

    # Reduce the description to a comma-separated list of content words
    processed_text = job_module.process_job_description(description)

    print(processed_text)


remote, position, freelance, ux, designer, procreator, forefront, design, innovation, role, involve, crafting, user, centered, designs, compelling, seamless, work, team, short, term, projects, contributing, creation, impactful, digital, products, key, responsibilities, design, excellence, develop, wireframes, prototypes, high, fidelity, visual, designs, pleasing, user, friendly, project, collaboration, work, house, design, team, bring, projects, life, ensuring, design, solutions, align, clients, vision, goals, innovation, stay, updated, latest, design, trends, tools, technologies, bring, fresh, ideas, project, quick, turnaround, deliver, high, quality, design, assets, agreed, timelines, maintaining, flexibility, adapt, project, changes, requirements, experience, minimum, years, experience, design, portfolio, demonstrates, ability, create, engaging, digital, experiences, proficiency, strong, skills, figma, design, tools, familiarity, research, methodologies, design, systems, plus, commu

# Llama Template Module


In [120]:
import os
from groq import Groq
import json
class LlamaChatTemplates:
    """Small wrapper around the Groq chat API that manages named system prompts.

    Templates are registered with :meth:`add_template` and used as the system
    message of a chat completion in :meth:`create_chat_completion`.
    """

    # Model used when create_chat_completion is called without ``model``.
    DEFAULT_MODEL = "llama-3.1-70b-versatile"

    def __init__(self, api_key_file="/content/groq_api.txt"):
        """Create the Groq client.

        Args:
            api_key_file (str): Path of a text file holding the API key. If the
                file is missing or empty, the ``GROQ_API_KEY`` environment
                variable is used instead.

        Raises:
            ValueError: If no API key is found in either place.
        """
        # Load the API key from a file if provided, otherwise use environment variable
        self.api_key = self.load_api_key(api_key_file) or os.getenv("GROQ_API_KEY")

        if not self.api_key:
            raise ValueError("API key must be provided either in the environment or in the API key file.")

        # Initialize the client
        self.client = Groq(api_key=self.api_key)

        # Maps template name -> system prompt text
        self.templates = {}

    def load_api_key(self, api_key_file):
        """Return the stripped contents of ``api_key_file``, or ``None`` if it does not exist."""
        if api_key_file and os.path.exists(api_key_file):
            with open(api_key_file, 'r') as file:
                return file.read().strip()
        return None

    def add_template(self, template_name, content):
        """Add a template to the template dictionary (replacing any with the same name)."""
        self.templates[template_name] = content

    def get_template(self, template_name):
        """Retrieve a template by name, or ``None`` if it is not registered."""
        return self.templates.get(template_name, None)

    def create_chat_completion(self, template_name, user_content, model=None):
        """Use a specified template to create a chat completion.

        Args:
            template_name (str): Name of a template added via :meth:`add_template`.
            user_content (str): Text sent as the user message.
            model (str, optional): Groq model identifier. Defaults to
                ``DEFAULT_MODEL``.

        Returns:
            The ``content`` of the first choice's message.

        Raises:
            ValueError: If the template is not registered (or is empty).
        """
        template = self.get_template(template_name)
        if not template:
            raise ValueError(f"Template '{template_name}' not found.")

        chat_completion = self.client.chat.completions.create(
            messages=[
                {"role": "system", "content": template},
                {"role": "user", "content": user_content},
            ],
            model=model or self.DEFAULT_MODEL,
        )

        return chat_completion.choices[0].message.content

    def parse_output(self, raw_output):
        """Parse model output into a Python object.

        Strict JSON is tried first. Because the prompts ask for a "dictionary",
        the model may answer with a Python-style literal (single quotes) or
        wrap the object in extra text or code fences; those cases are handled
        by ``ast.literal_eval`` and by retrying on the outermost ``{...}`` span.

        Args:
            raw_output: The text returned by the model.

        Returns:
            The parsed object, or ``{"error": "Unable to parse output"}`` when
            the input is not text or cannot be parsed.
        """
        # Imported locally so the method does not rely on the notebook's import cell.
        import ast

        failure = {"error": "Unable to parse output"}

        if isinstance(raw_output, (bytes, bytearray)):
            raw_output = raw_output.decode("utf-8", errors="replace")
        if not isinstance(raw_output, str):
            # e.g. a message whose content is None
            return failure

        text = raw_output.strip()
        candidates = [text]

        # Second attempt: only the outermost {...} span, which discards code
        # fences or stray words around the object.
        first_brace, last_brace = text.find("{"), text.rfind("}")
        if 0 <= first_brace < last_brace:
            inner = text[first_brace:last_brace + 1]
            if inner != text:
                candidates.append(inner)

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                pass
            try:
                # literal_eval only evaluates literals, never arbitrary code.
                literal = ast.literal_eval(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(literal, (dict, list)):
                return literal

        return failure

# Example Usage:
if __name__ == "__main__":

    # Build the client; raises ValueError when neither /content/groq_api.txt
    # nor the GROQ_API_KEY environment variable provides a key.
    llama_chat = LlamaChatTemplates()

    # System prompt asking the model to sort keywords into four skill categories
    # and to reply with nothing but the dictionary.
    template = """
    You are a highly skilled professional in identifying and categorizing relevant skills from a given list.
    Given the list of words, identify all relevant skills and categorize them under the appropriate skill type.
    The skill types include: 'Technical Skills', 'Design Skills', 'Soft Skills', 'Interpersonal Skills'.
    Return the skills in a clean **dictionary format** where each key is a skill type (e.g., 'Technical Skills')
    and the value is a list of corresponding skills. Only return the dictionary object with no additional text,
    explanations, or descriptions. no backtick, no word before '{' this bracket and after '}' this bracket
    """

    llama_chat.add_template("skill_categorization", template)

    # Scrape a job description with the JobSearchModule defined in the Scraper cell.
    job_query = "UI UX designer"
    job_module = JobSearchModule(job_query)
    description = job_module.get_job_description()

    # When scraping fails this is the placeholder "No description available",
    # which is still sent to the model below.
    processed_text = job_module.process_job_description(description)

    output = llama_chat.create_chat_completion("skill_categorization", processed_text)
    # parse_output returns an {"error": ...} dict when the reply cannot be parsed.
    # `parsed_output` is read again by the quiz cell that follows.
    parsed_output = llama_chat.parse_output(output)

    # Print the output
    print(parsed_output)


{'Technical Skills': ['figma', 'design tools', 'wireframes', 'prototypes', 'ux', 'ui', 'design systems', 'research methodologies'], 'Design Skills': ['crafting', 'user centered designs', 'compelling designs', 'seamless designs', 'visual designs', 'user friendly designs', 'digital products', 'design excellence'], 'Soft Skills': ['adaptability', 'creativity', 'innovation', 'passion', 'keen eye for detail', 'flexibility', 'ability to manage multiple projects'], 'Interpersonal Skills': ['excellent communication skills', 'articulate design concepts', 'collaboration', 'teamwork', 'ability to work with clients']}


In [118]:
# Fresh client with an empty template registry.
llama_chat = LlamaChatTemplates()

# Depends on `parsed_output` from the skill-categorization cell; raises KeyError
# if that dict has no 'Technical Skills' key (e.g. when parsing failed there).
techstack = ", ".join(parsed_output['Technical Skills'])

# System prompt: one multiple-choice quiz question about the tech stack, as JSON.
template_techstack_test = """
    You are a highly skilled professional in generating quiz questions based on a given text.
    From the provided text, create a quiz question related to the mentioned tech stack and provide a list of options for the question.
    The correct answer should be identified, and the difficulty level of the question should be categorized as 'Easy', 'Medium', or 'Hard'.
    Return the results in a clean **JSON format** where the keys are:
    - 'question': The question related to the tech stack in the text.
    - 'option': A list of possible options for the question.
    - 'answer': The correct answer from the options.
    - 'difficulty': The level of difficulty for the question.
    Do not include any additional text or explanations, just return the JSON object. no backtick, no word before '{' this bracket and after '}' this bracket
"""
llama_chat.add_template("skill_test", template_techstack_test)

# `output` is the raw model reply; `parsoutput` is the parsed dict (or an error dict).
output = llama_chat.create_chat_completion("skill_test", techstack)
parsoutput = llama_chat.parse_output(output)

{
    "question": "What tool is commonly used for creating wireframes, prototypes, and high fidelity visual designs?",
    "option": [
        "Adobe XD",
        "Figma",
        "Sketch",
        "InVision"
    ],
    "answer": "Figma",
    "difficulty": "Easy"
}
{'question': 'What tool is commonly used for creating wireframes, prototypes, and high fidelity visual designs?', 'option': ['Adobe XD', 'Figma', 'Sketch', 'InVision'], 'answer': 'Figma', 'difficulty': 'Easy'}


# Video Transcript Module

In [138]:
import requests
import isodate
from youtube_transcript_api import YouTubeTranscriptApi

def get_best_video(query, api_key=None, min_duration_seconds=900, language="en"):
    """
    Fetches the most-viewed long-form video for the given search query.

    The first 10 search results are looked up with a single ``videos`` request.
    Among the videos that are longer than ``min_duration_seconds`` and whose
    declared default language matches ``language``, the one with the highest
    view count is returned.

    Args:
        query (str): The search query.
        api_key (str, optional): YouTube Data API key. Defaults to the
            ``YOUTUBE_API_KEY`` environment variable.
        min_duration_seconds (float): Videos must be strictly longer than this.
        language (str): Primary language subtag to accept; regional variants
            such as ``"en-US"`` match ``"en"``.

    Returns:
        dict | None: Details of the best video (``video_id``, ``title``,
        ``link``, ``views``, ``duration_seconds``, ``language``), or ``None``
        when nothing qualifies or a request fails (the error is printed).

    Raises:
        ValueError: If no API key is available.
    """
    # Imported locally so the function does not rely on another cell's imports.
    import os

    # SECURITY: the key is never embedded in the notebook. A key that was
    # previously committed in source should be revoked and replaced.
    api_key = api_key or os.environ.get("YOUTUBE_API_KEY")
    if not api_key:
        raise ValueError(
            "YouTube API key must be provided via the api_key argument "
            "or the YOUTUBE_API_KEY environment variable."
        )

    api_root = "https://www.googleapis.com/youtube/v3"
    wanted_language = language.lower()
    best_video = None
    best_score = -1

    try:
        # `params` lets requests URL-encode the query (spaces, '#', '&', ...).
        search_response = requests.get(
            f"{api_root}/search",
            params={
                "key": api_key,
                "q": query,
                "part": "snippet",
                "type": "video",
                "maxResults": 10,
            },
            timeout=10,
        )
        # Surface HTTP errors (bad key, quota exceeded) instead of silently
        # treating the error payload as "no results".
        search_response.raise_for_status()

        video_ids = [
            item["id"]["videoId"]
            for item in search_response.json().get("items", [])
            if item.get("id", {}).get("videoId")
        ]
        if not video_ids:
            return None

        # videos.list accepts a comma-separated id list, so one request
        # replaces one request per search result.
        details_response = requests.get(
            f"{api_root}/videos",
            params={
                "key": api_key,
                "id": ",".join(video_ids),
                "part": "contentDetails,snippet,statistics",
            },
            timeout=10,
        )
        details_response.raise_for_status()

        for video_info in details_response.json().get("items", []):
            video_id = video_info["id"]
            duration_iso = video_info["contentDetails"]["duration"]
            duration_seconds = isodate.parse_duration(duration_iso).total_seconds()
            video_language = video_info["snippet"].get("defaultLanguage", "") or ""
            views = int(video_info.get("statistics", {}).get("viewCount", 0))

            # Compare only the primary subtag so "en-US" / "en-GB" count as "en".
            language_matches = video_language.lower().split("-")[0] == wanted_language

            if duration_seconds > min_duration_seconds and language_matches:
                score = views
                if score > best_score:
                    best_score = score
                    best_video = {
                        "video_id": video_id,
                        "title": video_info["snippet"]["title"],
                        "link": f"https://www.youtube.com/watch?v={video_id}",
                        "views": views,
                        "duration_seconds": duration_seconds,
                        "language": video_language,
                    }

    except requests.RequestException as e:
        print("Error fetching data:", e)

    return best_video

def get_transcript(video_id, max_seconds=1800):
    """
    Fetches the opening portion of a YouTube video's transcript.

    Args:
        video_id (str): The ID of the YouTube video.
        max_seconds (float): Only caption entries that start at or before this
            offset (in seconds) are kept. Defaults to 1800, i.e. the first
            30 minutes.

    Returns:
        str: The kept caption texts joined by single spaces. On any failure the
        string ``"Error fetching transcript: <reason>"`` is returned instead of
        raising, so callers must check for that prefix.
    """
    try:
        # NOTE(review): newer youtube_transcript_api releases appear to replace
        # this static call with an instance API - confirm the installed version.
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        return " ".join(
            item['text'] for item in transcript if item['start'] <= max_seconds
        )

    except Exception as e:
        return f"Error fetching transcript: {e}"

if __name__ == "__main__":
    search_query = input("Enter the course name or topic: ")
    best_video = get_best_video(search_query)

    if best_video:
        print(f"\nBest Video Found:\nTitle: {best_video['title']}\nLink: {best_video['link']}\nViews: {best_video['views']}\nDuration: {best_video['duration_seconds']} seconds\nLanguage: {best_video['language']}")

        # get_transcript signals failure by returning a message with this exact
        # prefix. Matching the prefix (rather than looking for "Error" anywhere)
        # keeps a transcript that merely contains the word "Error" from being
        # treated as a failure.
        transcript = get_transcript(best_video["video_id"])
        if not transcript.startswith("Error fetching transcript:"):
            print("\nTranscript:")
            print(transcript)
        else:
            print(transcript)
    else:
        print("No suitable video found.")


Enter the course name or topic: html

Best Video Found:
Title: HTML & CSS Full Course - Beginner to Pro
Link: https://www.youtube.com/watch?v=G3e-cpL7ofc
Views: 13259332
Duration: 23484.0 seconds
Language: en

Transcript:
Welcome to the complete HTML
and CSS course. In this course, we're going to learn how to build
websites from a beginner to a professional level, and by the end of this course,
we're going to build youtube.com. Now you don't need any previous
coding or technical experience. This course is designed to be your first
step to becoming a software engineer. We're going to start from the very basics
of HTML and CSS and build our way up step by step. And along the way, we're going to learn all the major skills
that we need to create websites at a professional level. You can find the different sections of
this course below the video here and here. And after each section, I'm going to give you a bunch of
exercises that you can do on your own to practice the skills that
we learne

# Text preprocessing module

In [130]:
import nltk
# Fetch the NLTK corpora and tokenizer data this notebook downloads, in order.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [131]:
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string


# Shared NLTK helpers used by TextPreprocessor.clean_text:
# the English stop-word set and a WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# spaCy English pipeline used for POS tagging (the 'en_core_web_sm' model must be installed).
nlp = spacy.load('en_core_web_sm')

class TextPreprocessor:
    """Reduce raw text (e.g. a video transcript) to keywords for an LLM prompt.

    Relies on the module-level ``stop_words``, ``lemmatizer`` and ``nlp`` objects.
    """

    def __init__(self):
        # Characters that clean_text treats as punctuation.
        self.punctuation = string.punctuation

    def clean_text(self, text):
        """
        Preprocess the text to reduce tokens and retain important keywords.

        Tokens are lowercased, stop words and punctuation tokens are dropped,
        and the remaining tokens are lemmatized.

        Args:
            text (str): The input text to be processed.

        Returns:
            str: The cleaned and reduced text, tokens joined by single spaces.
        """
        punctuation_chars = set(self.punctuation)
        cleaned_tokens = []

        for raw_token in word_tokenize(text):
            token = raw_token.lower()
            if token in stop_words:
                continue
            # Drop tokens made up entirely of punctuation characters. A plain
            # `token in self.punctuation` is a substring test against the
            # punctuation string, so multi-character tokens such as "``", "''"
            # or "..." would slip through.
            if all(char in punctuation_chars for char in token):
                continue
            cleaned_tokens.append(lemmatizer.lemmatize(token))

        return ' '.join(cleaned_tokens)

    def extract_key_phrases(self, text):
        """
        Extract key words from the text based on nouns, verbs, and adjectives.

        Args:
            text (str): The input text to extract key words from.

        Returns:
            list: Lemmas of every token tagged NOUN, VERB or ADJ, in document
            order (duplicates are kept).
        """
        return [
            token.lemma_
            for token in nlp(text)
            if token.pos_ in ('NOUN', 'VERB', 'ADJ')
        ]

    def preprocess_for_llm(self, text):
        """
        Preprocess text and extract important words to give a hint about what the user learned.

        Args:
            text (str): The raw input text.

        Returns:
            dict: ``'cleaned_text'`` (str from clean_text) and ``'key_phrases'``
            (list from extract_key_phrases). Both are computed from the raw text.
        """
        return {
            'cleaned_text': self.clean_text(text),
            'key_phrases': self.extract_key_phrases(text),
        }

# Usage Example
if __name__ == "__main__":
    input_text = """
    Welcome to the complete HTML and CSS course. In this course, we're going to learn how to build websites
    from a beginner to a professional level, and by the end of this course, we're going to build youtube.com.
    Now you don't need any previous coding or technical experience. This course is designed to be your first
    step to becoming a software engineer.
    """

    # Initialize the preprocessor
    preprocessor = TextPreprocessor()

    # Preprocess the text; `result` is reused by the quiz-generation cell below
    result = preprocessor.preprocess_for_llm(input_text)

    # Show both parts of the result, not only the key phrases
    print("Cleaned Text: ", result['cleaned_text'])
    print("Key Phrases: ", result['key_phrases'])


Cleaned Text:  welcome complete html cs course course 're going learn build website beginner professional level end course 're going build youtube.com n't need previous coding technical experience course designed first step becoming software engineer
Key Phrases:  ['welcome', 'complete', 'html', 'css', 'course', 'course', 'go', 'learn', 'build', 'website', 'beginner', 'professional', 'level', 'end', 'course', 'go', 'build', 'need', 'previous', 'coding', 'technical', 'experience', 'course', 'design', 'first', 'step', 'become', 'software', 'engineer']


In [141]:
# Fresh client with an empty template registry.
llama_chat = LlamaChatTemplates()

# Depends on `result` from the text-preprocessing cell above.
learning = ", ".join(result['key_phrases'])

# System prompt: one multiple-choice quiz question about the transcript, as JSON.
template_tech_test = """
    You are a highly skilled professional in generating quiz questions based on the content provided in a video transcript.
    From the transcript, create a quiz question related to the information presented.
    Provide a list of possible options for the question. The correct answer should be identified, and the difficulty level of the question should be categorized as 'Easy', 'Medium', or 'Hard'.
    Return the results in a clean **JSON format** where the keys are:
    - 'question': The question based on the content of the transcript.
    - 'option': A list of possible options for the question.
    - 'answer': The correct answer from the options.
    - 'difficulty': The level of difficulty for the question.
    Do not include any additional text or explanations, just return the JSON object. no backtick, no word before '{' this bracket and after '}' this bracket
"""
llama_chat.add_template("skill_test", template_tech_test)

# `output` is the raw model reply; `parsoutput` is the parsed dict (or an error dict).
output = llama_chat.create_chat_completion("skill_test", learning)
parsoutput = llama_chat.parse_output(output)

In [142]:
parsoutput

{'question': 'What is the primary goal of the complete HTML and CSS course for a beginner?',
 'option': ['To become a software engineer',
  'To learn coding',
  'To build a website',
  'To gain technical experience'],
 'answer': 'To build a website',
 'difficulty': 'Easy'}