# **<span style="color:red"> -Scraping Data From Naukri.com Website </span>**

In [6]:
import os
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import google.generativeai as genai

# -------------------------------
# 1. Configuration and Setup
# -------------------------------

# Read the Gemini API key from the environment so the secret never lives in
# source control or in a shared notebook. Set it before running, e.g.
#   export GEMINI_API_KEY="..."   (Linux/macOS)
#   setx GEMINI_API_KEY "..."     (Windows)
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "").strip()

# Fail fast, before a browser is launched, if the key is missing
if not GEMINI_API_KEY:
    raise ValueError(
        "Gemini API key not found. Please set the GEMINI_API_KEY environment variable."
    )

# Configure logging (root logger, so records are tagged 'root')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Configure Gemini AI
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")



# Set up the Selenium WebDriver using Service and ChromeDriverManager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Module-level accumulators filled by scrape_job_listings(); the four lists
# are appended to in lockstep, one entry per job listing.
CompanyName = []
JobTitle = []
Experience = []
Skills = []





# -------------------------------
# 2. Web Scraping Function
# -------------------------------

def scrape_job_listings(page_number, job_type, url_pattern):
    """Scrape one search-results page into the module-level result lists.

    Args:
        page_number: Page index substituted into ``url_pattern``.
        job_type: Human-readable category name; used only in log messages.
        url_pattern: URL template containing a ``{page_number}`` placeholder.

    Returns:
        None. For every listing found, one entry is appended to each of
        ``CompanyName``, ``JobTitle``, ``Experience`` and ``Skills``; a field
        whose element is missing is recorded as 'N/A'.
    """
    def text_or_na(node):
        # A listing without the element gets the 'N/A' placeholder.
        return node.text.strip() if node else 'N/A'

    job_url = url_pattern.format(page_number=page_number)
    logger.info(f"Fetching URL: {job_url}")

    # Load the page in the shared browser and pause for a fixed five seconds
    # so the page has time to load before the source is read.
    driver.get(job_url)
    time.sleep(5)

    # Parse the page source as it stands after the pause.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    boxes = soup.find_all('div', class_="cust-job-tuple layout-wrapper lay-2 sjw__tuple")
    logger.info(f"Found {len(boxes)} {job_type} job listings on page {page_number}")

    for listing in boxes:
        CompanyName.append(text_or_na(listing.find('a', class_='comp-name')))
        JobTitle.append(text_or_na(listing.find('a', class_='title')))
        Experience.append(text_or_na(listing.find('span', class_='expwdth')))

        # Skills are the <li> items of the tag list, joined into one string.
        tag_list = listing.find('ul', class_='tags-gt')
        if not tag_list:
            Skills.append('N/A')
            continue
        Skills.append(', '.join(item.text.strip() for item in tag_list.find_all('li')))



# -------------------------------
# 3. Fetch Industry Information Using Gemini
# -------------------------------

def get_industry(company_name):
    """Ask the Gemini model which industry a company belongs to.

    Args:
        company_name: Company name as scraped. The 'N/A' placeholder, empty
            or whitespace-only strings, and non-string values are treated as
            "no company" and never sent to the API.

    Returns:
        The model's answer with surrounding whitespace and trailing periods
        removed, or 'N/A' when there is no usable company name, the model
        returns an empty answer, or the request fails.
    """
    name = company_name.strip() if isinstance(company_name, str) else ''
    if not name or name == 'N/A':
        return 'N/A'

    prompt = (
        f"What industry does the company '{name}' belong to? "
        "Answer with the industry name only, no extra details."
    )

    try:
        response = model.generate_content(prompt)
        # Answers sometimes arrive as "Financial services." and sometimes
        # without the period; normalise so equal industries compare equal.
        industry = response.text.strip().rstrip('.').strip()
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole enrichment.
        logger.error(f"Error fetching industry for {company_name}: {e}")
        return 'N/A'

    return industry or 'N/A'



def enrich_with_industry(df):
    """Add an 'Industry' column by looking up each distinct company once.

    Args:
        df: DataFrame with a 'CompanyName' column.

    Returns:
        The same DataFrame object, modified in place, with an 'Industry'
        column mapped from 'CompanyName'.
    """
    unique_companies = df['CompanyName'].unique()
    logger.info(f"Unique companies found: {len(unique_companies)}")

    company_industries = {}

    # .unique() already yields each company exactly once, so no
    # "already looked up" check is needed inside the loop.
    for company in unique_companies:
        logger.info(f"Fetching industry for company: {company}")
        company_industries[company] = get_industry(company)             # Function Call

        # get_industry returns immediately for the 'N/A' placeholder without
        # calling the API, so only throttle after a real lookup.
        if company != 'N/A':
            time.sleep(3)  # Adjust based on Gemini API rate limits

    df['Industry'] = df['CompanyName'].map(company_industries)

    return df



# -------------------------------
# 4. Main Execution
# -------------------------------

def main():
    """Scrape all configured job categories, enrich with industry, save to CSV.

    Steps: scrape pages 1-20 of each category into the module-level lists,
    close the browser, build a DataFrame, add the 'Industry' column via
    enrich_with_industry(), and write the result to 'Data.csv'.
    """
    job_types = {
        "Data Science": "https://www.naukri.com/data-scientist-data-science-jobs-{page_number}?k=data%20scientist%2C%20data%20science&nignbevent_src=jobsearchDeskGNB",
        "Software Engineering": "https://www.naukri.com/software-engineering-jobs-{page_number}?k=software%20engineering&nignbevent_src=jobsearchDeskGNB",
        "Software Testing": "https://www.naukri.com/software-testing-jobs-{page_number}?k=software%20testing&nignbevent_src=jobsearchDeskGNB",
        "Cloud Engineering": "https://www.naukri.com/cloud-engineering-jobs-{page_number}?k=cloud%20engineering&nignbevent_src=jobsearchDeskGNB"
    }

    # Define the range of pages you want to scrape manually
    pages_to_scrape = range(1, 21)

    # Step 1: Scrape job listings for each job type and page
    try:
        for job_type, url_pattern in job_types.items():
            logger.info(f"Starting to scrape {job_type} jobs.")
            for page in pages_to_scrape:
                scrape_job_listings(page, job_type, url_pattern)               # Function Call
    finally:
        # Always release the browser, even if a page load or parse fails,
        # so no Chrome/chromedriver process is left running.
        driver.quit()
    logger.info("Completed web scraping and closed the browser.")

    # Step 2: Create a DataFrame from the scraped data
    job_data = {
        'CompanyName': CompanyName,
        'JobRole': JobTitle,
        'Experience': Experience,
        'Skills': Skills
    }

    df = pd.DataFrame(job_data)
    logger.info("Created initial DataFrame.")
    print("Initial DataFrame:")
    print(df.head())  # Display the top rows of the DataFrame for reference

    # Step 3: Enrich DataFrame with Industry information
    df = enrich_with_industry(df)                                       # Function Call
    logger.info("Added Industry information to DataFrame.")

    print("\nUpdated DataFrame with Industry:")
    print(df.head())  # Display the top rows of the enriched DataFrame

    # Step 4: Save the enriched DataFrame to a CSV file
    output_file = 'Data.csv'
    df.to_csv(output_file, index=False)
    logger.info(f"DataFrame saved to '{output_file}'.")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:WebDriver version 129.0.6668.100 selected
INFO:WDM:Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/129.0.6668.100/win32/chromedriver-win32.zip
INFO:WDM:About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/129.0.6668.100/win32/chromedriver-win32.zip
INFO:WDM:Driver downloading response is 200
INFO:WDM:Get LATEST chromedriver version for google-chrome
INFO:WDM:Driver has been saved in cache [C:\Users\janum\.wdm\drivers\chromedriver\win64\129.0.6668.100]
INFO:root:Starting to scrape Data Science jobs.
INFO:root:Fetching URL: https://www.naukri.com/data-scientist-data-science-jobs-1?k=data%20scientist%2C%20data%20science&nignbevent_src=jobsearchDeskGNB
INFO:root:Found 20 Data Science job li

Initial DataFrame:
           CompanyName                                            JobRole  \
0            Angel One  Data Scientist || Expertise in Machine learnin...   
1   Cadient Talent Llp                                     Data Scientist   
2                  IBM            Data Scientist: Artificial Intelligence   
3         HNM Sourcing                                     Data Scientist   
4  Aster DM Healthcare                                     Data Scientist   

  Experience                                             Skills  
0    2-4 Yrs  Data Science, Data Mining, Data Analytics, Mac...  
1    3-6 Yrs  Data analysis, data science, Machine learning,...  
2   5-10 Yrs  python, data analytics, tableau, data visualiz...  
3  10-20 Yrs  algorithms, python, data analytics, natural la...  
4    3-5 Yrs  Predictive Modeling, Machine Learning, Data Sc...  


INFO:root:Fetching industry for company: Cadient Talent Llp
INFO:root:Fetching industry for company: IBM
INFO:root:Fetching industry for company: HNM Sourcing
INFO:root:Fetching industry for company: Aster DM Healthcare
INFO:root:Fetching industry for company: Seventh Contact Hiring Solutions
INFO:root:Fetching industry for company: Trimble
INFO:root:Fetching industry for company: Trimble Applanix
INFO:root:Fetching industry for company: Response Informatics
INFO:root:Fetching industry for company: Bigtapp
INFO:root:Fetching industry for company: IIFL Finance
INFO:root:Fetching industry for company: Arcelormittal
INFO:root:Fetching industry for company: Kevin Process Technologies
INFO:root:Fetching industry for company: Dimiour
INFO:root:Fetching industry for company: Praudyo Solutions Private Limited.
INFO:root:Fetching industry for company: Biz2x
INFO:root:Fetching industry for company: Indusind Bank
INFO:root:Fetching industry for company: Sociante
ERROR:root:Error fetching industry


Updated DataFrame with Industry:
           CompanyName                                            JobRole  \
0            Angel One  Data Scientist || Expertise in Machine learnin...   
1   Cadient Talent Llp                                     Data Scientist   
2                  IBM            Data Scientist: Artificial Intelligence   
3         HNM Sourcing                                     Data Scientist   
4  Aster DM Healthcare                                     Data Scientist   

  Experience                                             Skills  \
0    2-4 Yrs  Data Science, Data Mining, Data Analytics, Mac...   
1    3-6 Yrs  Data analysis, data science, Machine learning,...   
2   5-10 Yrs  python, data analytics, tableau, data visualiz...   
3  10-20 Yrs  algorithms, python, data analytics, natural la...   
4    3-5 Yrs  Predictive Modeling, Machine Learning, Data Sc...   

              Industry  
0  Financial services.  
1      Human Resources  
2           Technology  
