In [3]:
import pandas as pd
from bs4 import BeautifulSoup
import os

In [14]:
def read_html_from_file(path):
    """Return the entire text content of the file at ``path``.

    The file is decoded as UTF-8. Bytes that cannot be decoded are replaced
    with the Unicode replacement character instead of raising an error.
    """
    with open(path, mode='r', encoding='utf-8', errors='replace') as handle:
        return handle.read()

In [15]:
def _first_job_title(job, default='Not Provided'):
    """Return the first title found inside one job card, or ``default`` if none is found."""
    title_elements = job.find_all('h2', class_=['jobTitle css-198pbd eu4oa1w0', 'jobTitle jobTitle-newJob css-198pbd eu4oa1w0'])
    for h2 in title_elements:
        job_name_span = h2.find('span', id=lambda x: x and x.startswith('jobTitle-'))
        if job_name_span is not None:
            # Prefer the `title` attribute; fall back to the visible text.
            return job_name_span.get('title', job_name_span.text)
    return default


def parse_job_details(soup):
    """Extract the fields of every job card found in ``soup``.

    A job card is any ``div`` with class ``job_seen_beacon``. Exactly one value
    per field is recorded for each card, so all returned lists have the same
    length and index ``i`` in every list refers to the same card.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``find_all``; each returned card must expose
        ``find`` and ``find_all`` (e.g. a ``BeautifulSoup`` instance).

    Returns
    -------
    tuple of list
        ``(job_titles, companies, locations, salaries, job_types,
        descriptions, dates, links)``. A missing title, company, location,
        salary, job type, date or link is recorded as ``'Not Provided'``;
        a missing description is recorded as ``''``.
    """
    missing = 'Not Provided'
    jobs = soup.find_all('div', class_='job_seen_beacon')
    job_titles, companies, locations, salaries, job_types, descriptions, dates, links = [], [], [], [], [], [], [], []

    # Loop through each job listing
    for job in jobs:
        # Job title: one entry per card, even when several <h2> elements match.
        job_titles.append(_first_job_title(job, missing))

        # Company and location details
        company_div = job.find('span', {'data-testid': 'company-name'})
        location_div = job.find('div', {'data-testid': 'text-location'})
        companies.append(company_div.text if company_div is not None else missing)
        locations.append(location_div.text if location_div is not None else missing)

        # Salary
        pay_div = job.find('div', class_='salary-snippet-container')
        salaries.append(pay_div.text.strip() if pay_div is not None else missing)

        # Job type: the metadata container may exist without the snippet inside it,
        # so the nested lookup has to be checked separately.
        type_div = job.find('div', class_='metadata css-5zy3wz eu4oa1w0')
        type_snippet = None
        if type_div is not None:
            type_snippet = type_div.find('div', {'data-testid': 'attribute_snippet_testid'})
        job_types.append(type_snippet.text.strip() if type_snippet is not None else missing)

        # Job description: text of the first <ul> inside the description container.
        descr = job.find('div', class_='css-1u8dvic eu4oa1w0')
        bullet_list = descr.find('ul') if descr is not None else None
        descriptions.append(bullet_list.text.strip() if bullet_list is not None else '')

        # Job posting date
        date_element = job.find('span', class_='css-qvloho eu4oa1w0')
        dates.append(date_element.text.strip() if date_element is not None else missing)

        # Job link: the <a> inside the title heading, made absolute.
        title_heading = job.find('h2', class_='jobTitle')
        a_tag = title_heading.find('a') if title_heading is not None else None
        if a_tag is not None:
            links.append("https://www.indeed.com" + a_tag.get('href', ''))
        else:
            links.append(missing)

    return job_titles, companies, locations, salaries, job_types, descriptions, dates, links

In [16]:
def save_to_csv(dataframe, file_number):
    """Write ``dataframe`` to ``Cleaned_csvs/AI-UnitedStates-Page(<file_number>).csv``.

    The path is relative to the current working directory. The ``Cleaned_csvs``
    directory is created if it does not exist yet. The index is not written.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to save.
    file_number : int or str
        Page identifier interpolated into the file name.
    """
    output_dir = "Cleaned_csvs"
    # to_csv does not create missing parent directories, so create it up front.
    os.makedirs(output_dir, exist_ok=True)
    output_text = f"{output_dir}/AI-UnitedStates-Page({file_number}).csv"
    dataframe.to_csv(output_text, index=False)

In [22]:
def process_files_in_folder(folder_path):
    """Convert every saved ``<prefix>_<number>.txt`` page in ``folder_path`` to a CSV.

    Each matching file is read, parsed with BeautifulSoup's ``html.parser`` and
    passed to ``parse_job_details``. The result is written to
    ``Cleaned_csvs/page_<number>.csv``, relative to the current working
    directory; that directory is created if needed. ``.txt`` files whose name
    has no underscore are skipped with a message, because no page number can be
    taken from them.

    Parameters
    ----------
    folder_path : str
        Directory that is scanned (non-recursively) for ``.txt`` files.
    """
    output_dir = 'Cleaned_csvs'
    # to_csv does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the file system.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.txt'):
            continue

        # The page number is the piece between the first underscore and the next dot.
        name_parts = file_name.split('_')
        if len(name_parts) < 2:
            print(f'Skipped {file_name}: expected a name like <prefix>_<number>.txt')
            continue
        file_number = name_parts[1].split('.')[0]

        file_path = os.path.join(folder_path, file_name)
        content = read_html_from_file(file_path)
        soup = BeautifulSoup(content, 'html.parser')
        # NOTE(review): posting dates are parsed but not written to the CSV;
        # confirm whether a date column is wanted before adding one.
        job_titles, companies, locations, salaries, job_types, descriptions, _dates, links = parse_job_details(soup)

        df = pd.DataFrame({
            'Job Title': job_titles,
            'Company': companies,
            'Location': locations,
            'Salary($)': salaries,
            'Job-Type': job_types,
            'Job-Description': descriptions,
            'Raw_Link': links})
        csv_file_name = f'{output_dir}/page_{file_number}.csv'
        df.to_csv(csv_file_name, index=False)
        print(f'Saved {csv_file_name}')

In [23]:
# Run the conversion over the .txt files in the current working directory.
folder_path = '.'
process_files_in_folder(folder_path)

Saved Cleaned_csvs/page_1.csv
Saved Cleaned_csvs/page_10.csv
Saved Cleaned_csvs/page_100.csv
Saved Cleaned_csvs/page_101.csv
Saved Cleaned_csvs/page_11.csv
Saved Cleaned_csvs/page_12.csv
Saved Cleaned_csvs/page_13.csv
Saved Cleaned_csvs/page_14.csv
Saved Cleaned_csvs/page_15.csv
Saved Cleaned_csvs/page_16.csv
Saved Cleaned_csvs/page_17.csv
Saved Cleaned_csvs/page_18.csv
Saved Cleaned_csvs/page_19.csv
Saved Cleaned_csvs/page_2.csv
Saved Cleaned_csvs/page_20.csv
Saved Cleaned_csvs/page_21.csv
Saved Cleaned_csvs/page_22.csv
Saved Cleaned_csvs/page_23.csv
Saved Cleaned_csvs/page_24.csv
Saved Cleaned_csvs/page_25.csv
Saved Cleaned_csvs/page_26.csv
Saved Cleaned_csvs/page_27.csv
Saved Cleaned_csvs/page_28.csv
Saved Cleaned_csvs/page_29.csv
Saved Cleaned_csvs/page_3.csv
Saved Cleaned_csvs/page_30.csv
Saved Cleaned_csvs/page_31.csv
Saved Cleaned_csvs/page_32.csv
Saved Cleaned_csvs/page_33.csv
Saved Cleaned_csvs/page_34.csv
Saved Cleaned_csvs/page_35.csv
Saved Cleaned_csvs/page_36.csv
Saved Cle