In [None]:
import pandas as pd
from datetime import datetime
import re
import os

class IndeedJobProcessor:
    """Turn manually entered Indeed UK job postings into flat, analysable records.

    The processor prompts for the fields of one posting, derives extra
    attributes from the description with keyword heuristics (remote option,
    salary, job level, job type, company size, benefits, gendered-language
    scores) and appends the record to an Excel workbook.

    All derived attributes are heuristic; records that look incomplete are
    marked with ``manual_review_flag``.
    """

    # One monetary amount: digits with optional thousands separators, optional
    # pence and an optional "k" suffix (e.g. "45000", "45,000.00", "45k").
    _AMOUNT = r'(\d+(?:,\d{3})*(?:\.\d{2})?(?:k\b)?)'
    # Range separator: hyphen, en dash, em dash or the word "to".
    _RANGE_SEP = r'\s*(?:[-\u2013\u2014]|to)\s*'
    # Yearly-pay marker; "pa" needs a trailing boundary so "10 patients" is ignored.
    _PER_YEAR = r'\s*(?:per year|per annum|annually|pa\b|p\.a\.)'
    # Tried in order: ranges before single amounts, "£" forms before bare numbers.
    _SALARY_PATTERNS = (
        r'£\s*' + _AMOUNT + _RANGE_SEP + r'£?\s*' + _AMOUNT,
        _AMOUNT + _RANGE_SEP + _AMOUNT + _PER_YEAR,
        r'£\s*' + _AMOUNT,
        _AMOUNT + _PER_YEAR,
    )

    # Whole-word seniority markers (optional plural "s").
    _SENIOR_PATTERN = r'\b(?:senior|lead|principal|head|director|manager|chief)s?\b'
    _JUNIOR_PATTERN = r'\b(?:junior|entry|graduate|internship|intern|trainee|assistant)s?\b'

    def __init__(self):
        # Gendered words lists for analysis
        self.masculine_words = [
            'leader', 'competitive', 'dominant', 'assertive', 'aggressive', 'ambitious',
            'analytical', 'confident', 'decisive', 'determined', 'independent', 'objective',
            'self-reliant', 'strong', 'superior', 'lead', 'manage', 'direct', 'control',
            'drive', 'challenge', 'compete', 'win', 'achieve', 'dominate', 'excel',
            'individual', 'autonomous', 'hierarchy', 'decision', 'responsibility'
        ]

        self.feminine_words = [
            'collaborative', 'cooperative', 'supportive', 'nurturing', 'empathetic',
            'interpersonal', 'communicate', 'understand', 'responsible', 'connect',
            'honest', 'loyal', 'dependable', 'committed', 'dedicated', 'support',
            'help', 'assist', 'care', 'share', 'together', 'team', 'community',
            'relationship', 'trust', 'warm', 'kind', 'inclusive', 'collaborate'
        ]

    def clean_text(self, text):
        """Clean and normalize text.

        Collapses all whitespace runs to single spaces and replaces every
        character other than word characters, whitespace and ``. , ! ? ; : - ( )``
        with a space. Note that this removes currency symbols such as ``£``.

        Returns an empty string for empty or ``None`` input.
        """
        if not text:
            return ""

        text = ' '.join(text.split())
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_job_id(self, url):
        """Extract job ID from Indeed URL.

        Tries, in order: a ``jk=<id>`` query parameter (this also matches
        ``vjk=<id>``), a ``/jobs/<id>`` path segment, then the text after the
        last ``/``. Returns ``"manual_entry"`` when nothing usable is found or
        when ``url`` is not a string.
        """
        if not isinstance(url, str):
            return "manual_entry"

        # Pattern for Indeed URLs: /viewjob?jk=job_id
        match = re.search(r'jk=([a-zA-Z0-9]+)', url)
        if match:
            return match.group(1)
        # Alternative pattern
        match = re.search(r'/jobs/([^/?]+)', url)
        if match:
            return match.group(1)
        return url.split('/')[-1] or "manual_entry"

    def detect_remote_option(self, text):
        """Detect remote work options from text.

        Returns ``"Not specified"`` for empty input, otherwise one of
        ``"Hybrid"``, ``"Remote"`` or ``"On-site"``.

        Explicit hybrid phrases are checked before the remote keywords because
        some of them contain the word "remote" (for example "hybrid remote",
        "part remote") and would otherwise be reported as fully remote.
        "flexible" is only a weak hint and is consulted last.
        """
        if not text:
            return "Not specified"

        text_lower = text.lower()
        explicit_hybrid_keywords = ['hybrid', 'part remote', 'some remote']
        remote_keywords = ['remote', 'work from home', 'telecommute', 'virtual', 'distributed']
        weak_hybrid_keywords = ['flexible']

        if any(keyword in text_lower for keyword in explicit_hybrid_keywords):
            return "Hybrid"
        if any(keyword in text_lower for keyword in remote_keywords):
            return "Remote"
        if any(keyword in text_lower for keyword in weak_hybrid_keywords):
            return "Hybrid"

        return "On-site"

    def extract_salary_info(self, text):
        """Extract salary information from text.

        Recognises "£" amounts and bare amounts followed by a yearly marker
        ("per year", "per annum", "annually", "pa", "p.a."), either as a range
        or as a single figure. Amounts may be written with or without
        thousands separators and may carry a "k" suffix.

        Pass text that still contains the "£" sign; ``clean_text`` strips it.

        Returns ``"£low - £high"`` for a range, ``"£amount"`` for a single
        figure, or ``"Not specified"`` when nothing matches.
        """
        if not text:
            return "Not specified"

        for pattern in self._SALARY_PATTERNS:
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            amounts = match.groups()
            if len(amounts) >= 2:
                return f"£{amounts[0]} - £{amounts[1]}"
            return f"£{amounts[0]}"

        return "Not specified"

    def analyze_gendered_language(self, text):
        """Analyze gendered language in text.

        Returns a tuple ``(masculine_score, feminine_score, neutral_score,
        gendered_words_found)``. The scores are percentages of all words,
        rounded to two decimals; the list holds every hit in reading order,
        tagged ``"M:word"`` or ``"F:word"``. Empty input yields ``(0, 0, 0, [])``.

        Hyphenated compounds that appear in a word list (e.g. "self-reliant")
        count as a single word; any other compound is split into its parts.
        """
        if not text:
            return 0, 0, 0, []

        # Sets are rebuilt per call so edits to the public word lists are honoured.
        masculine = set(self.masculine_words)
        feminine = set(self.feminine_words)

        words = []
        for token in re.findall(r'\b[a-z]+(?:-[a-z]+)*\b', text.lower()):
            if '-' in token and token not in masculine and token not in feminine:
                words.extend(token.split('-'))
            else:
                words.append(token)

        masculine_count = 0
        feminine_count = 0
        gendered_words_found = []
        for word in words:
            is_masculine = word in masculine
            is_feminine = word in feminine
            if is_masculine:
                masculine_count += 1
            if is_feminine:
                feminine_count += 1
            if is_masculine:
                gendered_words_found.append(f"M:{word}")
            elif is_feminine:
                gendered_words_found.append(f"F:{word}")

        total_words = len(words)
        if total_words == 0:
            return 0, 0, 0, gendered_words_found

        masculine_score = round((masculine_count / total_words) * 100, 2)
        feminine_score = round((feminine_count / total_words) * 100, 2)
        neutral_score = round(100 - masculine_score - feminine_score, 2)

        return masculine_score, feminine_score, neutral_score, gendered_words_found

    def determine_job_level(self, title, description):
        """Determine job level from title and description.

        Returns ``"Senior"``, ``"Junior"`` or ``"Mid-level"``.

        Keywords are matched as whole words, so "internal", "leading" or
        "postgraduate" no longer trigger a level. The title is consulted first
        and wins when it carries a keyword; the description is only a
        fallback, because descriptions routinely mention other people's roles
        ("reporting to the senior manager").
        """
        for source in (title, description):
            source_text = str(source) if source else ""
            if re.search(self._SENIOR_PATTERN, source_text, re.IGNORECASE):
                return "Senior"
            if re.search(self._JUNIOR_PATTERN, source_text, re.IGNORECASE):
                return "Junior"

        return "Mid-level"

    def extract_indeed_specific_info(self, description):
        """Extract Indeed-specific information.

        Returns a dict with the keys ``job_type``, ``company_size`` and
        ``benefits`` (a ``'; '``-joined string, or ``'Not specified'``).

        All keywords are matched as whole words (allowing a plural ending) so
        that, for instance, "internal" is not read as "intern" and "laptop" is
        not read as "pto". ``None`` or empty input yields the defaults.
        """
        description = description or ""

        # Job type detection
        job_type = "Full-time"  # Default
        if re.search(r'\bpart.?time\b', description, re.IGNORECASE):
            job_type = "Part-time"
        elif re.search(r'\b(?:contract|temporary|temp)\b', description, re.IGNORECASE):
            job_type = "Contract"
        elif re.search(r'\b(?:internship|intern)s?\b', description, re.IGNORECASE):
            job_type = "Internship"

        # Company size detection
        company_size = "Not specified"
        if re.search(r'\bstart.?ups?\b', description, re.IGNORECASE):
            company_size = "Startup"
        elif re.search(r'\b(?:small business(?:es)?|smes?)\b', description, re.IGNORECASE):
            company_size = "Small"
        elif re.search(r'\b(?:large company|corporation|multinational)s?\b',
                       description, re.IGNORECASE):
            company_size = "Large"

        # Benefits detection
        benefit_keywords = [
            'pension', 'healthcare', 'dental', 'vision', 'insurance',
            'bonus', 'commission', 'stock options', 'equity',
            'vacation', 'holiday', 'pto', 'flexible hours',
            'training', 'development', 'education', 'tuition'
        ]

        description_lower = description.lower()
        benefits = [
            benefit for benefit in benefit_keywords
            # Optional "s"/"es" keeps plurals such as "holidays" and "bonuses".
            if re.search(r'\b' + re.escape(benefit) + r'(?:e?s)?\b', description_lower)
        ]

        return {
            'job_type': job_type,
            'company_size': company_size,
            'benefits': '; '.join(benefits) if benefits else 'Not specified'
        }

    def get_job_input(self, url):
        """Get job information through user input.

        Prompts on the console for each field of one posting and then reads a
        multi-line description that ends after two consecutive blank lines (or
        at end of input).

        Returns a dict with the keys ``job_title``, ``company_name``,
        ``location``, ``job_type``, ``salary_displayed``, ``posted_date``,
        ``company_rating`` and ``description``.
        """
        print("\n" + "="*60)
        print("INDEED UK JOB DATA ENTRY")
        print("="*60)
        print(f"URL: {url}")
        print("-"*60)

        # Basic information
        job_title = input("Job Title: ").strip()
        company_name = input("Company Name: ").strip()
        location = input("Location: ").strip()

        # Indeed-specific fields
        job_type = input("Job Type (Full-time/Part-time/Contract/Internship): ").strip()
        salary_displayed = input("Salary (as displayed on Indeed): ").strip()
        posted_date = input("Date Posted (e.g., '2 days ago'): ").strip()
        company_rating = input("Company Rating (if shown): ").strip()

        # Job description
        print("\nJob Description:")
        print("(Paste the full job description. Press Enter twice when finished)")
        print("-" * 40)

        description_lines = []
        empty_line_count = 0

        # A single blank line is kept as a paragraph break; two in a row end the input.
        while empty_line_count < 2:
            try:
                line = input()
            except EOFError:
                break
            if line.strip() == "":
                empty_line_count += 1
            else:
                empty_line_count = 0
            description_lines.append(line)

        description = '\n'.join(description_lines).strip()

        return {
            'job_title': job_title,
            'company_name': company_name,
            'location': location,
            'job_type': job_type,
            'salary_displayed': salary_displayed,
            'posted_date': posted_date,
            'company_rating': company_rating,
            'description': description
        }

    def process_job_data(self, input_data, url):
        """Process the manually entered job data.

        ``input_data`` is the dict produced by ``get_job_input``. Returns one
        flat record (dict) ready to be written as a spreadsheet row.

        The job type typed by the user takes precedence; the type guessed from
        the description is used only when that field was left blank. Salary is
        extracted from the raw description because cleaning removes the "£"
        sign that the salary patterns rely on.
        """
        raw_description = input_data['description']
        description_cleaned = self.clean_text(raw_description)

        # Extract Indeed-specific information
        indeed_info = self.extract_indeed_specific_info(description_cleaned)

        # Create job data structure
        job_data = {
            'job_id': self.extract_job_id(url),
            'platform': 'Indeed UK',
            'job_url': url,
            'scrape_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'job_title': input_data['job_title'],
            'company_name': input_data['company_name'],
            'location': input_data['location'],
            'job_type': input_data['job_type'] or indeed_info['job_type'],
            'salary_displayed': input_data['salary_displayed'],
            'posted_date': input_data['posted_date'],
            'company_rating': input_data['company_rating'],
            'remote_option': self.detect_remote_option(f"{input_data['location']} {description_cleaned}"),
            'job_description_raw': raw_description,
            'job_description_cleaned': description_cleaned,
            'salary_extracted': self.extract_salary_info(raw_description),
            'industry': 'Operations Research / Analytics',
            'job_level': self.determine_job_level(input_data['job_title'], description_cleaned),
            'word_count': len(description_cleaned.split()) if description_cleaned else 0,
            'manual_review_flag': False,
            'company_size': indeed_info['company_size'],
            'benefits': indeed_info['benefits'],
        }

        # Analyze gendered language
        masculine_score, feminine_score, neutral_score, gendered_words = \
            self.analyze_gendered_language(description_cleaned)

        job_data.update({
            'masculine_score': masculine_score,
            'feminine_score': feminine_score,
            'neutral_score': neutral_score,
            'gendered_words_found': '; '.join(gendered_words) if gendered_words else ''
        })

        # Set manual review flag
        if (job_data['word_count'] < 20 or
            not job_data['job_title'] or
            not job_data['company_name']):
            job_data['manual_review_flag'] = True

        return job_data

    def save_to_excel(self, job_data, output_path, append_mode=False):
        """Save job data to Excel file.

        Writes an ``Indeed_Jobs`` sheet with one row per job and a ``Summary``
        sheet with aggregate counts and average scores. With
        ``append_mode=True`` and an existing workbook, the new record is added
        to the rows already stored in it.

        Returns ``True`` on success and ``False`` on any failure (the error is
        printed). If the existing workbook cannot be read, nothing is written,
        so previously collected rows are never silently replaced.
        """
        try:
            # dirname is '' for a bare file name; os.makedirs('') would raise.
            target_dir = os.path.dirname(output_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            df_new = pd.DataFrame([job_data])

            if append_mode and os.path.exists(output_path):
                try:
                    df_existing = pd.read_excel(output_path)
                except Exception as read_error:
                    print(f"Error reading existing Excel file, nothing was written: {read_error}")
                    return False
                df_combined = pd.concat([df_existing, df_new], ignore_index=True)
            else:
                df_combined = df_new

            # Build the summary before opening the writer so that a failure
            # here cannot leave a half-written workbook behind.
            summary_data = {
                'Total Jobs': len(df_combined),
                'Job Types': df_combined['job_type'].value_counts().to_dict(),
                'Job Levels': df_combined['job_level'].value_counts().to_dict(),
                'Remote Options': df_combined['remote_option'].value_counts().to_dict(),
                'Avg Masculine Score': df_combined['masculine_score'].mean(),
                'Avg Feminine Score': df_combined['feminine_score'].mean()
            }
            summary_df = pd.DataFrame(list(summary_data.items()), columns=['Metric', 'Value'])

            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                df_combined.to_excel(writer, sheet_name='Indeed_Jobs', index=False)
                summary_df.to_excel(writer, sheet_name='Summary', index=False)

            print(f"\nData saved to: {output_path}")
            print(f"Total jobs in file: {len(df_combined)}")
            return True

        except Exception as e:
            print(f"Error saving to Excel: {e}")
            return False

def main(output_path=None):
    """Main function for Indeed job data entry.

    Runs an interactive loop: ask for a job URL, collect the posting's fields,
    show the derived results and append the record to the Excel workbook.
    The loop ends when no URL is entered or the user declines to continue.

    Args:
        output_path: Workbook to write to. When omitted, the original
            hard-coded dissertation path is used.
    """

    DEFAULT_OUTPUT_PATH = r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Codes\indeed_uk_jobs.xlsx"
    OUTPUT_PATH = output_path or DEFAULT_OUTPUT_PATH

    print("="*70)
    print("INDEED UK JOB DATA PROCESSOR")
    print("="*70)
    print("Process Indeed job postings for OR/Analytics positions in the UK")
    print("="*70)

    processor = IndeedJobProcessor()
    saved_count = 0  # successful saves in this session

    while True:
        print("\n" + "="*40)
        print("NEW INDEED JOB ENTRY")
        print("="*40)

        url = input("Enter Indeed job URL: ").strip()
        if not url:
            print("No URL provided. Exiting...")
            break

        input_data = processor.get_job_input(url)
        job_data = processor.process_job_data(input_data, url)

        # Display results
        print("\n" + "="*60)
        print("PROCESSING RESULTS")
        print("="*60)
        print(f"Job Title: {job_data['job_title']}")
        print(f"Company: {job_data['company_name']}")
        print(f"Location: {job_data['location']}")
        print(f"Job Type: {job_data['job_type']}")
        print(f"Remote Option: {job_data['remote_option']}")
        print(f"Job Level: {job_data['job_level']}")
        print(f"Salary: {job_data['salary_displayed']}")
        print(f"Word Count: {job_data['word_count']}")
        print(f"Masculine Score: {job_data['masculine_score']}%")
        print(f"Feminine Score: {job_data['feminine_score']}%")
        if job_data['gendered_words_found']:
            print(f"Gendered Words: {job_data['gendered_words_found']}")

        # Save to Excel; append whenever the workbook already exists
        append_mode = os.path.exists(OUTPUT_PATH)
        success = processor.save_to_excel(job_data, OUTPUT_PATH, append_mode)

        if success:
            saved_count += 1
            print("\n✅ Indeed job data saved successfully!")
        else:
            print("\n❌ Failed to save job data.")

        continue_choice = input("\nDo you want to add another Indeed job? (y/n): ").strip().lower()
        if continue_choice not in ['y', 'yes']:
            break

    # Only claim the data was saved when at least one save actually succeeded.
    if saved_count:
        print(f"\nAll Indeed job data saved to: {OUTPUT_PATH}")
    else:
        print("\nNo job data was saved in this session.")

# Start the interactive data-entry loop only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()

INDEED UK JOB DATA PROCESSOR
Process Indeed job postings for OR/Analytics positions in the UK

NEW INDEED JOB ENTRY


Enter Indeed job URL:  https://uk.indeed.com/jobs?q=business+intelligence&l=United+Kingdom&radius=25&start=20&vjk=977a498984e750c4&advn=6271557244739320



INDEED UK JOB DATA ENTRY
URL: https://uk.indeed.com/jobs?q=business+intelligence&l=United+Kingdom&radius=25&start=20&vjk=977a498984e750c4&advn=6271557244739320
------------------------------------------------------------


Job Title:  Data and Business Intelligence Manager
Company Name:  Global Banking School
Location:  Birmingham
Job Type (Full-time/Part-time/Contract/Internship):  
Salary (as displayed on Indeed):  
Date Posted (e.g., '2 days ago'):  
Company Rating (if shown):  



Job Description:
(Paste the full job description. Press Enter twice when finished)
----------------------------------------


 About Us: GBS is a higher education provider offering a range of sector-relevant courses across ten campuses in London, Birmingham, Leeds, and Manchester. Working in partnership with several of the UK’s leading higher education providers, we deliver vocational, undergraduate, and postgraduate programmes in finance, accounting, business, construction, tourism, healthcare, and more.  Our Vision: Changing lives through education.  What We Do: Working with internal stakeholders, you will use your expertise to provide easily accessible data reporting and business intelligence across GBS. In overseeing and managing key projects with internal stakeholders you will lead a team to understand business needs and data requirements to enable management of key performance indicators and improve identified processes and outcomes for GBS from end to end, empower colleagues to understand the key metrics that drive performance throughout the organisation.   Main Responsibilities About the Role:  Work c


PROCESSING RESULTS
Job Title: Data and Business Intelligence Manager
Company: Global Banking School
Location: Birmingham
Job Type: Internship
Remote Option: Hybrid
Job Level: Senior
Salary: 
Word Count: 579
Masculine Score: 1.21%
Feminine Score: 2.25%
Gendered Words: M:lead; F:team; F:understand; F:understand; M:drive; F:team; M:lead; F:assist; M:lead; F:understand; M:excel; F:communicate; F:team; M:analytical; F:share; F:team; M:direct; F:committed; F:collaborative; F:committed

Data saved to: C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Codes\indeed_uk_jobs.xlsx
Total jobs in file: 81

✅ Indeed job data saved successfully!
