In [7]:
import pandas as pd
import re
import os
from datetime import datetime

class JobDatasetFilter:
    def __init__(self):
        # Define relevant keywords for your research project
        self.relevant_keywords = {
            'operations_research': [
                'operations research', 'operational research', 'or analyst', 'or specialist',
                'operations analyst', 'operational analyst', 'management science',
                'optimization', 'linear programming', 'mathematical modeling',
                'supply chain optimization', 'logistics optimization'
            ],
            'data_science': [
                'data scientist', 'data science', 'machine learning', 'artificial intelligence',
                'ai researcher', 'ml engineer', 'data analyst', 'data engineer',
                'statistical analyst', 'quantitative analyst', 'business intelligence',
                'predictive analytics', 'data mining', 'big data'
            ],
            'analytics': [
                'business analyst', 'systems analyst', 'research analyst', 'financial analyst',
                'market research analyst', 'business intelligence analyst', 'reporting analyst',
                'performance analyst', 'strategy analyst', 'planning analyst',
                'decision analyst', 'process analyst'
            ],
            'quantitative_roles': [
                'quantitative', 'quant', 'statistician', 'econometrician', 'actuary',
                'risk analyst', 'credit analyst', 'investment analyst', 'portfolio analyst',
                'algorithmic', 'mathematical', 'computational', 'modeling'
            ],
            'consulting': [
                'management consultant', 'strategy consultant', 'business consultant',
                'operations consultant', 'analytics consultant', 'data consultant',
                'process improvement', 'lean six sigma', 'process optimization'
            ],
            'research_roles': [
                'research scientist', 'researcher', 'research associate', 'research analyst',
                'market researcher', 'user researcher', 'policy researcher',
                'academic researcher', 'research engineer'
            ]
        }
        
        # Combine all keywords into one list for easier searching
        self.all_keywords = []
        for category, keywords in self.relevant_keywords.items():
            self.all_keywords.extend(keywords)
        
        # Industry keywords that are relevant
        self.relevant_industries = [
            'consulting', 'technology', 'finance', 'banking', 'insurance', 'healthcare',
            'pharmaceuticals', 'telecommunications', 'logistics', 'supply chain',
            'manufacturing', 'automotive', 'aerospace', 'defense', 'energy',
            'government', 'public sector', 'research', 'academia', 'university'
        ]
        
        # Keywords to exclude (irrelevant jobs) - ENHANCED VERSION
        self.exclude_keywords = [
            # Vehicle/Automotive
            'vehicle technician', 'motor technician', 'automotive technician', 'lcv technician',
            'mot tester', 'car sales', 'vehicle sales', 'automotive sales',
            
            # Manufacturing/Production 
            'production supervisor', 'production manager', 'production engineer', 'production team leader',
            'production welder', 'production planner', 'manufacturing supervisor', 'manufacturing engineer',
            'manufacturing specialist', 'assembly line', 'factory supervisor', 'plant manager',
            
            # CNC/Machining
            'cnc machinist', 'cnc setter', 'cnc programmer', 'cnc operator', 'cnc miller',
            'machinist', 'machine operator', 'machine minder', 'centreless grinder',
            'tooling engineer', 'skilled finishing operative',
            
            # Legal (non-analytical)
            'paralegal', 'solicitor', 'lawyer', 'litigation', 'conveyancer', 'legal secretary',
            'legal administrator', 'legal executive', 'legal assistant', 'court clerk',
            
            # Basic Technical/Service
            'service technician', 'field technician', 'maintenance technician', 'repair technician',
            'installation technician', 'support technician', 'field engineer', 'service engineer',
            'help desk', 'it support', 'desktop support', 'technical support',
            
            # Manual/Operational roles
            'welder', 'grinder', 'operative', 'packer', 'picker', 'despatch', 'goods in',
            'warehouse', 'forklift', 'driver', 'delivery', 'courier', 'cleaner',
            'security', 'caretaker', 'porter', 'stores', 'dispatch',
            
            # Basic Admin/Customer Service
            'administrator', 'admin assistant', 'office administrator', 'data entry',
            'customer service', 'call centre', 'call center', 'customer advisor',
            'customer service advisor', 'receptionist', 'sales assistant', 'shop assistant',
            
            # Recruitment (non-analytical)
            'trainee recruitment consultant', 'recruitment consultant', 'recruitment administrator',
            'recruitment coordinator', 'resourcer', 'talent acquisition coordinator',
            'graduate recruitment consultant', 'recruitment advisor', 'recruitment researcher',
            
            # Healthcare (non-analytical)
            'care worker', 'care assistant', 'healthcare assistant', 'support worker',
            'nursing', 'nurse', 'physiotherapist', 'occupational therapist',
            'medical technician', 'laboratory technician', 'clinical technician',
            
            # Education (basic)
            'teacher', 'teaching assistant', 'tutor', 'instructor', 'trainer',
            'nursery teacher', 'childcare', 'education assistant',
            
            # Sales (basic)
            'sales manager', 'sales executive', 'sales advisor', 'sales representative',
            'account executive', 'business development executive', 'telesales',
            'telemarketing', 'cold calling'
        ]
        
        # Additional strict exclusion patterns for job titles
        self.strict_exclude_patterns = [
            # Vehicle/Transport
            r'\bvehicle technician\b', r'\bmotor technician\b', r'\blcv technician\b', r'\bmot tester\b',
            
            # Manufacturing/Production
            r'\bproduction\s+(supervisor|manager|engineer|leader|welder|planner)\b',
            r'\bmanufacturing\s+(supervisor|engineer|specialist)\b', r'\bassembly line\b',
            
            # CNC/Machining
            r'\bcnc\s+(machinist|setter|programmer|operator|miller)\b', r'\bmachine\s+(operator|minder)\b',
            
            # Legal
            r'\bparalegal\b', r'\bsolicitor\b', r'\blawyer\b', r'\blegal\s+(secretary|administrator|assistant)\b',
            
            # Basic Technical
            r'\bservice technician\b', r'\bfield technician\b', r'\bmaintenance technician\b',
            r'\bhelp desk\b', r'\bit support\b', r'\bdesktop support\b',
            
            # Manual work
            r'\bwelder\b', r'\bgrinder\b', r'\boperative\b', r'\bpacker\b', r'\bpicker\b',
            r'\bwarehouse\b', r'\bdriver\b', r'\bdelivery\b', r'\bcleaner\b',
            
            # Basic admin
            r'\badmin\s+(assistant|coordinator)\b', r'\bdata entry\b', r'\bcustomer service\b',
            r'\bcall centre\b', r'\bcall center\b', r'\breceptionist\b',
            
            # Recruitment
            r'\btrainee recruitment consultant\b', r'\brecruitment consultant\b',
            r'\brecruitment\s+(administrator|coordinator|advisor)\b',
            
            # Healthcare
            r'\bcare\s+(worker|assistant)\b', r'\bhealthcare assistant\b', r'\bnursing\b',
            
            # Education
            r'\bteacher\b', r'\btutor\b', r'\bteaching assistant\b',
            
            # Basic sales
            r'\bsales\s+(manager|executive|advisor|representative)\b', r'\btelesales\b', r'\btelemarketing\b'
        ]

    def load_dataset(self, file_path):
        """Load the dataset from various file formats"""
        try:
            file_extension = os.path.splitext(file_path)[1].lower()
            
            if file_extension == '.csv':
                # Try different encodings and separators
                encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16', 'utf-8-sig']
                separators = [',', ';', '\t']
                
                for encoding in encodings:
                    for sep in separators:
                        try:
                            print(f"Trying encoding: {encoding} with separator: '{sep}'")
                            df = pd.read_csv(file_path, encoding=encoding, sep=sep, on_bad_lines='skip')
                            
                            # Check if we got a reasonable number of columns
                            if len(df.columns) >= 5:  # Expecting at least 5 columns based on your description
                                print(f"Successfully loaded CSV with {encoding} encoding and '{sep}' separator")
                                print(f"Columns found: {list(df.columns)}")
                                return df
                        except Exception as e:
                            print(f"Failed with {encoding}/{sep}: {str(e)[:50]}...")
                            continue
                
                # If all encodings fail, try with error handling
                print("Trying with error handling...")
                try:
                    df = pd.read_csv(file_path, encoding='utf-8', errors='ignore', on_bad_lines='skip')
                    print("Loaded with error handling (some characters may be missing)")
                    return df
                except:
                    try:
                        df = pd.read_csv(file_path, encoding='latin-1', errors='ignore', on_bad_lines='skip')
                        print("Loaded with latin-1 and error handling")
                        return df
                    except Exception as e:
                        raise Exception(f"Could not read CSV with any method: {e}")
                    
            elif file_extension in ['.xlsx', '.xls']:
                df = pd.read_excel(file_path)
                
            elif file_extension == '.json':
                df = pd.read_json(file_path)
                
            else:
                raise Exception(f"Unsupported file format: {file_extension}")
            
            print(f"Dataset loaded successfully!")
            print(f"Shape: {df.shape}")
            print(f"Columns: {list(df.columns)}")
            
            return df
            
        except Exception as e:
            print(f"Error loading dataset: {e}")
            print("\nTroubleshooting suggestions:")
            print("1. Check if the file is not corrupted")
            print("2. Try opening the file in Excel/text editor to check format")
            print("3. Make sure the file is not being used by another program")
            print("4. Try converting the file to UTF-8 encoding in a text editor")
            return None

    def analyze_dataset_structure(self, df):
        """Analyze the structure of the dataset"""
        print("\n" + "="*60)
        print("DATASET ANALYSIS")
        print("="*60)
        
        print(f"Total rows: {len(df):,}")
        print(f"Total columns: {len(df.columns)}")
        
        print("\nColumn Information:")
        for i, col in enumerate(df.columns):
            non_null_count = df[col].count()
            sample_values = df[col].dropna().head(3).tolist()
            print(f"{i+1:2}. {col}")
            print(f"    Non-null: {non_null_count:,} ({non_null_count/len(df)*100:.1f}%)")
            print(f"    Sample: {sample_values}")
        
        print("\nFirst few rows:")
        print(df.head())
        
        return True

    def identify_text_columns(self, df):
        """Identify which columns contain job titles and descriptions"""
        text_columns = {
            'title': None,
            'description': None,
            'company': None,
            'location': None,
            'requirements': None,
            'category': None
        }
        
        # Specific mapping for your dataset
        column_mapping = {
            'job_title': 'title',
            'job_description': 'description', 
            'company_name': 'company',
            'city': 'location',
            'job_requirements': 'requirements',
            'category': 'category'
        }
        
        for col in df.columns:
            if col in column_mapping:
                text_columns[column_mapping[col]] = col
        
        print("\n" + "="*60)
        print("IDENTIFIED COLUMNS")
        print("="*60)
        for key, value in text_columns.items():
            if value:
                print(f"{key.title()}: {value}")
        
        return text_columns

    def manual_column_selection(self, df):
        """Allow manual selection of columns if auto-detection fails"""
        print("\n" + "="*60)
        print("MANUAL COLUMN SELECTION")
        print("="*60)
        
        columns = list(df.columns)
        for i, col in enumerate(columns):
            print(f"{i+1:2}. {col}")
        
        text_columns = {}
        
        # Get job title column
        while True:
            try:
                choice = input("\nEnter number for JOB TITLE column (or press Enter to skip): ").strip()
                if choice == "":
                    text_columns['title'] = None
                    break
                idx = int(choice) - 1
                if 0 <= idx < len(columns):
                    text_columns['title'] = columns[idx]
                    break
                else:
                    print("Invalid number. Please try again.")
            except ValueError:
                print("Please enter a valid number.")
        
        # Get job description column
        while True:
            try:
                choice = input("Enter number for JOB DESCRIPTION column (or press Enter to skip): ").strip()
                if choice == "":
                    text_columns['description'] = None
                    break
                idx = int(choice) - 1
                if 0 <= idx < len(columns):
                    text_columns['description'] = columns[idx]
                    break
                else:
                    print("Invalid number. Please try again.")
            except ValueError:
                print("Please enter a valid number.")
        
        # Get company column
        while True:
            try:
                choice = input("Enter number for COMPANY column (or press Enter to skip): ").strip()
                if choice == "":
                    text_columns['company'] = None
                    break
                idx = int(choice) - 1
                if 0 <= idx < len(columns):
                    text_columns['company'] = columns[idx]
                    break
                else:
                    print("Invalid number. Please try again.")
            except ValueError:
                print("Please enter a valid number.")
        
        # Get location column
        while True:
            try:
                choice = input("Enter number for LOCATION column (or press Enter to skip): ").strip()
                if choice == "":
                    text_columns['location'] = None
                    break
                idx = int(choice) - 1
                if 0 <= idx < len(columns):
                    text_columns['location'] = columns[idx]
                    break
                else:
                    print("Invalid number. Please try again.")
            except ValueError:
                print("Please enter a valid number.")
        
        return text_columns

    def check_keyword_match(self, text, keywords, exclude_keywords=None):
        """Check if text contains any of the keywords with enhanced filtering"""
        if not text or pd.isna(text):
            return False, []
        
        text_lower = str(text).lower()
        
        # Priority inclusion list - these should ALWAYS be included even if they match exclude patterns
        priority_include = [
            'data scientist', 'data analyst', 'business analyst', 'research analyst',
            'quantitative analyst', 'financial analyst', 'credit analyst', 'risk analyst',
            'operations analyst', 'business intelligence', 'machine learning', 'statistician',
            'econometrician', 'actuarial analyst', 'modelling', 'optimization',
            'operations research', 'management science', 'decision science'
        ]
        
        # Check for priority inclusion first
        for priority_term in priority_include:
            if priority_term in text_lower:
                return True, [f"PRIORITY: {priority_term}"]
        
        # Check for strict exclude patterns (using regex for exact matches)
        for pattern in self.strict_exclude_patterns:
            if re.search(pattern, text_lower):
                matched_word = re.search(pattern, text_lower).group()
                return False, [f"EXCLUDED: {matched_word}"]
        
        # Check for exclude keywords
        if exclude_keywords:
            for exclude_word in exclude_keywords:
                if exclude_word.lower() in text_lower:
                    return False, [f"EXCLUDED: {exclude_word}"]
        
        # Check for relevant keywords
        found_keywords = []
        for keyword in keywords:
            if keyword.lower() in text_lower:
                found_keywords.append(keyword)
        
        return len(found_keywords) > 0, found_keywords

    def filter_relevant_jobs(self, df, text_columns, strict_mode=False):
        """Filter the dataset for relevant jobs"""
        print("\n" + "="*60)
        print("FILTERING JOBS")
        print("="*60)
        
        relevant_jobs = []
        total_jobs = len(df)
        
        for idx, row in df.iterrows():
            if idx % 5000 == 0:
                print(f"Processed {idx:,} of {total_jobs:,} jobs...")
            
            # Combine title, description, requirements, and category for searching
            search_text = ""
            found_keywords = []
            
            if text_columns['title'] and not pd.isna(row[text_columns['title']]):
                search_text += str(row[text_columns['title']]) + " "
            
            if text_columns['description'] and not pd.isna(row[text_columns['description']]):
                search_text += str(row[text_columns['description']]) + " "
            
            if text_columns['requirements'] and not pd.isna(row[text_columns['requirements']]):
                search_text += str(row[text_columns['requirements']]) + " "
                
            if text_columns['category'] and not pd.isna(row[text_columns['category']]):
                search_text += str(row[text_columns['category']]) + " "
            
            if text_columns['company'] and not pd.isna(row[text_columns['company']]):
                search_text += str(row[text_columns['company']]) + " "
            
            # Check for relevance
            is_relevant, keywords_found = self.check_keyword_match(
                search_text, self.all_keywords, self.exclude_keywords
            )
            
            if is_relevant:
                # Add additional information to the row
                row_dict = row.to_dict()
                row_dict['keywords_found'] = '; '.join(keywords_found)
                row_dict['filter_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                relevant_jobs.append(row_dict)
        
        print(f"\nFiltering complete!")
        print(f"Original jobs: {total_jobs:,}")
        print(f"Relevant jobs found: {len(relevant_jobs):,}")
        print(f"Percentage relevant: {len(relevant_jobs)/total_jobs*100:.2f}%")
        
        return pd.DataFrame(relevant_jobs)

    def save_filtered_data(self, filtered_df, output_path):
        """Write *filtered_df* to an Excel workbook at *output_path*.

        The data goes to a sheet named ``Relevant_Jobs`` without the index.
        The parent directory is created when the path has one.

        Args:
            filtered_df: DataFrame to save.
            output_path: Destination ``.xlsx`` path.

        Returns:
            ``True`` on success, ``False`` if anything failed (the error is
            printed).
        """
        try:
            # Ensure output directory exists. A bare filename has an empty
            # dirname, and os.makedirs('') raises, so only create a directory
            # when the path actually names one.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # Save to Excel
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                filtered_df.to_excel(writer, sheet_name='Relevant_Jobs', index=False)

            print(f"\nFiltered data saved to: {output_path}")
            return True

        except Exception as e:
            print(f"Error saving filtered data: {e}")
            return False

    def show_sample_results(self, filtered_df, text_columns, n_samples=10):
        """Show sample results from filtering"""
        print("\n" + "="*60)
        print("SAMPLE FILTERED RESULTS")
        print("="*60)
        
        if len(filtered_df) == 0:
            print("No relevant jobs found!")
            return
        
        sample_df = filtered_df.head(n_samples)
        
        for idx, row in sample_df.iterrows():
            print(f"\nJob {idx + 1}:")
            print("-" * 30)
            if text_columns['title']:
                print(f"Title: {row[text_columns['title']]}")
            if text_columns['company']:
                print(f"Company: {row[text_columns['company']]}")
            if text_columns['location']:
                print(f"Location: {row[text_columns['location']]}")
            if text_columns['category']:
                print(f"Category: {row[text_columns['category']]}")
            if 'keywords_found' in row:
                print(f"Keywords Found: {row['keywords_found']}")
        
        # Ask user if they want to add more exclusions
        print(f"\n" + "="*60)
        print("QUALITY CHECK")
        print("="*60)
        print("Review the sample results above. Do you see any irrelevant jobs?")
        print("If yes, you can add custom exclusions to improve the filtering.")
        
        add_exclusions = input("\nDo you want to add custom exclusion keywords? (y/n): ").strip().lower()
        
        if add_exclusions in ['y', 'yes']:
            print("\nEnter keywords to exclude (separated by commas):")
            print("Example: sales rep, customer service, admin clerk")
            custom_exclusions = input("Keywords to exclude: ").strip()
            
            if custom_exclusions:
                new_exclusions = [keyword.strip() for keyword in custom_exclusions.split(',')]
                return new_exclusions
        
        return None

def main(input_path=None, output_path=None):
    """Run the interactive job-filtering pipeline.

    Steps: load the dataset, print its structure, detect the text columns,
    filter for relevant jobs, show a sample and optionally re-filter with
    user-supplied exclusion keywords (repeated for as long as the user keeps
    adding exclusions), then save the result to Excel and print a summary.

    Args:
        input_path: Dataset to read. Defaults to the pre-configured
            "50K jobs.xlsx" location.
        output_path: Excel file to write. Defaults to the pre-configured
            "50K_jobs_filtered.xlsx" location.

    Returns:
        None. Progress and results are reported on stdout.
    """
    # Pre-configured paths for your Excel dataset
    if input_path is None:
        input_path = r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\50K jobs.xlsx"
    if output_path is None:
        output_path = r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\50K_jobs_filtered.xlsx"

    print("="*70)
    print("UK JOBS DATASET FILTER - RELEVANT JOBS EXTRACTOR")
    print("="*70)
    print("This tool filters large job datasets to find relevant positions")
    print("for Operations Research, Data Science, and Analytics roles.")
    print("="*70)

    print(f"Input file: {input_path}")
    print(f"Output file: {output_path}")

    # Check if input file exists before doing any other work. The hint is
    # derived from the actual path so it stays accurate when the path changes.
    if not os.path.exists(input_path):
        print("Input file not found! Please check the path.")
        print(f"Looking for: {os.path.basename(input_path)}")
        print(f"In folder: {os.path.dirname(input_path) or '.'}")
        return

    # Initialize filter
    filter_tool = JobDatasetFilter()

    # Load dataset
    print("\nLoading Excel dataset...")
    print("This may take a moment for large files...")
    df = filter_tool.load_dataset(input_path)
    if df is None:
        return

    # Analyze dataset structure
    filter_tool.analyze_dataset_structure(df)

    # Identify text columns (pre-configured for your dataset)
    text_columns = filter_tool.identify_text_columns(df)

    # Show what will be searched
    print("\nColumns that will be searched for relevant keywords:")
    for key, col in text_columns.items():
        if col:
            print(f"  {key.title()}: {col}")

    # Filter relevant jobs
    print(f"\nStarting to filter {len(df):,} jobs...")
    print("This may take a few minutes for large datasets...")

    filtered_df = filter_tool.filter_relevant_jobs(df, text_columns)

    # Show sample results and get custom exclusions
    custom_exclusions = filter_tool.show_sample_results(filtered_df, text_columns)

    # Re-filter for as long as the user keeps supplying exclusions, so an
    # answer given after a re-filter is applied rather than discarded.
    while custom_exclusions:
        print(f"\nRe-filtering with custom exclusions: {custom_exclusions}")
        filter_tool.exclude_keywords.extend(custom_exclusions)
        filtered_df = filter_tool.filter_relevant_jobs(df, text_columns)
        print("\nRe-filtering complete!")

        # Show new sample
        custom_exclusions = filter_tool.show_sample_results(filtered_df, text_columns, n_samples=5)

    # Save filtered data
    if len(filtered_df) == 0:
        print("\nNo relevant jobs found with current criteria.")
        print("You may need to adjust the keywords or filtering criteria.")
        return

    success = filter_tool.save_filtered_data(filtered_df, output_path)
    if not success:
        print("Failed to save filtered data.")
        return

    print("\n" + "="*70)
    print("FILTERING COMPLETED SUCCESSFULLY!")
    print("="*70)
    print(f"Original dataset: {len(df):,} jobs")
    print(f"Filtered dataset: {len(filtered_df):,} relevant jobs")
    print(f"Reduction: {(1 - len(filtered_df)/len(df))*100:.1f}%")
    print(f"Saved to: {output_path}")

    # Show category breakdown if available
    category_column = text_columns.get('category')
    if category_column and category_column in filtered_df.columns:
        print("\nTop job categories in filtered results:")
        category_counts = filtered_df[category_column].value_counts().head(10)
        for category, count in category_counts.items():
            print(f"  {category}: {count}")

# Run the interactive pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

UK JOBS DATASET FILTER - RELEVANT JOBS EXTRACTOR
This tool filters large job datasets to find relevant positions
for Operations Research, Data Science, and Analytics roles.
Input file: C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\50K jobs_filtered.xlsx
Output file: C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\50K_jobs_fil.xlsx
Input file not found! Please check the path.
Looking for: 50K jobs.xlsx
In folder: Dataset
