In [9]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import os

class FinalORJobLexiconAnalyzer:
    def __init__(self):
        """Initialize with comprehensive gendered word lexicons matching your survey"""
        
        # Masculine-coded words (from your survey and Gaucher et al. 2011)
        self.masculine_words = [
            # Core masculine terms from survey
            'competitive', 'lead', 'result-oriented', 'analytical', 'objective',
            'independent', 'autonomous', 'driven', 'decision-making', 'confident',
            'self-sufficient',
            
            # Extended masculine terms from your other datasets
            'leader', 'dominant', 'assertive', 'aggressive', 'ambitious',
            'decisive', 'determined', 'strong', 'superior', 'manage', 'direct', 
            'control', 'drive', 'challenge', 'compete', 'win', 'achieve', 
            'dominate', 'excel', 'individual', 'hierarchy', 'decision', 
            'responsibility', 'active', 'adventurous', 'athletic', 'battle', 
            'boast', 'champion', 'courageous', 'defend', 'determine', 'fearless',
            'fight', 'force', 'greedy', 'headstrong', 'hostile', 'impulsive',
            'intellect', 'logic', 'masculine', 'outgoing', 'outspoken', 'persist',
            'principle', 'reckless', 'self-confident', 'self-reliant', 'strength',
            'stubborn', 'unreasonable', 'winner', 'results-driven', 'performance',
            'efficiency', 'optimization', 'strategic', 'tactical', 'execution',
            'implementation', 'delivery', 'targets', 'metrics'
        ]
        
        # Feminine-coded words (from your survey and research)
        self.feminine_words = [
            # Core feminine terms from survey
            'inclusive', 'collaborate', 'responsive', 'nurturing', 'empathetic',
            'compassionate', 'ambitious', 'motivated', 'team-player', 'interpersonal-skills',
            'support',
            
            # Extended feminine terms from your other datasets
            'collaborative', 'cooperative', 'supportive', 'communicate', 'understand', 
            'responsible', 'connect', 'honest', 'loyal', 'dependable', 'committed', 
            'dedicated', 'help', 'assist', 'care', 'share', 'together', 'team', 
            'community', 'relationship', 'trust', 'warm', 'kind', 'agree', 
            'affectionate', 'child', 'cheer', 'collab', 'commit', 'communal', 
            'compassion', 'considerate', 'cooperate', 'co-operate', 'depend',
            'emotional', 'empathy', 'feel', 'feeling', 'feminine', 'flatterable',
            'gentle', 'interdependent', 'interpersonal', 'intimate', 'kinship',
            'modesty', 'nag', 'nice', 'nurture', 'pleasant', 'polite', 'quiet',
            'sensitive', 'submissive', 'sympathy', 'tender', 'whiny', 'yield',
            'caring', 'helping', 'communication', 'stakeholder', 'facilitation',
            'coordination', 'consultation', 'liaison', 'partnership', 'consensus',
            'engagement'
        ]
        
        # Fast-paced/High-performing terms (can be considered masculine-leaning)
        self.fast_paced_words = [
            'fast-paced', 'high-performing', 'fast', 'rapid', 'quick', 'speed',
            'urgent', 'immediate', 'accelerated', 'dynamic', 'intense'
        ]
        
        # Add fast-paced terms to masculine list
        self.masculine_words.extend(self.fast_paced_words)

    def clean_text(self, text):
        """Clean and normalize text for analysis"""
        if pd.isna(text) or text is None:
            return ""
        
        # Convert to string and lowercase
        text = str(text).lower()
        
        # Handle compound words and hyphenated terms
        text = text.replace('-', ' ')
        text = text.replace('_', ' ')
        
        # Remove extra whitespace and normalize
        text = ' '.join(text.split())
        
        # Remove special characters but keep important punctuation
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        
        return text.strip()

    def calculate_lexicon_scores(self, text):
        """Calculate gendered language scores using lexicon approach"""
        if not text:
            return {
                'masculine_count': 0,
                'feminine_count': 0,
                'masculine_score': 0.0,
                'feminine_score': 0.0,
                'neutral_score': 100.0,
                'gender_ratio': 0.0,
                'gender_classification': 'Neutral',
                'total_words': 0,
                'gendered_words_found': ''
            }
        
        # Clean and tokenize
        clean_text = self.clean_text(text)
        words = re.findall(r'\b[a-z]+\b', clean_text)
        total_words = len(words)
        
        if total_words == 0:
            return {
                'masculine_count': 0,
                'feminine_count': 0,
                'masculine_score': 0.0,
                'feminine_score': 0.0,
                'neutral_score': 100.0,
                'gender_ratio': 0.0,
                'gender_classification': 'Neutral',
                'total_words': 0,
                'gendered_words_found': ''
            }
        
        # Count gendered words
        masculine_words_found = []
        feminine_words_found = []
        
        for word in words:
            if word in self.masculine_words:
                masculine_words_found.append(word)
            elif word in self.feminine_words:
                feminine_words_found.append(word)
        
        masculine_count = len(masculine_words_found)
        feminine_count = len(feminine_words_found)
        
        # Calculate scores as percentage of total words
        masculine_score = round((masculine_count / total_words) * 100, 2)
        feminine_score = round((feminine_count / total_words) * 100, 2)
        neutral_score = round(100 - masculine_score - feminine_score, 2)
        
        # Calculate gender ratio (masculine - feminine)
        gender_ratio = round(masculine_score - feminine_score, 2)
        
        # Classify based on threshold (following academic literature)
        if gender_ratio > 1.0:
            classification = 'Masculine-coded'
        elif gender_ratio < -1.0:
            classification = 'Feminine-coded'
        else:
            classification = 'Neutral'
        
        # Create summary of found words
        gendered_words_summary = []
        if masculine_words_found:
            unique_masc = list(set(masculine_words_found))
            gendered_words_summary.append(f"M: {', '.join(unique_masc)}")
        if feminine_words_found:
            unique_fem = list(set(feminine_words_found))
            gendered_words_summary.append(f"F: {', '.join(unique_fem)}")
        
        return {
            'masculine_count': masculine_count,
            'feminine_count': feminine_count,
            'masculine_score': masculine_score,
            'feminine_score': feminine_score,
            'neutral_score': neutral_score,
            'gender_ratio': gender_ratio,
            'gender_classification': classification,
            'total_words': total_words,
            'gendered_words_found': ' | '.join(gendered_words_summary)
        }

    def read_dataset(self, file_path, dataset_name):
        """Read dataset with proper encoding handling"""
        
        print(f"Reading {dataset_name}...")
        print(f"File: {file_path}")
        
        try:
            if not os.path.exists(file_path):
                print(f"⚠️  File not found: {file_path}")
                return None
            
            # Determine file type and read accordingly
            if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
                df = pd.read_excel(file_path, engine='openpyxl')
            else:
                # Try different encodings for CSV
                encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
                df = None
                
                for encoding in encodings:
                    try:
                        df = pd.read_csv(file_path, encoding=encoding, low_memory=False)
                        print(f"  Successfully read with {encoding} encoding")
                        break
                    except:
                        continue
                
                if df is None:
                    raise ValueError(f"Could not read {file_path} with any encoding")
            
            print(f"✅ Successfully read {dataset_name}!")
            print(f"   Shape: {df.shape}")
            print(f"   Columns: {df.columns.tolist()}")
            
            # Add dataset identifier
            df['dataset_source'] = dataset_name
            
            return df
            
        except Exception as e:
            print(f"❌ Error reading {dataset_name}: {e}")
            return None

    def standardize_complete_dataset(self, df):
        """Standardize the Complete Project Dataset"""
        
        # Create standardized columns
        standardized_df = df.copy()
        
        # Map columns to standard names
        column_mapping = {
            'job_title': 'job_title',
            'company_name': 'company_name', 
            'city': 'location',
            'category': 'category',
            'job_board_description': 'job_description',
            'requirements': 'requirements',
            'post_date': 'posting_date',
            'salary_offered': 'salary_info'
        }
        
        # Apply mappings where columns exist
        for old_col, new_col in column_mapping.items():
            if old_col in df.columns and old_col != new_col:
                standardized_df = standardized_df.rename(columns={old_col: new_col})
        
        # Create combined text for analysis
        text_columns = ['job_title', 'job_description', 'requirements']
        combined_text = []
        
        for idx, row in standardized_df.iterrows():
            text_parts = []
            
            for col in text_columns:
                if col in standardized_df.columns and pd.notna(row[col]):
                    text_parts.append(str(row[col]))
            
            combined_text.append(' '.join(text_parts))
        
        standardized_df['combined_text_for_analysis'] = combined_text
        
        return standardized_df

    def standardize_other_datasets(self, df, dataset_name):
        """Standardize other datasets to match Complete Project format"""
        
        standardized_df = df.copy()
        
        # Common standardization based on dataset type
        if dataset_name == 'Indeed_UK':
            if 'job_description_cleaned' in df.columns:
                standardized_df['combined_text_for_analysis'] = df['job_description_cleaned']
            elif 'job_description_raw' in df.columns:
                standardized_df['combined_text_for_analysis'] = df['job_description_raw']
        
        elif dataset_name == 'OR_Society':
            if 'job_description_cleaned' in df.columns:
                standardized_df['combined_text_for_analysis'] = df['job_description_cleaned']
            elif 'job_description_raw' in df.columns:
                standardized_df['combined_text_for_analysis'] = df['job_description_raw']
        
        elif dataset_name == 'Jobs_AC_UK':
            if 'job_description_cleaned' in df.columns:
                standardized_df['combined_text_for_analysis'] = df['job_description_cleaned']
            elif 'job_description_raw' in df.columns:
                standardized_df['combined_text_for_analysis'] = df['job_description_raw']
        
        elif dataset_name == 'LinkedIn_Jobs':
            if 'job_description_cleaned' in df.columns:
                standardized_df['combined_text_for_analysis'] = df['job_description_cleaned']
            elif 'job_description_raw' in df.columns:
                standardized_df['combined_text_for_analysis'] = df['job_description_raw']
        
        # If no specific text column found, try to create one
        if 'combined_text_for_analysis' not in standardized_df.columns:
            # Look for common text columns
            text_cols = ['job_title', 'description', 'job_description', 'summary']
            combined_text = []
            
            for idx, row in standardized_df.iterrows():
                text_parts = []
                for col in text_cols:
                    if col in standardized_df.columns and pd.notna(row[col]):
                        text_parts.append(str(row[col]))
                combined_text.append(' '.join(text_parts))
            
            standardized_df['combined_text_for_analysis'] = combined_text
        
        return standardized_df

    def add_lexicon_analysis_to_dataset(self, df, dataset_name):
        """Add lexicon analysis to any dataset"""
        
        print(f"\nAdding lexicon analysis to {dataset_name}...")
        
        # Ensure we have text to analyze
        if 'combined_text_for_analysis' not in df.columns:
            print(f"⚠️  No analysis text found for {dataset_name}")
            return df
        
        # Apply lexicon analysis
        print(f"  Calculating lexicon scores...")
        
        lexicon_results = df['combined_text_for_analysis'].fillna('').apply(self.calculate_lexicon_scores)
        
        # Extract results into separate columns
        for key in ['masculine_count', 'feminine_count', 'masculine_score', 
                   'feminine_score', 'neutral_score', 'gender_ratio', 
                   'gender_classification', 'total_words', 'gendered_words_found']:
            df[f'lexicon_{key}'] = [result[key] for result in lexicon_results]
        
        # Add analysis metadata
        df['analysis_date'] = datetime.now().strftime('%Y-%m-%d')
        df['lexicon_version'] = 'Gaucher_2011_Extended_Survey_Based'
        
        print(f"✅ Lexicon analysis added to {dataset_name}")
        
        return df

    def combine_all_datasets(self, dataset_paths):
        """Combine all 5 datasets with lexicon analysis"""
        
        print("="*70)
        print("COMBINING ALL OR JOB DATASETS WITH LEXICON ANALYSIS")
        print("="*70)
        
        processed_datasets = []
        
        for dataset_name, file_path in dataset_paths.items():
            # Read dataset
            df = self.read_dataset(file_path, dataset_name)
            
            if df is not None:
                # Standardize based on dataset type
                if dataset_name == 'Complete_Project_Dataset':
                    df_standardized = self.standardize_complete_dataset(df)
                else:
                    df_standardized = self.standardize_other_datasets(df, dataset_name)
                
                # Add lexicon analysis
                df_with_lexicon = self.add_lexicon_analysis_to_dataset(df_standardized, dataset_name)
                
                processed_datasets.append(df_with_lexicon)
                print(f"✅ Processed {dataset_name}: {len(df_with_lexicon)} jobs")
            else:
                print(f"❌ Skipped {dataset_name}: Could not read file")
        
        if not processed_datasets:
            raise ValueError("No datasets could be processed")
        
        # Get all unique columns across datasets
        all_columns = set()
        for df in processed_datasets:
            all_columns.update(df.columns)
        
        # Ensure all dataframes have the same columns (fill missing with None)
        for i, df in enumerate(processed_datasets):
            for col in all_columns:
                if col not in df.columns:
                    df[col] = None
            processed_datasets[i] = df[list(all_columns)]
        
        # Combine all datasets
        print(f"\nCombining {len(processed_datasets)} datasets...")
        combined_df = pd.concat(processed_datasets, ignore_index=True, sort=False)
        
        print(f"✅ Combined dataset shape: {combined_df.shape}")
        
        return combined_df

    def generate_comprehensive_analysis(self, df):
        """Generate comprehensive analysis of the combined dataset"""
        
        print("\n" + "="*70)
        print("COMPREHENSIVE COMBINED DATASET ANALYSIS")
        print("="*70)
        
        # Basic statistics
        total_jobs = len(df)
        print(f"📊 Total jobs analyzed: {total_jobs:,}")
        
        # Dataset source breakdown
        if 'dataset_source' in df.columns:
            print(f"\n📁 Jobs by Source:")
            source_counts = df['dataset_source'].value_counts()
            for source, count in source_counts.items():
                percentage = (count / total_jobs) * 100
                print(f"   {source}: {count:,} ({percentage:.1f}%)")
        
        # Gender classification breakdown
        if 'lexicon_gender_classification' in df.columns:
            print(f"\n🏷️  Overall Gender Classification:")
            gender_counts = df['lexicon_gender_classification'].value_counts()
            for classification, count in gender_counts.items():
                percentage = (count / total_jobs) * 100
                print(f"   {classification}: {count:,} ({percentage:.1f}%)")
            
            # Average scores
            print(f"\n📈 Overall Average Lexicon Scores:")
            print(f"   Masculine score: {df['lexicon_masculine_score'].mean():.2f}%")
            print(f"   Feminine score: {df['lexicon_feminine_score'].mean():.2f}%")
            print(f"   Neutral score: {df['lexicon_neutral_score'].mean():.2f}%")
            print(f"   Gender ratio: {df['lexicon_gender_ratio'].mean():.2f}")
            
            # Gender analysis by source
            if 'dataset_source' in df.columns:
                print(f"\n📊 Gender Classification by Source:")
                gender_by_source = pd.crosstab(
                    df['dataset_source'], 
                    df['lexicon_gender_classification'], 
                    normalize='index'
                ) * 100
                
                for source in gender_by_source.index:
                    print(f"\n   {source}:")
                    for gender_class in gender_by_source.columns:
                        percentage = gender_by_source.loc[source, gender_class]
                        print(f"     {gender_class}: {percentage:.1f}%")
        
        # Word analysis
        if 'lexicon_total_words' in df.columns:
            print(f"\n📝 Text Analysis:")
            print(f"   Average words per job: {df['lexicon_total_words'].mean():.1f}")
            print(f"   Total words analyzed: {df['lexicon_total_words'].sum():,}")
            print(f"   Jobs with <50 words: {(df['lexicon_total_words'] < 50).sum()}")
            print(f"   Jobs with 50+ words: {(df['lexicon_total_words'] >= 50).sum()}")
        
        return df

    def save_final_results(self, df, output_path):
        """Save final combined results with comprehensive analysis"""
        
        print(f"\n💾 Saving final results...")
        
        try:
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                # Main combined dataset
                df.to_excel(writer, sheet_name='All_Jobs_Combined', index=False)
                
                # Summary statistics
                summary_data = []
                if 'lexicon_gender_classification' in df.columns:
                    gender_counts = df['lexicon_gender_classification'].value_counts()
                    source_counts = df['dataset_source'].value_counts() if 'dataset_source' in df.columns else {}
                    
                    summary_data = [
                        ['Total Jobs', len(df)],
                        ['Total Datasets', df['dataset_source'].nunique() if 'dataset_source' in df.columns else 'N/A'],
                        ['Masculine-coded Jobs', gender_counts.get('Masculine-coded', 0)],
                        ['Feminine-coded Jobs', gender_counts.get('Feminine-coded', 0)],
                        ['Neutral Jobs', gender_counts.get('Neutral', 0)],
                        ['Avg Masculine Score (%)', round(df['lexicon_masculine_score'].mean(), 2)],
                        ['Avg Feminine Score (%)', round(df['lexicon_feminine_score'].mean(), 2)],
                        ['Avg Neutral Score (%)', round(df['lexicon_neutral_score'].mean(), 2)],
                        ['Avg Gender Ratio', round(df['lexicon_gender_ratio'].mean(), 2)],
                        ['Total Words Analyzed', df['lexicon_total_words'].sum() if 'lexicon_total_words' in df.columns else 'N/A']
                    ]
                
                summary_df = pd.DataFrame(summary_data, columns=['Metric', 'Value'])
                summary_df.to_excel(writer, sheet_name='Overall_Summary', index=False)
                
                # Source breakdown
                if 'dataset_source' in df.columns:
                    source_breakdown = df['dataset_source'].value_counts().reset_index()
                    source_breakdown.columns = ['Dataset_Source', 'Job_Count']
                    source_breakdown['Percentage'] = (source_breakdown['Job_Count'] / len(df) * 100).round(1)
                    source_breakdown.to_excel(writer, sheet_name='Source_Breakdown', index=False)
                
                # Gender analysis by source
                if 'dataset_source' in df.columns and 'lexicon_gender_classification' in df.columns:
                    gender_by_source = pd.crosstab(
                        df['dataset_source'], 
                        df['lexicon_gender_classification']
                    ).reset_index()
                    gender_by_source.to_excel(writer, sheet_name='Gender_by_Source', index=False)
                    
                    # Percentage breakdown
                    gender_by_source_pct = pd.crosstab(
                        df['dataset_source'], 
                        df['lexicon_gender_classification'],
                        normalize='index'
                    ).round(3) * 100
                    gender_by_source_pct.to_excel(writer, sheet_name='Gender_by_Source_Percent')
                
                # Top gendered words analysis
                if 'lexicon_gendered_words_found' in df.columns:
                    all_gendered_words = []
                    for words_str in df['lexicon_gendered_words_found'].dropna():
                        if words_str:
                            parts = words_str.split(' | ')
                            for part in parts:
                                if part.startswith('M: '):
                                    words = part[3:].split(', ')
                                    for word in words:
                                        all_gendered_words.append(('Masculine', word.strip()))
                                elif part.startswith('F: '):
                                    words = part[3:].split(', ')
                                    for word in words:
                                        all_gendered_words.append(('Feminine', word.strip()))
                    
                    if all_gendered_words:
                        words_df = pd.DataFrame(all_gendered_words, columns=['Type', 'Word'])
                        word_counts = words_df.groupby(['Type', 'Word']).size().reset_index(name='Count')
                        word_counts = word_counts.sort_values(['Type', 'Count'], ascending=[True, False])
                        word_counts.to_excel(writer, sheet_name='Top_Gendered_Words', index=False)
            
            print(f"✅ Final results saved to: {output_path}")
            return True
            
        except Exception as e:
            print(f"❌ Error saving results: {e}")
            return False

def main():
    """Main function to process all datasets and combine with lexicon analysis"""
    
    # Define all dataset paths
    dataset_paths = {
        'Complete_Project_Dataset': r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\Complete Project Dataset.xlsx",
        'Indeed_UK': r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Codes\indeed_uk_jobs.xlsx",
        'OR_Society': r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Codes\or_society_jobs.xlsx", 
        'Jobs_AC_UK': r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Codes\jobs_ac_uk_data.xlsx",
        'LinkedIn_Jobs': r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Codes\linkedin_jobs_data.xlsx"
    }
    
    # Output path
    output_path = r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\Final_Combined_OR_Jobs_with_Lexicon_Analysis.xlsx"
    
    try:
        print("="*70)
        print("FINAL OR JOBS LEXICON ANALYSIS - ALL 5 DATASETS COMBINED")
        print("="*70)
        print("This will process and combine all your OR job datasets with")
        print("comprehensive lexicon-based gender analysis.")
        print("="*70)
        
        # Initialize analyzer
        analyzer = FinalORJobLexiconAnalyzer()
        
        # Process and combine all datasets
        combined_df = analyzer.combine_all_datasets(dataset_paths)
        
        # Generate comprehensive analysis
        final_df = analyzer.generate_comprehensive_analysis(combined_df)
        
        # Save final results
        success = analyzer.save_final_results(final_df, output_path)
        
        if success:
            print(f"\n🎉 ANALYSIS COMPLETE!")
            print(f"📁 Final results saved to: {output_path}")
            print(f"📊 Total jobs processed: {len(final_df):,}")
            
            # Show sample of results
            if 'lexicon_gender_classification' in final_df.columns:
                print(f"\n📋 Sample Results:")
                sample_cols = ['dataset_source', 'job_title', 'company_name', 
                              'lexicon_gender_classification', 'lexicon_masculine_score', 
                              'lexicon_feminine_score', 'lexicon_gender_ratio']
                available_cols = [col for col in sample_cols if col in final_df.columns]
                
                sample_data = final_df[available_cols].head(10)
                print(sample_data.to_string(index=False))
                
                print(f"\n🏆 SUMMARY:")
                gender_summary = final_df['lexicon_gender_classification'].value_counts()
                for classification, count in gender_summary.items():
                    percentage = (count / len(final_df)) * 100
                    print(f"   {classification}: {count:,} jobs ({percentage:.1f}%)")
        
        return final_df
        
    except Exception as e:
        print(f"❌ Error: {e}")
        return None

if __name__ == "__main__":
    main()

FINAL OR JOBS LEXICON ANALYSIS - ALL 5 DATASETS COMBINED
This will process and combine all your OR job datasets with
comprehensive lexicon-based gender analysis.
COMBINING ALL OR JOB DATASETS WITH LEXICON ANALYSIS
Reading Complete_Project_Dataset...
File: C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\Complete Project Dataset.xlsx
✅ Successfully read Complete_Project_Dataset!
   Shape: (1022, 12)
   Columns: ['category', 'city', 'company_name', 'geo', 'job_board', 'job_description', 'job_requirements', 'job_title', 'job_type', 'post_date', 'salary_offered', 'state']

Adding lexicon analysis to Complete_Project_Dataset...
  Calculating lexicon scores...
✅ Lexicon analysis added to Complete_Project_Dataset
✅ Processed Complete_Project_Dataset: 1022 jobs
Reading Indeed_UK...
File: C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Codes\indeed_uk_jobs.xlsx
✅ Successfully read Indeed_UK!
   

  combined_df = pd.concat(processed_datasets, ignore_index=True, sort=False)


✅ Combined dataset shape: (1233, 57)

COMPREHENSIVE COMBINED DATASET ANALYSIS
📊 Total jobs analyzed: 1,233

📁 Jobs by Source:
   Complete_Project_Dataset: 1,022 (82.9%)
   Indeed_UK: 81 (6.6%)
   OR_Society: 60 (4.9%)
   Jobs_AC_UK: 60 (4.9%)
   LinkedIn_Jobs: 10 (0.8%)

🏷️  Overall Gender Classification:
   Neutral: 769 (62.4%)
   Masculine-coded: 277 (22.5%)
   Feminine-coded: 187 (15.2%)

📈 Overall Average Lexicon Scores:
   Masculine score: 1.71%
   Feminine score: 1.50%
   Neutral score: 96.79%
   Gender ratio: 0.21

📊 Gender Classification by Source:

   Complete_Project_Dataset:
     Feminine-coded: 13.1%
     Masculine-coded: 24.7%
     Neutral: 62.2%

   Indeed_UK:
     Feminine-coded: 23.5%
     Masculine-coded: 14.8%
     Neutral: 61.7%

   Jobs_AC_UK:
     Feminine-coded: 26.7%
     Masculine-coded: 5.0%
     Neutral: 68.3%

   LinkedIn_Jobs:
     Feminine-coded: 50.0%
     Masculine-coded: 10.0%
     Neutral: 40.0%

   OR_Society:
     Feminine-coded: 21.7%
     Masculine-