In [2]:
pip install textblob vaderSentiment nltk

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
python -m textblob.download_corpora

SyntaxError: invalid syntax (2621292756.py, line 1)

In [4]:
pip install --user textblob vaderSentiment nltk

Note: you may need to restart the kernel to use updated packages.


In [5]:
# test_packages.py
try:
    from textblob import TextBlob
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    import nltk
    print("✅ All packages installed successfully!")
    
    # Test basic functionality
    analyzer = SentimentIntensityAnalyzer()
    blob = TextBlob("This is a test")
    print("✅ Packages working correctly!")
    
except ImportError as e:
    print(f"❌ Import error: {e}")

✅ All packages installed successfully!
✅ Packages working correctly!


In [7]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import os
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# For more advanced sentiment analysis
try:
    import nltk
    from nltk.sentiment import SentimentIntensityAnalyzer as NLTKSentimentAnalyzer
    from nltk.tokenize import word_tokenize, sent_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    
    # Download required NLTK data
    nltk.download('punkt', quiet=True)
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False
    print("⚠️  NLTK not available. Using basic sentiment analysis only.")

class ORJobSentimentAnalyzer:
    def __init__(self):
        """Initialize sentiment analyzer with multiple approaches"""
        
        # Initialize VADER sentiment analyzer
        self.vader_analyzer = SentimentIntensityAnalyzer()
        
        # Initialize NLTK components if available
        if NLTK_AVAILABLE:
            self.nltk_analyzer = NLTKSentimentAnalyzer()
            self.lemmatizer = WordNetLemmatizer()
            self.stop_words = set(stopwords.words('english'))
        
        # OR-specific positive sentiment words (indicating confidence, competence)
        self.or_positive_words = [
            'excellent', 'outstanding', 'exceptional', 'innovative', 'leading',
            'cutting-edge', 'state-of-the-art', 'world-class', 'premier', 'top-tier',
            'advanced', 'sophisticated', 'dynamic', 'progressive', 'forward-thinking',
            'strategic', 'impactful', 'successful', 'effective', 'efficient',
            'robust', 'comprehensive', 'extensive', 'diverse', 'inclusive',
            'collaborative', 'supportive', 'rewarding', 'exciting', 'challenging',
            'opportunity', 'growth', 'development', 'career', 'advancement'
        ]
        
        # OR-specific negative sentiment words (indicating barriers, difficulty)
        self.or_negative_words = [
            'demanding', 'pressure', 'stress', 'intense', 'rigid', 'strict',
            'challenging', 'difficult', 'complex', 'overwhelming', 'competitive',
            'aggressive', 'fast-paced', 'high-pressure', 'demanding',
            'must', 'required', 'essential', 'critical', 'mandatory'
        ]
        
        # Gender-related sentiment patterns
        self.masculine_sentiment_patterns = [
            r'dominate\w*', r'lead\w*', r'control\w*', r'manage\w*', r'direct\w*',
            r'achieve\w*', r'win\w*', r'excel\w*', r'compete\w*', r'drive\w*'
        ]
        
        self.feminine_sentiment_patterns = [
            r'support\w*', r'help\w*', r'assist\w*', r'collaborate\w*', r'cooperate\w*',
            r'nurture\w*', r'care\w*', r'understand\w*', r'empathize\w*', r'connect\w*'
        ]

    def clean_text_for_sentiment(self, text):
        """Clean text specifically for sentiment analysis"""
        if pd.isna(text) or text is None:
            return ""
        
        # Convert to string
        text = str(text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        # Remove special characters but keep sentence structure
        text = re.sub(r'[^\w\s\.\!\?\;\:]', ' ', text)
        
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        
        return text.strip()

    def calculate_textblob_sentiment(self, text):
        """Calculate sentiment using TextBlob"""
        try:
            blob = TextBlob(text)
            polarity = blob.sentiment.polarity  # -1 (negative) to 1 (positive)
            subjectivity = blob.sentiment.subjectivity  # 0 (objective) to 1 (subjective)
            
            return {
                'polarity': round(polarity, 3),
                'subjectivity': round(subjectivity, 3)
            }
        except:
            return {'polarity': 0.0, 'subjectivity': 0.0}

    def calculate_vader_sentiment(self, text):
        """Calculate sentiment using VADER"""
        try:
            scores = self.vader_analyzer.polarity_scores(text)
            return {
                'compound': round(scores['compound'], 3),
                'positive': round(scores['pos'], 3),
                'neutral': round(scores['neu'], 3),
                'negative': round(scores['neg'], 3)
            }
        except:
            return {'compound': 0.0, 'positive': 0.0, 'neutral': 0.0, 'negative': 0.0}

    def calculate_or_specific_sentiment(self, text):
        """Calculate OR-specific sentiment scores"""
        if not text:
            return {'or_positive_score': 0.0, 'or_negative_score': 0.0, 'or_sentiment_ratio': 0.0}
        
        text_lower = text.lower()
        words = re.findall(r'\b[a-z]+\b', text_lower)
        total_words = len(words)
        
        if total_words == 0:
            return {'or_positive_score': 0.0, 'or_negative_score': 0.0, 'or_sentiment_ratio': 0.0}
        
        # Count OR-specific sentiment words
        positive_count = sum(1 for word in words if word in self.or_positive_words)
        negative_count = sum(1 for word in words if word in self.or_negative_words)
        
        positive_score = round((positive_count / total_words) * 100, 2)
        negative_score = round((negative_count / total_words) * 100, 2)
        sentiment_ratio = round(positive_score - negative_score, 2)
        
        return {
            'or_positive_score': positive_score,
            'or_negative_score': negative_score,
            'or_sentiment_ratio': sentiment_ratio
        }

    def calculate_gender_sentiment_patterns(self, text):
        """Analyze sentiment patterns related to gender"""
        if not text:
            return {
                'masculine_sentiment_count': 0,
                'feminine_sentiment_count': 0,
                'gender_sentiment_ratio': 0.0
            }
        
        text_lower = text.lower()
        
        # Count gender-related sentiment patterns
        masculine_count = sum(len(re.findall(pattern, text_lower)) for pattern in self.masculine_sentiment_patterns)
        feminine_count = sum(len(re.findall(pattern, text_lower)) for pattern in self.feminine_sentiment_patterns)
        
        # Calculate ratio
        total_gender_sentiment = masculine_count + feminine_count
        if total_gender_sentiment > 0:
            ratio = round((masculine_count - feminine_count) / total_gender_sentiment * 100, 2)
        else:
            ratio = 0.0
        
        return {
            'masculine_sentiment_count': masculine_count,
            'feminine_sentiment_count': feminine_count,
            'gender_sentiment_ratio': ratio
        }

    def calculate_comprehensive_sentiment(self, text):
        """Calculate all sentiment metrics for a text"""
        
        if not text:
            return {
                # TextBlob metrics
                'sentiment_polarity': 0.0,
                'sentiment_subjectivity': 0.0,
                
                # VADER metrics
                'vader_compound': 0.0,
                'vader_positive': 0.0,
                'vader_neutral': 0.0,
                'vader_negative': 0.0,
                
                # OR-specific metrics
                'or_positive_score': 0.0,
                'or_negative_score': 0.0,
                'or_sentiment_ratio': 0.0,
                
                # Gender sentiment patterns
                'masculine_sentiment_count': 0,
                'feminine_sentiment_count': 0,
                'gender_sentiment_ratio': 0.0,
                
                # Overall classification
                'sentiment_classification': 'Neutral',
                'sentiment_confidence': 0.0
            }
        
        # Clean text
        clean_text = self.clean_text_for_sentiment(text)
        
        # Calculate different sentiment measures
        textblob_results = self.calculate_textblob_sentiment(clean_text)
        vader_results = self.calculate_vader_sentiment(clean_text)
        or_results = self.calculate_or_specific_sentiment(clean_text)
        gender_results = self.calculate_gender_sentiment_patterns(clean_text)
        
        # Determine overall sentiment classification
        # Use VADER compound score as primary classifier
        compound_score = vader_results['compound']
        
        if compound_score >= 0.05:
            classification = 'Positive'
            confidence = abs(compound_score)
        elif compound_score <= -0.05:
            classification = 'Negative'
            confidence = abs(compound_score)
        else:
            classification = 'Neutral'
            confidence = 1 - abs(compound_score)
        
        # Combine all results
        return {
            # TextBlob metrics
            'sentiment_polarity': textblob_results['polarity'],
            'sentiment_subjectivity': textblob_results['subjectivity'],
            
            # VADER metrics
            'vader_compound': vader_results['compound'],
            'vader_positive': vader_results['positive'],
            'vader_neutral': vader_results['neutral'],
            'vader_negative': vader_results['negative'],
            
            # OR-specific metrics
            'or_positive_score': or_results['or_positive_score'],
            'or_negative_score': or_results['or_negative_score'],
            'or_sentiment_ratio': or_results['or_sentiment_ratio'],
            
            # Gender sentiment patterns
            'masculine_sentiment_count': gender_results['masculine_sentiment_count'],
            'feminine_sentiment_count': gender_results['feminine_sentiment_count'],
            'gender_sentiment_ratio': gender_results['gender_sentiment_ratio'],
            
            # Overall classification
            'sentiment_classification': classification,
            'sentiment_confidence': round(confidence, 3)
        }

    def add_sentiment_analysis_to_dataset(self, df):
        """Add comprehensive sentiment analysis to the existing dataset"""
        
        print("\n" + "="*70)
        print("ADDING SENTIMENT ANALYSIS TO OR JOBS DATASET")
        print("="*70)
        
        # Identify text column for analysis
        text_column = None
        possible_text_columns = ['job_description', 'description', 'combined_text_for_analysis', 'job_title']
        
        for col in possible_text_columns:
            if col in df.columns and df[col].notna().sum() > 0:
                text_column = col
                break
        
        if text_column is None:
            raise ValueError("No suitable text column found for sentiment analysis")
        
        print(f"📝 Analyzing sentiment from column: {text_column}")
        print(f"📊 Processing {len(df):,} job descriptions...")
        
        # Apply sentiment analysis
        sentiment_results = df[text_column].fillna('').apply(self.calculate_comprehensive_sentiment)
        
        # Extract results into separate columns
        sentiment_metrics = [
            'sentiment_polarity', 'sentiment_subjectivity', 'vader_compound',
            'vader_positive', 'vader_neutral', 'vader_negative',
            'or_positive_score', 'or_negative_score', 'or_sentiment_ratio',
            'masculine_sentiment_count', 'feminine_sentiment_count', 
            'gender_sentiment_ratio', 'sentiment_classification', 'sentiment_confidence'
        ]
        
        for metric in sentiment_metrics:
            df[f'{metric}'] = [result[metric] for result in sentiment_results]
        
        # Add analysis metadata
        df['sentiment_analysis_date'] = datetime.now().strftime('%Y-%m-%d')
        df['sentiment_text_column'] = text_column
        
        print("✅ Sentiment analysis complete!")
        
        return df

    def create_validation_analysis(self, df):
        """Create analysis comparing sentiment and lexicon scores for validation"""
        
        print("\n" + "="*70)
        print("VALIDATION ANALYSIS: SENTIMENT vs LEXICON SCORES")
        print("="*70)
        
        validation_results = {}
        
        # Check if lexicon columns exist
        lexicon_cols = ['lexicon_gender_classification', 'lexicon_gender_ratio', 
                       'lexicon_masculine_score', 'lexicon_feminine_score']
        
        has_lexicon = all(col in df.columns for col in lexicon_cols)
        
        if has_lexicon:
            print("🔍 Cross-validating sentiment and lexicon analyses...")
            
            # 1. Correlation between sentiment and gender scores
            correlations = {}
            
            # Vader compound vs gender ratio
            if 'vader_compound' in df.columns and 'lexicon_gender_ratio' in df.columns:
                corr = df['vader_compound'].corr(df['lexicon_gender_ratio'])
                correlations['Sentiment vs Gender Ratio'] = round(corr, 3)
            
            # OR sentiment ratio vs lexicon gender ratio
            if 'or_sentiment_ratio' in df.columns and 'lexicon_gender_ratio' in df.columns:
                corr = df['or_sentiment_ratio'].corr(df['lexicon_gender_ratio'])
                correlations['OR Sentiment vs Lexicon Gender'] = round(corr, 3)
            
            # Gender sentiment vs lexicon scores
            if 'gender_sentiment_ratio' in df.columns and 'lexicon_gender_ratio' in df.columns:
                corr = df['gender_sentiment_ratio'].corr(df['lexicon_gender_ratio'])
                correlations['Gender Sentiment vs Lexicon'] = round(corr, 3)
            
            validation_results['correlations'] = correlations
            
            # 2. Agreement analysis
            if 'sentiment_classification' in df.columns and 'lexicon_gender_classification' in df.columns:
                # Create agreement matrix
                agreement_matrix = pd.crosstab(
                    df['sentiment_classification'], 
                    df['lexicon_gender_classification']
                )
                validation_results['agreement_matrix'] = agreement_matrix
                
                # Calculate agreement percentage
                # Convert to comparable categories
                sentiment_positive = df['sentiment_classification'] == 'Positive'
                lexicon_masculine = df['lexicon_gender_classification'] == 'Masculine-coded'
                
                sentiment_negative = df['sentiment_classification'] == 'Negative'
                lexicon_feminine = df['lexicon_gender_classification'] == 'Feminine-coded'
                
                positive_agreement = (sentiment_positive & lexicon_masculine).sum()
                negative_agreement = (sentiment_negative & lexicon_feminine).sum()
                total_agreement = positive_agreement + negative_agreement
                agreement_percentage = (total_agreement / len(df)) * 100
                
                validation_results['agreement_percentage'] = round(agreement_percentage, 1)
            
            # 3. Outlier analysis - jobs with conflicting scores
            conflicting_jobs = []
            
            if ('vader_compound' in df.columns and 'lexicon_gender_ratio' in df.columns):
                # Find jobs with positive sentiment but high masculine coding
                positive_sentiment_masculine = df[
                    (df['vader_compound'] > 0.1) & 
                    (df['lexicon_gender_ratio'] > 2.0)
                ]
                
                # Find jobs with negative sentiment but high feminine coding
                negative_sentiment_feminine = df[
                    (df['vader_compound'] < -0.1) & 
                    (df['lexicon_gender_ratio'] < -2.0)
                ]
                
                validation_results['conflicting_positive_masculine'] = len(positive_sentiment_masculine)
                validation_results['conflicting_negative_feminine'] = len(negative_sentiment_feminine)
            
            # Print validation results
            print(f"📈 Correlation Analysis:")
            for metric, corr in correlations.items():
                print(f"   {metric}: {corr}")
            
            if 'agreement_percentage' in validation_results:
                print(f"\n🎯 Agreement Analysis:")
                print(f"   Overall Agreement: {validation_results['agreement_percentage']}%")
            
            print(f"\n⚠️  Conflicting Cases:")
            print(f"   Positive sentiment + Masculine coding: {validation_results.get('conflicting_positive_masculine', 0)}")
            print(f"   Negative sentiment + Feminine coding: {validation_results.get('conflicting_negative_feminine', 0)}")
        
        else:
            print("⚠️  No lexicon scores found. Skipping validation analysis.")
            validation_results['message'] = "No lexicon data available for validation"
        
        return validation_results

    def generate_sentiment_summary(self, df):
        """Generate comprehensive sentiment analysis summary"""
        
        print(f"\n" + "="*70)
        print("SENTIMENT ANALYSIS SUMMARY")
        print("="*70)
        
        # Basic sentiment distribution
        if 'sentiment_classification' in df.columns:
            print(f"📊 Sentiment Classification:")
            sentiment_counts = df['sentiment_classification'].value_counts()
            for sentiment, count in sentiment_counts.items():
                percentage = (count / len(df)) * 100
                print(f"   {sentiment}: {count:,} ({percentage:.1f}%)")
        
        # Average sentiment scores
        sentiment_metrics = ['sentiment_polarity', 'vader_compound', 'or_sentiment_ratio']
        available_metrics = [col for col in sentiment_metrics if col in df.columns]
        
        if available_metrics:
            print(f"\n📈 Average Sentiment Scores:")
            for metric in available_metrics:
                avg_score = df[metric].mean()
                print(f"   {metric}: {avg_score:.3f}")
        
        # OR-specific sentiment
        if 'or_positive_score' in df.columns and 'or_negative_score' in df.columns:
            print(f"\n🎯 OR-Specific Sentiment:")
            print(f"   Average positive score: {df['or_positive_score'].mean():.2f}%")
            print(f"   Average negative score: {df['or_negative_score'].mean():.2f}%")
            print(f"   Average sentiment ratio: {df['or_sentiment_ratio'].mean():.2f}")
        
        # Gender sentiment patterns
        if 'gender_sentiment_ratio' in df.columns:
            print(f"\n⚖️  Gender Sentiment Patterns:")
            print(f"   Average gender sentiment ratio: {df['gender_sentiment_ratio'].mean():.2f}")
            
            masculine_dominant = (df['gender_sentiment_ratio'] > 10).sum()
            feminine_dominant = (df['gender_sentiment_ratio'] < -10).sum()
            
            print(f"   Jobs with masculine sentiment patterns: {masculine_dominant}")
            print(f"   Jobs with feminine sentiment patterns: {feminine_dominant}")
        
        return df

def main():
    """Main function to add sentiment analysis to existing lexicon-scored dataset"""
    
    # File path to your existing dataset
    input_path = r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\Final Dataset.xlsx"
    
    # Output path (same file, updated)
    output_path = r"C:\Users\HP\OneDrive - University of Southampton\Documents\Dissertation Project - Marwa Ashfaq\Dataset\Final Dataset.xlsx"
    
    try:
        print("="*70)
        print("OR JOBS SENTIMENT ANALYSIS - COMPLEMENTING LEXICON SCORES")
        print("="*70)
        print("This will add comprehensive sentiment analysis to your existing")
        print("lexicon-scored dataset for validation and deeper insights.")
        print("="*70)
        
        # Read existing dataset
        print(f"📁 Reading existing dataset...")
        df = pd.read_excel(input_path, sheet_name='All_Jobs_Combined')
        print(f"✅ Loaded {len(df):,} jobs with {len(df.columns)} columns")
        
        # Initialize sentiment analyzer
        analyzer = ORJobSentimentAnalyzer()
        
        # Add sentiment analysis
        df_with_sentiment = analyzer.add_sentiment_analysis_to_dataset(df)
        
        # Create validation analysis
        validation_results = analyzer.create_validation_analysis(df_with_sentiment)
        
        # Generate summary
        final_df = analyzer.generate_sentiment_summary(df_with_sentiment)
        
        # Save updated dataset
        print(f"\n💾 Saving updated dataset with sentiment analysis...")
        
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            # Main dataset with both lexicon and sentiment
            final_df.to_excel(writer, sheet_name='Jobs_Lexicon_and_Sentiment', index=False)
            
            # Validation analysis
            if validation_results and 'correlations' in validation_results:
                validation_df = pd.DataFrame([
                    ['Metric', 'Correlation'],
                    *[[k, v] for k, v in validation_results['correlations'].items()]
                ])
                validation_df.to_excel(writer, sheet_name='Validation_Analysis', index=False, header=False)
            
            # Sentiment summary
            if 'sentiment_classification' in final_df.columns:
                sentiment_summary = final_df['sentiment_classification'].value_counts().reset_index()
                sentiment_summary.columns = ['Sentiment', 'Count']
                sentiment_summary['Percentage'] = (sentiment_summary['Count'] / len(final_df) * 100).round(1)
                sentiment_summary.to_excel(writer, sheet_name='Sentiment_Summary', index=False)
            
            # Combined analysis (lexicon + sentiment)
            if 'lexicon_gender_classification' in final_df.columns and 'sentiment_classification' in final_df.columns:
                combined_analysis = pd.crosstab(
                    final_df['lexicon_gender_classification'],
                    final_df['sentiment_classification']
                ).reset_index()
                combined_analysis.to_excel(writer, sheet_name='Lexicon_vs_Sentiment', index=False)
        
        print(f"✅ Updated dataset saved to: {output_path}")
        print(f"📊 Total columns now: {len(final_df.columns)}")
        
        # Show sample of new sentiment columns
        sentiment_cols = [col for col in final_df.columns if 'sentiment' in col.lower() or 'vader' in col.lower()]
        if sentiment_cols:
            print(f"\n📋 New Sentiment Columns Added:")
            for col in sentiment_cols[:10]:  # Show first 10
                print(f"   {col}")
        
        print(f"\n🎉 SENTIMENT ANALYSIS COMPLETE!")
        print(f"Your dataset now has both lexicon and sentiment analysis for")
        print(f"comprehensive validation and deeper insights into gendered language!")
        
        return final_df
        
    except Exception as e:
        print(f"❌ Error: {e}")
        return None

if __name__ == "__main__":
    # Check and install required packages
    print("Checking required packages...")
    
    packages_to_install = []
    
    try:
        import textblob
    except ImportError:
        packages_to_install.append("textblob")
    
    try:
        import vaderSentiment
    except ImportError:
        packages_to_install.append("vaderSentiment")
    
    if packages_to_install:
        print(f"❌ Missing packages: {', '.join(packages_to_install)}")
        print("Please run these commands in your terminal:")
        print("   pip install textblob vaderSentiment nltk")
        print("   python -m textblob.download_corpora")
        print("\nThen run this script again.")
    else:
        print("✅ All required packages available")
        main()

Checking required packages...
✅ All required packages available
OR JOBS SENTIMENT ANALYSIS - COMPLEMENTING LEXICON SCORES
This will add comprehensive sentiment analysis to your existing
lexicon-scored dataset for validation and deeper insights.
📁 Reading existing dataset...
✅ Loaded 1,233 jobs with 22 columns

ADDING SENTIMENT ANALYSIS TO OR JOBS DATASET
📝 Analyzing sentiment from column: job_description
📊 Processing 1,233 job descriptions...
✅ Sentiment analysis complete!

VALIDATION ANALYSIS: SENTIMENT vs LEXICON SCORES
🔍 Cross-validating sentiment and lexicon analyses...
📈 Correlation Analysis:
   Sentiment vs Gender Ratio: 0.018
   OR Sentiment vs Lexicon Gender: 0.102
   Gender Sentiment vs Lexicon: 0.261

🎯 Agreement Analysis:
   Overall Agreement: 22.4%

⚠️  Conflicting Cases:
   Positive sentiment + Masculine coding: 99
   Negative sentiment + Feminine coding: 0

SENTIMENT ANALYSIS SUMMARY
📊 Sentiment Classification:
   Positive: 1,227 (99.5%)
   Negative: 4 (0.3%)
   Neutral: 