In [1]:
import pandas as pd 
import numpy as np

In [2]:
def process_salary_data(input_file='cleaned_salary.csv', output_file='final_enhanced_salary_dataset.csv'):
    # 1. Load Data
    df = pd.read_csv(input_file)
    
    # 2. Skill Mapping Function
    def get_skills(job_title):
        title_lower = str(job_title).lower()
        skills = set()
        
        # Comprehensive mapping based on common roles
        mappings = {
            'software': ['Python', 'Java', 'System Design', 'Git', 'Agile', 'Cloud Computing', 'CI/CD'],
            'developer': ['Coding', 'Debugging', 'API Integration', 'Git', 'Problem Solving', 'Data Structures'],
            'engineer': ['Technical Architecture', 'Mathematics', 'System Design', 'Testing'],
            'web': ['HTML/CSS', 'JavaScript', 'React/Angular', 'Responsive Design', 'UI/UX Basics'],
            'data': ['SQL', 'Python/R', 'Data Visualization', 'Statistics', 'Machine Learning', 'ETL'],
            'scientist': ['Research', 'Hypothesis Testing', 'Deep Learning', 'Modeling', 'Big Data'],
            'analyst': ['Excel', 'SQL', 'Tableau/PowerBI', 'Data Interpretation', 'Reporting'],
            'manager': ['Leadership', 'Team Management', 'Budgeting', 'Strategic Planning', 'Mentoring', 'Conflict Resolution'],
            'director': ['Executive Leadership', 'Strategy', 'Business Development', 'Financial Oversight', 'Operations Management'],
            'vp': ['Organizational Strategy', 'Executive Decision Making', 'Global Operations', 'P&L Management'],
            'sales': ['CRM (Salesforce)', 'Negotiation', 'Lead Generation', 'Client Relations', 'Closing'],
            'marketing': ['SEO/SEM', 'Content Strategy', 'Social Media', 'Google Analytics', 'Branding', 'Campaign Management'],
            'hr': ['Recruiting', 'Employee Relations', 'HRIS', 'Compliance', 'Onboarding', 'Talent Acquisition'],
            'finance': ['Financial Modeling', 'Forecasting', 'Risk Management', 'Valuation', 'Accounting'],
            'product': ['Product Lifecycle', 'User Research', 'Roadmapping', 'Agile/Scrum', 'Market Analysis'],
            'project': ['Project Planning', 'Risk Management', 'Stakeholder Management', 'Jira', 'Time Management'],
            'design': ['Adobe Creative Suite', 'Visual Design', 'Creativity', 'Layout', 'Typography'],
            'ux': ['Wireframing', 'Prototyping', 'User Testing', 'Figma', 'User Centered Design'],
            'operations': ['Process Improvement', 'Logistics', 'Inventory Management', 'Efficiency', 'Supply Chain'],
            'support': ['Troubleshooting', 'Customer Service', 'Ticketing Systems', 'Empathy', 'Technical Support'],
        }

        found_match = False
        for keyword, skill_list in mappings.items():
            if keyword in title_lower:
                skills.update(skill_list)
                found_match = True

        # Default soft skills for everyone
        if not found_match:
             skills.update(['Communication', 'Time Management', 'Problem Solving'])
        skills.update(['Communication', 'Teamwork']) # Universal skills
        
        return ", ".join(sorted(list(skills)))

    df['Skills'] = df['Job Title'].apply(get_skills)

    # 3. Market Benchmarking (Min, Max, Median)
    salary_stats = df.groupby('Job Title')['Salary'].agg(['min', 'max', 'median']).reset_index()
    salary_stats.rename(columns={'min': 'Market_Min', 'max': 'Market_Max', 'median': 'Market_Median'}, inplace=True)
    df = df.merge(salary_stats, on='Job Title', how='left')

    # 4. "Next Role" Prediction (Heuristic)
    def predict_next_role(title):
        title = str(title).strip()
        lower_title = title.lower()
        
        if 'junior' in lower_title:
            return title.replace('Junior', 'Senior').replace('junior', 'Senior')
        elif 'senior' in lower_title and 'manager' not in lower_title:
            return title + " Manager"
        elif 'manager' in lower_title and 'senior' not in lower_title:
            return "Senior " + title
        elif 'manager' in lower_title and 'senior' in lower_title:
            return "Director of " + title.replace('Senior ', '').replace('Manager', '')
        elif 'director' in lower_title:
            return "VP of " + title.replace('Director of ', '').replace('Director', '')
        elif 'associate' in lower_title:
            return title.replace('Associate', 'Manager')
        elif 'analyst' in lower_title and 'senior' not in lower_title:
            return "Senior " + title
        else:
            return "Senior " + title

    df['Next_Role'] = df['Job Title'].apply(predict_next_role)

    # 5. Generate Rich Text Description
    def generate_full_description(row):
        # Salary context logic
        if row['Salary'] >= row['Market_Median']:
            salary_status = "above the market median"
        else:
            salary_status = "below the market median"
            
        market_range = f"${row['Market_Min']:,.0f} - ${row['Market_Max']:,.0f}"

        return (
            f"Job Title: {row['Job Title']}. "
            f"Profile: {row['Age']:.0f}-year-old {row['Gender']} with a {row['Education Level']}. "
            f"Experience: {row['Years of Experience']} years. "
            f"Current Salary: ${row['Salary']:,.0f} ({salary_status}). "
            f"Market Benchmark: The typical salary range for a {row['Job Title']} is {market_range}, with a median of ${row['Market_Median']:,.0f}. "
            f"Skills Required: {row['Skills']}. "
            f"Career Path: A typical next step for this role is becoming a {row['Next_Role']}."
        )

    df['text_description'] = df.apply(generate_full_description, axis=1)

    # 6. Save
    df.to_csv(output_file, index=False)
    print(f"Successfully processed {len(df)} rows and saved to {output_file}")
    return df

# Execute the function
final_df = process_salary_data()
print(final_df[['Job Title', 'Next_Role', 'text_description']].head().values)

Successfully processed 1764 rows and saved to final_enhanced_salary_dataset.csv
[['Software Engineer' 'Senior Software Engineer'
  "Job Title: Software Engineer. Profile: 32-year-old Male with a Bachelor's Degree. Experience: 5.0 years. Current Salary: $90,000 (above the market median). Market Benchmark: The typical salary range for a Software Engineer is $50,000 - $197,000, with a median of $87,500. Skills Required: Agile, CI/CD, Cloud Computing, Communication, Git, Java, Mathematics, Python, System Design, Teamwork, Technical Architecture, Testing. Career Path: A typical next step for this role is becoming a Senior Software Engineer."]
 ['Data Analyst' 'Senior Data Analyst'
  "Job Title: Data Analyst. Profile: 28-year-old Female with a Master's Degree. Experience: 3.0 years. Current Salary: $65,000 (below the market median). Market Benchmark: The typical salary range for a Data Analyst is $65,000 - $195,000, with a median of $115,000. Skills Required: Communication, Data Interpretati

In [22]:
data = pd.read_csv("final_enhanced_salary_dataset.csv",encoding="latin1")

In [23]:
data

Unnamed: 0.1,Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Age_cat,Years_Exper,Sal_Cat,Skills,Market_Min,Market_Max,Market_Median,Next_Role,text_description
0,0,32.0,Male,Bachelor's Degree,Software Engineer,5.0,90000.0,Thirties,0-5,Medium,"Agile, CI/CD, Cloud Computing, Communication, ...",50000.0,197000.0,87500.0,Senior Software Engineer,Job Title: Software Engineer. Profile: 32-year...
1,1,28.0,Female,Master's Degree,Data Analyst,3.0,65000.0,Twenties,0-5,Medium,"Communication, Data Interpretation, Data Visua...",65000.0,195000.0,115000.0,Senior Data Analyst,Job Title: Data Analyst. Profile: 28-year-old ...
2,2,45.0,Male,PhD,Senior Manager,15.0,150000.0,Forties,11-15,High,"Budgeting, Communication, Conflict Resolution,...",150000.0,170000.0,160000.0,Director of,Job Title: Senior Manager. Profile: 45-year-ol...
3,3,36.0,Female,Bachelor's Degree,Sales Associate,7.0,60000.0,Thirties,6-10,Medium,"CRM (Salesforce), Client Relations, Closing, C...",25000.0,60000.0,35000.0,Sales Manager,Job Title: Sales Associate. Profile: 36-year-o...
4,4,52.0,Male,Master's Degree,Director,20.0,200000.0,Fifties,16-20,Very High,"Business Development, Communication, Executive...",200000.0,200000.0,200000.0,VP of,Job Title: Director. Profile: 52-year-old Male...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1759,6623,43.0,Female,Master's Degree,Digital Marketing Manager,15.0,150000.0,Forties,11-15,High,"Branding, Budgeting, Campaign Management, Comm...",30000.0,150000.0,50000.0,Senior Digital Marketing Manager,Job Title: Digital Marketing Manager. Profile:...
1760,6624,27.0,Male,High School,Sales Manager,2.0,40000.0,Twenties,0-5,Low,"Budgeting, CRM (Salesforce), Client Relations,...",35000.0,180000.0,80000.0,Senior Sales Manager,Job Title: Sales Manager. Profile: 27-year-old...
1761,6625,33.0,Female,Bachelor's Degree,Director of Marketing,8.0,80000.0,Thirties,6-10,Medium,"Branding, Business Development, Campaign Manag...",80000.0,200000.0,170000.0,VP of Marketing,Job Title: Director of Marketing. Profile: 33-...
1762,6628,37.0,Male,Bachelor's Degree,Sales Director,7.0,90000.0,Thirties,6-10,Medium,"Business Development, CRM (Salesforce), Client...",27.0,180000.0,90000.0,VP of Sales,Job Title: Sales Director. Profile: 37-year-ol...
