In [32]:
import pandas as pd
import numpy as np

# One dollar amount: comma-grouped thousands ("100,000") or, failing that,
# a plain run of digits ("100000"). The grouped form is tried first so that
# well-formed values are captured whole; the plain-digit fallback keeps
# un-grouped values such as "85000" from being truncated to their first
# three digits.
_AMOUNT = r'(\d{1,3}(?:,\d{3})+|\d+)'

# "$MIN", optionally followed by "<dash> $MAX". The separator may be an ASCII
# hyphen, an en dash (U+2013) or an em dash (U+2014).
_SALARY_PATTERN = (
    r'\$' + _AMOUNT
    + r'(?:\s*[-\u2013\u2014]\s*\$' + _AMOUNT + r')?'
)


def extract_salary_range(salary: pd.Series) -> pd.DataFrame:
    """Parse free-text salary strings into numeric lower/upper bounds.

    Handles formats like "$X - $Y", "$X or more" and a bare "$X". Only the
    first dollar amount (and the range that directly follows it, if any) in
    each string is used.

    Args:
        salary: Series of salary strings; missing values are allowed.

    Returns:
        DataFrame indexed like ``salary`` with numeric columns
        ``min_salary`` and ``max_salary``. Rows with no dollar amount are
        NaN in both columns. When no upper bound is present (for example
        "$100,000 or more"), ``max_salary`` equals ``min_salary``.

    Note:
        Decimal parts are not parsed: only the whole-number part before a
        decimal point is captured, and a range whose lower bound has a
        decimal part is not recognised as a range.
    """
    bounds = salary.str.extract(_SALARY_PATTERN)
    bounds.columns = ['min_salary', 'max_salary']

    # Strip thousands separators before converting; anything that still is
    # not a number becomes NaN rather than raising.
    for col in bounds.columns:
        digits = bounds[col].str.replace(',', '', regex=False)
        bounds[col] = pd.to_numeric(digits, errors='coerce')

    # Open-ended or single-value salaries have no upper bound; reuse the
    # lower bound so downstream averages are not skewed by missing values.
    bounds['max_salary'] = bounds['max_salary'].fillna(bounds['min_salary'])
    return bounds


# Executing this cell/script still performs the full clean-up; the guard only
# keeps the file I/O from running when the code is imported as a module.
if __name__ == "__main__":
    df = pd.read_csv('top_trending_jobs.csv')

    # Clean company names: embedded newlines become spaces, then trim.
    df['Company'] = df['Company'].str.replace('\n', ' ', regex=False).str.strip()

    # Clean salary column: derive numeric min/max bounds from the text.
    salary_extracted = extract_salary_range(df['Salary'])

    # Add the new numeric salary columns to the main DataFrame
    df = pd.concat([df, salary_extracted], axis=1)

    # Midpoint of the range (equals min_salary when there is no upper bound).
    df['average_salary'] = df[['min_salary', 'max_salary']].mean(axis=1)

    # NOTE(review): the output path has no ".csv" extension; kept as-is
    # because other code may read this exact file name.
    df.to_csv('cleaned_data', index=False)