In [1]:
# Step 1: Import required library
import pandas as pd

# Step 2: Load the dataset
df = pd.read_csv("Software Engineer Salaries.csv")

# Step 3: See the shape and first few rows
print("Rows & Columns:", df.shape)
df.head()


Rows & Columns: (870, 6)


Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.)
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.)
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.)
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.)
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.)


In [2]:
# Step 4: Check missing data
df.isnull().sum()


Company            2
Company Score     81
Job Title          0
Location          13
Date               0
Salary           106
dtype: int64

In [3]:
# Step 5: Remove duplicate rows (same job repeated)
print("Before removing duplicates:", df.shape)
df = df.drop_duplicates()
print("After removing duplicates:", df.shape)


Before removing duplicates: (870, 6)
After removing duplicates: (870, 6)


In [4]:
import re
import numpy as np

# Step 6: Create a new clean salary column
def clean_salary(s):
    if pd.isna(s):
        return np.nan
    s = str(s)
    # 1. Remove extra text like (Glassdoor est.)
    s = re.sub(r'\(.*?\)', '', s)
    # 2. Remove $ and commas
    s = s.replace('$', '').replace(',', '').strip()
    # 3. Remove non-breaking spaces
    s = s.replace('\xa0', '')
    # 4. Split if it's a range like "68K - 94K"
    parts = s.split('-')
    nums = []
    for p in parts:
        p = p.strip()
        match = re.search(r'(\d+)([kK]?)', p)
        if match:
            val = float(match.group(1))
            if match.group(2).lower() == 'k':
                val *= 1000
            nums.append(val)
    # Take average of range or single value
    if len(nums) == 1:
        return nums[0]
    elif len(nums) == 2:
        return sum(nums) / 2
    else:
        return np.nan

df['Cleaned_Salary'] = df['Salary'].apply(clean_salary)
df[['Salary', 'Cleaned_Salary']].head(10)


Unnamed: 0,Salary,Cleaned_Salary
0,$68K - $94K (Glassdoor est.),81000.0
1,$61K - $104K (Employer est.),82500.0
2,$95K - $118K (Glassdoor est.),106500.0
3,$97K - $145K (Employer est.),121000.0
4,$85K - $108K (Glassdoor est.),96500.0
5,$123K - $175K (Employer est.),149000.0
6,$77K - $94K (Glassdoor est.),85500.0
7,$71K - $100K (Glassdoor est.),85500.0
8,$94K - $148K (Glassdoor est.),121000.0
9,$147K - $189K (Employer est.),168000.0


In [5]:
# Step 7: Replace missing Location or Job Title with defaults
df['Location'] = df['Location'].fillna('Unknown')
df['Job Title'] = df['Job Title'].fillna('Not Mentioned')


In [6]:
# Step 8: Verify cleaned salary
print(df['Cleaned_Salary'].describe())


count       764.000000
mean     125179.323953
std       51226.035383
min          15.000000
25%       97000.000000
50%      116500.000000
75%      147500.000000
max      490000.000000
Name: Cleaned_Salary, dtype: float64


In [7]:
# Step 9: Create Experience Level based on salary range
def experience_level(salary):
    if pd.isna(salary):
        return 'Unknown'
    elif salary < 70000:
        return 'Junior'
    elif salary < 120000:
        return 'Mid Level'
    else:
        return 'Senior'

df['Experience_Level'] = df['Cleaned_Salary'].apply(experience_level)
df[['Cleaned_Salary', 'Experience_Level']].head(10)


Unnamed: 0,Cleaned_Salary,Experience_Level
0,81000.0,Mid Level
1,82500.0,Mid Level
2,106500.0,Mid Level
3,121000.0,Senior
4,96500.0,Mid Level
5,149000.0,Senior
6,85500.0,Mid Level
7,85500.0,Mid Level
8,121000.0,Senior
9,168000.0,Senior


In [8]:
# Step 10: Group by location to find average salaries
avg_salary = df.groupby('Location')['Cleaned_Salary'].mean().reset_index()
avg_salary.head()


Unnamed: 0,Location,Cleaned_Salary
0,"Aberdeen Proving Ground, MD",132500.0
1,"Agoura Hills, CA",80000.0
2,"Aguadilla, PR",113000.0
3,"Aiken, SC",90000.0
4,Alabama,89500.0


In [9]:
# Step 11: Average salary by experience
exp_summary = df.groupby('Experience_Level')['Cleaned_Salary'].mean().reset_index()
exp_summary


Unnamed: 0,Experience_Level,Cleaned_Salary
0,Junior,20564.192308
1,Mid Level,98793.38843
2,Senior,162908.839779
3,Unknown,


In [10]:
# Step 12: Save the cleaned dataset
df.to_csv("Cleaned_Software_Engineer_Salaries.csv", index=False)
print("✅ Cleaned dataset saved successfully!")


✅ Cleaned dataset saved successfully!
