In [6]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [7]:
# Read the salary dataset from the working directory, then preview its first five rows.
df = pd.read_csv('salaries.csv')
df.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M
1,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M
2,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M
3,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M
4,2025,SE,FT,Engineer,143000,USD,143000,US,0,US,M


In [8]:
# Remove the local-currency salary and its currency code; 'salary_in_usd' is kept.
df = df.drop(columns=['salary', 'salary_currency'])


In [10]:
# Express the salary in SAR: multiply the USD figure by a fixed rate, store it
# as 'salary', and remove the USD column. If the USD column is absent, only a
# message is printed and the frame is left unchanged.
if 'salary_in_usd' not in df.columns:
    print("'salary_in_usd' column not found in DataFrame.")
else:
    usd_to_sar_rate = 3.75
    df = (
        df
        .assign(salary=df['salary_in_usd'] * usd_to_sar_rate)
        .drop(columns='salary_in_usd')
    )


In [12]:
# Categorical columns that each receive an integer-coded companion column
columns_to_encode = [
    'experience_level',
    'employment_type',
    'job_title',
    'employee_residence',
    'company_location',
    'company_size',
]

for col in columns_to_encode:
    # Store the column's category codes under "<column>_encoded"
    encoded_col = col + '_encoded'
    df[encoded_col] = df[col].astype('category').cat.codes

    # Reorder so the codes column sits immediately after the column it encodes
    cols = df.columns.tolist()
    col_index = cols.index(col)
    cols.remove(encoded_col)
    cols.insert(col_index + 1, encoded_col)
    df = df[cols]

# Display the resulting frame
df

Unnamed: 0,work_year,experience_level,experience_level_encoded,employment_type,employment_type_encoded,job_title,job_title_encoded,employee_residence,employee_residence_encoded,remote_ratio,company_location,company_location_encoded,company_size,company_size_encoded,salary
0,2025,SE,3,FT,2,Data Product Owner,133,US,88,0,US,82,M,1,637500.00
1,2025,SE,3,FT,2,Data Product Owner,133,US,88,0,US,82,M,1,412500.00
2,2025,SE,3,FT,2,Data Product Owner,133,US,88,0,US,82,M,1,637500.00
3,2025,SE,3,FT,2,Data Product Owner,133,US,88,0,US,82,M,1,412500.00
4,2025,SE,3,FT,2,Engineer,169,US,88,0,US,82,M,1,536250.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73143,2020,SE,3,FT,2,Data Scientist,142,US,88,100,US,82,L,0,1545000.00
73144,2021,MI,2,FT,2,Principal Data Scientist,236,US,88,100,US,82,L,0,566250.00
73145,2020,EN,0,FT,2,Data Scientist,142,US,88,100,US,82,S,2,393750.00
73146,2020,EN,0,CT,0,Business Data Analyst,57,US,88,100,US,82,L,0,375000.00


In [14]:
# Print the frame's structural summary (index range, per-column dtype and non-null count).
df.info()
# Summary statistics; as the cell's last expression, this is what the notebook renders.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73148 entries, 0 to 73147
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   work_year                   73148 non-null  int64  
 1   experience_level            73148 non-null  object 
 2   experience_level_encoded    73148 non-null  int8   
 3   employment_type             73148 non-null  object 
 4   employment_type_encoded     73148 non-null  int8   
 5   job_title                   73148 non-null  object 
 6   job_title_encoded           73148 non-null  int16  
 7   employee_residence          73148 non-null  object 
 8   employee_residence_encoded  73148 non-null  int8   
 9   remote_ratio                73148 non-null  int64  
 10  company_location            73148 non-null  object 
 11  company_location_encoded    73148 non-null  int8   
 12  company_size                73148 non-null  object 
 13  company_size_encoded        731

Unnamed: 0,work_year,experience_level_encoded,employment_type_encoded,job_title_encoded,employee_residence_encoded,remote_ratio,company_location_encoded,company_size_encoded,salary
count,73148.0,73148.0,73148.0,73148.0,73148.0,73148.0,73148.0,73148.0,73148.0
mean,2023.831192,2.378384,1.9981,151.118158,82.086851,21.582955,76.57154,0.969897,592551.6
std,0.477551,0.916602,0.103709,75.977281,18.843257,41.023051,17.400197,0.186555,271879.9
min,2020.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56250.0
25%,2024.0,2.0,2.0,95.0,88.0,0.0,82.0,1.0,400837.5
50%,2024.0,3.0,2.0,142.0,88.0,0.0,82.0,1.0,553125.0
75%,2024.0,3.0,2.0,207.0,88.0,0.0,82.0,1.0,748875.0
max,2025.0,3.0,3.0,288.0,92.0,100.0,85.0,2.0,3000000.0


In [15]:
# Report the frame's shape and how many missing entries each column holds.
print("Data dimensions:", df.shape)

null_counts = df.isna().sum()
print("Missing values per column:\n", null_counts)

# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

Data dimensions: (73148, 15)
Missing values per column:
 work_year                     0
experience_level              0
experience_level_encoded      0
employment_type               0
employment_type_encoded       0
job_title                     0
job_title_encoded             0
employee_residence            0
employee_residence_encoded    0
remote_ratio                  0
company_location              0
company_location_encoded      0
company_size                  0
company_size_encoded          0
salary                        0
dtype: int64
