In [1]:
# Reading in dataset
import pandas as pd

df = pd.read_csv("AI-ML Salary Dataset.csv")
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2022,MI,FT,Machine Learning Engineer,130000,USD,130000,US,0,US,M
1,2022,MI,FT,Machine Learning Engineer,90000,USD,90000,US,0,US,M
2,2022,MI,FT,Data Scientist,120000,USD,120000,US,100,US,M
3,2022,MI,FT,Data Scientist,100000,USD,100000,US,100,US,M
4,2022,MI,FT,Data Scientist,85000,USD,85000,US,100,US,M


# Checking for missing values

In [2]:
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

## There are no missing values, so great!

In [3]:
# Encoding string fields

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df["job_title"] = le.fit_transform(df["job_title"])
df["employee_residence"] = le.fit_transform(df["employee_residence"])
df["company_location"] = le.fit_transform(df["company_location"])
df["company_size"] = le.fit_transform(df["company_size"])
df["experience_level"] = le.fit_transform(df["experience_level"])
df["employment_type"] = le.fit_transform(df["employment_type"])

df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2022,2,2,47,130000,USD,130000,62,0,57,1
1,2022,2,2,47,90000,USD,90000,62,0,57,1
2,2022,2,2,30,120000,USD,120000,62,100,57,1
3,2022,2,2,30,100000,USD,100000,62,100,57,1
4,2022,2,2,30,85000,USD,85000,62,100,57,1


# Checking and sorting out data types

In [4]:
df.dtypes

work_year              int64
experience_level       int64
employment_type        int64
job_title              int64
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence     int64
remote_ratio           int64
company_location       int64
company_size           int64
dtype: object

In [5]:
df["work_year"] = df["work_year"].astype("category")
df["experience_level"] = df["experience_level"].astype("category")
df["employment_type"] = df["employment_type"].astype("category")
df["job_title"] = df["job_title"].astype("category")
df["employee_residence"] = df["employee_residence"].astype("category")
df["salary_in_usd"] = df["salary_in_usd"].astype("float64")
df["remote_ratio"] = df["remote_ratio"].astype("float64")
df["company_location"] = df["company_location"].astype("category")
df["company_size"] = df["company_size"].astype("category")

df.dtypes

work_year             category
experience_level      category
employment_type       category
job_title             category
salary                   int64
salary_currency         object
salary_in_usd          float64
employee_residence    category
remote_ratio           float64
company_location      category
company_size          category
dtype: object

# Normalising and splitting data

In [32]:
from sklearn.model_selection import train_test_split

# Defining X and y
X = df[["work_year", "experience_level", "employment_type", "job_title", "employee_residence", "remote_ratio", "company_location",
        "company_size"]]
y = df["salary_in_usd"]

# Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
y_scaled = scaler.fit_transform([y])

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled.reshape(-1,1), test_size = 0.4)

# Building the model

In [33]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()

lr_model.fit(X_train, y_train)
lr_model.score(X_test, y_test)

1.0