In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib
import re

In [15]:
df = pd.read_csv("2022_kaggle_survey_results_public.csv", low_memory=False)
print(df.shape)
df.head()

(23997, 9)


Unnamed: 0,age,gender,country,highest_deg,code_experience,current_title,industry,company_size,annual_comp
0,30-34,Man,India,,,,,,
1,30-34,Man,Algeria,Master’s degree,1-3 years,,,,
2,18-21,Man,Egypt,Bachelor’s degree,1-3 years,,,,
3,55-59,Man,France,Some college/university study without earning ...,10-20 years,Data Scientist,Online Service/Internet-based Services,0-49 employees,"25,000-29,999"
4,45-49,Man,India,Bachelor’s degree,5-10 years,,,,


In [16]:
def salary_to_number(val):
    if pd.isna(val):
        return None

    # make it a string
    s = str(val).strip()

    # remove leading $
    if s.startswith("$"):
        s = s[1:]

    # remove spaces
    s = s.replace(" ", "")

    # if it's a range like 25000-29999 or 25,000-29,999
    if "-" in s:
        # remove commas
        s = s.replace(",", "")
        low, high = s.split("-")
        try:
            low = float(low)
            high = float(high)
            return (low + high) / 2
        except:
            return None

    # if it's just a number with commas, like 50,000
    s = s.replace(",", "")
    # some rows might be just "0"
    try:
        return float(s)
    except:
        return None

# apply it
df["annual_comp"] = df["annual_comp"].apply(salary_to_number)

# drop rows where we still couldn't get a number
df = df.dropna(subset=["annual_comp"]).copy()

print(df.shape)
df[["annual_comp"]].head(20)

(8113, 9)


Unnamed: 0,annual_comp
3,27499.5
7,112499.5
8,112499.5
13,224999.5
16,224999.5
17,174999.5
18,94999.5
19,34999.5
20,34999.5
25,34999.5


In [17]:
cols_to_encode = [
    "age",
    "gender",
    "country",
    "highest_deg",
    "code_experience",
    "current_title",
    "company_size"
]

for col in cols_to_encode:
    df.loc[:, col] = df[col].astype("category").cat.codes

  df.loc[:, col] = df[col].astype("category").cat.codes
  df.loc[:, col] = df[col].astype("category").cat.codes
  df.loc[:, col] = df[col].astype("category").cat.codes
  df.loc[:, col] = df[col].astype("category").cat.codes
  df.loc[:, col] = df[col].astype("category").cat.codes
  df.loc[:, col] = df[col].astype("category").cat.codes
  df.loc[:, col] = df[col].astype("category").cat.codes


In [18]:
feature_cols = [
    "age",
    "gender",
    "country",
    "highest_deg",
    "code_experience",
    "current_title",
    "company_size"
]

X = df[feature_cols]
y = df["annual_comp"]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

In [20]:
sample_input = {
    "age": 7,
    "gender": 0,
    "country": 55,
    "highest_deg": 3,
    "code_experience": 4,
    "current_title": 13,
    "company_size": 2
}

sample_df = pd.DataFrame([sample_input])
predicted_salary = model.predict(sample_df)[0]
print("Predicted salary:", predicted_salary)

Predicted salary: 106520.63087166785


In [21]:
joblib.dump(model, "salary_predict_model.ml")

['salary_predict_model.ml']