<a href="https://colab.research.google.com/github/GiorgosNik/dev-salary-estimator/blob/main/salary_estimator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [30]:
%pip install -U -q PyDrive
%pip install -U -q geocoder
%pip install -U -q tqdm
%pip install -U -q tensorflow

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [31]:
import tensorflow as tf
import pandas as pd
import numpy as np
from collections import Counter
from geopy.geocoders import Nominatim
from tqdm import tqdm
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# Import and Format Data

## Data Import from GitHub repo
The retrieved .csv file is stored in a pandas df.

In [32]:
# Retrieve the dataset csv from GitHub
csv_url = "https://raw.githubusercontent.com/GiorgosNik/dev-salary-estimator/main/salary_data_2022.csv"
file_path = tf.keras.utils.get_file("salary_report", csv_url)
df = pd.read_csv(file_path)


In [33]:
df.columns = [
    "timestamp",
    "devtype",
    "languages",
    "years_experience",
    "personal_projects",
    "sex",
    "remote",
    "city_residence",
    "city_work",
    "company_size",
    "supervisor",
    "education",
    "relevant",
    "salary",
]

# Remove the timestamp as it is not relevant
df = df.drop(columns=["timestamp"])

pd.options.mode.chained_assignment = None

dataset_size = len(df)
print("The dataset contains {} salary entries".format(dataset_size))
df.head(2)


The dataset contains 807 salary entries


Unnamed: 0,devtype,languages,years_experience,personal_projects,sex,remote,city_residence,city_work,company_size,supervisor,education,relevant,salary
0,"Backend, Frontend","JavaScript, PHP",13,Ναι,Άντρας,Και τα δύο,Αθήνα,Αθήνα,501+,Όχι,Bachelor's,Ναι,39361.0
1,"Backend, Frontend",JavaScript,7,Ναι,Άντρας,Απομακρυσμένα,Αθήνα,Αθήνα,201 - 500,Όχι,Bachelor's,Ναι,30142.0


## Remove entries that contain very rare developer types or languages
Entries that occur under 5 times are considered rare.

In [34]:
UNK = "unk"


def categorize_clean_columns(df, col_name, threshold):
    df[col_name] = df[col_name].map(lambda x: x.replace(" ", "").split(","))
    c = Counter([item for sublist in df[col_name].values for item in sublist])

    map_to_unk = set([el for el in c.elements() if c[el] <= threshold])

    def map_devtype_unk(arr):
        def x_or_unk(x):
            if x in map_to_unk:
                return UNK
            return x

        return [x_or_unk(x) for x in arr]

    df[col_name] = df[col_name].map(lambda x: map_devtype_unk(x))

    def is_sole_unknown(arr):
        return arr[0] == UNK and len(arr) == 1

    return df[df[col_name].map(is_sole_unknown) == False]


In [35]:
def fix_devtype(dev_types):
    return list(filter(lambda value: value != UNK, dev_types))


def fix_languages(languages):
    for i in range(len(languages)):
        if languages[i] == "Typescript":
            languages[i] = "TypeScript"
        languages[i] = languages[i].strip()
    return list(filter(lambda value: value != UNK, languages))


In [36]:
df = categorize_clean_columns(df, "devtype", 5)
df = categorize_clean_columns(df, "languages", 5)

df["languages"] = df["languages"].map(lambda x: fix_languages(x))
df["devtype"] = df["devtype"].map(lambda x: fix_devtype(x))

df = df[df["languages"].map(lambda d: len(d)) > 0]
df[df["devtype"].map(lambda d: len(d)) > 0]

print("After removing rare entries, the dataset contains {} entries.".format(len(df)))
print("This is {} less than the initial dataset.".format(dataset_size - len(df)))


After removing rare entries, the dataset contains 737 entries.
This is 70 less than the initial dataset.


## Fix Salary
Normalize the salary input. Remove €/$ signs, commas and period signs.
For values under 4000, consider the user gave a montly salary by mistake, and calculate the yearly salary by multiplying by 14 salaries.  

In [37]:
def fix_salary(salary):
    salary = salary.replace(".", "")
    salary = salary.replace(",", "")
    salary = salary.replace("€", "")
    salary = salary.replace("$", "")
    salary = int(salary)
    if salary < 4000:
        salary = salary * 14

    return salary


df["salary"] = df["salary"].map(lambda x: fix_salary(str(int(x)))).astype("int32")
df.head(5)


Unnamed: 0,devtype,languages,years_experience,personal_projects,sex,remote,city_residence,city_work,company_size,supervisor,education,relevant,salary
0,"[Backend, Frontend]","[JavaScript, PHP]",13,Ναι,Άντρας,Και τα δύο,Αθήνα,Αθήνα,501+,Όχι,Bachelor's,Ναι,39361
1,"[Backend, Frontend]",[JavaScript],7,Ναι,Άντρας,Απομακρυσμένα,Αθήνα,Αθήνα,201 - 500,Όχι,Bachelor's,Ναι,30142
2,"[Backend, Frontend, BI]","[Python, JavaScript, PHP]",1,Όχι,Άντρας,Απομακρυσμένα,Κέρκυρα,Κέρκυρα,1 - 10,Όχι,Bachelor's,Ναι,8450
3,[Backend],"[C#, JavaScript]",2,Όχι,Άντρας,Απομακρυσμένα,Edinburgh,Edinburgh,501+,Όχι,Master,Ναι,30551
4,[Mobileapps],[Kotlin],3,Ναι,Άντρας,Απομακρυσμένα,Αθήνα,Αθήνα,51 - 100,Ναι,Bachelor's,Ναι,20351


## Consider only results from Greece
Using geopy, use the given city names to obtain the country of each response.
Reject all values not relevant to Greece due to low sample size.

In [38]:
cities_residence, cities_work = pd.unique(df["city_residence"].values), pd.unique(df["city_work"].values)

geo_locator = Nominatim(user_agent="test")

with tqdm(total=len(cities_residence), desc="Formatting City Names") as city_progressbar:
    for city in cities_residence:
        try:
            country = geo_locator.geocode(city).address.split(",")[-1].strip()
            df["city_residence"] = df["city_residence"].replace(city, country)

        except AttributeError:
            df["city_residence"] = df["city_residence"].replace(city, "UNK")
        city_progressbar.update(1)

df = df[df.city_residence == "Ελλάς"]

df = df.drop(columns=["city_residence", "city_work"])
df.head(2)


Formatting City Names: 100%|██████████| 83/83 [00:41<00:00,  2.00it/s]


Unnamed: 0,devtype,languages,years_experience,personal_projects,sex,remote,company_size,supervisor,education,relevant,salary
0,"[Backend, Frontend]","[JavaScript, PHP]",13,Ναι,Άντρας,Και τα δύο,501+,Όχι,Bachelor's,Ναι,39361
1,"[Backend, Frontend]",[JavaScript],7,Ναι,Άντρας,Απομακρυσμένα,201 - 500,Όχι,Bachelor's,Ναι,30142


## Replace NA values
Replace NA values regarding relevant projects with "No"

In [39]:
df["relevant"] = df["relevant"].fillna("Όχι")


# Define Feature Columns

In [40]:
category_columns = [
    "company_size",
    "remote",
    "supervisor",
    "personal_projects",
    "sex",
    "education",
    "relevant",
]

for col in category_columns:
    df[f"{col}_xf"] = df[col].astype("category")

df = df.drop(columns=category_columns)


In [41]:
multi_category_columns = ["devtype", "languages"]


def col_title(col, word):
    return f"{col}_{word}"


for col in multi_category_columns:
    vocab = set([item for sublist in df[col].values for item in sublist])
    for word in vocab:
        df[col_title(col, word)] = 0
        df[col_title(col, word)] = df[col_title(col, word)].astype("int32")
    print(vocab)

for index, row in df.iterrows():
    for col in multi_category_columns:
        words = row[col]
        for word in words:
            df.loc[index, col_title(col, word)] = 1

bad_tf_scope_names = [("languages_C#", "languages_Csharp"), ("languages_C++", "languages_Cpp")]
for before, after in bad_tf_scope_names:
    df[after] = df[before]
    df = df.drop(columns=[before])

df = df.drop(columns=multi_category_columns)


{'AI/ML', 'BI', 'Frontend', 'Embedded', 'Mobileapps', 'Cybersecurity', 'Backend', 'Desktopapps', 'DevOps', 'Gaming'}
{'TypeScript', 'Go', 'C', 'Java', 'Ruby', 'SQL', 'PHP', 'Swift', 'C#', 'Kotlin', 'Bash', 'Python', 'C++', 'JavaScript'}


## Split the dataset into training and validation sets

In [42]:
train, val = train_test_split(df, test_size=0.2)
print(len(train), "train examples")
print(len(val), "validation examples")


544 train examples
136 validation examples


In [43]:
def df_to_dataset(df, shuffle=True, batch_size=32):
    df = df.copy()
    labels = df.pop("salary")
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))
    ds = ds.batch(batch_size)
    return ds


train_ds = df_to_dataset(train, batch_size=32)
eval_ds = df_to_dataset(train, batch_size=32, shuffle=True)


In [44]:
for feature_batch, label_batch in train_ds.take(1):
    feature_column_keys = list(feature_batch.keys())
    print("Every feature:", feature_column_keys)


Every feature: ['years_experience', 'company_size_xf', 'remote_xf', 'supervisor_xf', 'personal_projects_xf', 'sex_xf', 'education_xf', 'relevant_xf', 'devtype_AI/ML', 'devtype_BI', 'devtype_Frontend', 'devtype_Embedded', 'devtype_Mobileapps', 'devtype_Cybersecurity', 'devtype_Backend', 'devtype_Desktopapps', 'devtype_DevOps', 'devtype_Gaming', 'languages_TypeScript', 'languages_Go', 'languages_C', 'languages_Java', 'languages_Ruby', 'languages_SQL', 'languages_PHP', 'languages_Swift', 'languages_Kotlin', 'languages_Bash', 'languages_Python', 'languages_JavaScript', 'languages_Csharp', 'languages_Cpp']


2023-04-15 17:26:23.869859: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_13' with dtype int32 and shape [544]
	 [[{{node Placeholder/_13}}]]
2023-04-15 17:26:23.870857: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_14' with dtype int32 and shape [544]
	 [[{{node Placeholder/_14}}]]


## Encode features into columns

In [45]:
feature_columns = []

numeric_features = [
    x for x in feature_column_keys if ("devtype_" in x or "languages_" in x or "years_experience" in x)
]
categorical_features = [x for x in feature_column_keys if "_xf" in x]

for feature in numeric_features:
    feature_columns.append(feature_column.numeric_column(feature))

for feature in categorical_features:
    categorical_column = feature_column.categorical_column_with_vocabulary_list(
        feature, pd.unique(df[feature].values)
    )

    feature_columns.append(feature_column.indicator_column(categorical_column))

print(feature_columns)


Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
[NumericColumn(key='years_experience', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_AI/ML', shape=(1,), default_value=None, dtype=tf.float32, norm

# Train the model

In [46]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

model = tf.keras.Sequential([feature_layer, layers.Dense(8, activation="relu"), layers.Dense(1)])

model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9), loss="mae", metrics=["mae"])

callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2),
    tf.keras.callbacks.ModelCheckpoint(filepath="model.{epoch:02d}-{val_loss:.2f}.h5"),
    tf.keras.callbacks.TensorBoard(log_dir="./logs"),
]
print(train_ds)
model.fit(train_ds, validation_data=eval_ds, epochs=100, callbacks=callbacks)


<_BatchDataset element_spec=({'years_experience': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'company_size_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'remote_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'supervisor_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'personal_projects_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'sex_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'education_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'relevant_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'devtype_AI/ML': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_BI': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_Frontend': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_Embedded': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_Mobileapps': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_Cybersecurity': TensorSpe

2023-04-15 17:26:24.217719: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_11' with dtype string and shape [544]
	 [[{{node Placeholder/_11}}]]
2023-04-15 17:26:24.219145: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_28' with dtype string and shape [544]
	 [[{{node Placeholder/_28}}]]




2023-04-15 17:26:24.869309: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [544]
	 [[{{node Placeholder/_0}}]]
2023-04-15 17:26:24.870012: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [544]
	 [[{{node Placeholder/_1}}]]


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


<keras.callbacks.History at 0x7f42f06fcdf0>

# Try your values

In [48]:
input = {
    'years_experience': 0.5,
    'company_size_xf': '11-50',
    'education_xf': "Bachelor's",
    'relevant_xf': "Ναι",
    'personal_projects_xf': 'Ναι',
    'remote_xf': 'Και τα δύο',
    'supervisor_xf': 'Ναι',
    'sex_xf': 'Άντρας',
    'devtype_Backend': 1 ,              
    'devtype_Desktopapps': 0,         
    'devtype_DevOps': 0,            
    'devtype_AI/ML': 0 ,       
    'devtype_BI': 0 ,
    'devtype_Cybersecurity': 0,
    'devtype_Embedded': 0,
    'devtype_Gaming': 0,                     
    'devtype_Frontend': 1,        
    'devtype_Mobileapps': 0,           
    'languages_C': 0 ,
    'languages_SQL': 0,              
    'languages_PHP': 0 ,             
    'languages_JavaScript': 1 ,         
    'languages_Kotlin' : 0 ,
    'languages_TypeScript' : 0,              
    'languages_Python': 1 ,            
    'languages_Ruby': 0 ,               
    'languages_Bash': 0,             
    'languages_Go': 0 ,             
    'languages_Java': 0 ,            
    'languages_Swift': 0 ,            
    'languages_Csharp': 0 ,         
    'languages_Cpp': 0 , 
}

input = {k: [v] for k, v in input.items()}

prediction = model(input).numpy()[0][0]
f'Βγάζεις: {prediction} ευρώ το χρόνο'



'Βγάζεις: 15461.072265625 ευρώ το χρόνο'