<a href="https://colab.research.google.com/github/GiorgosNik/dev-salary-estimator/blob/main/salary_estimator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [318]:
!pip install -U -q PyDrive
!pip install -U -q geocoder
!pip install -U -q tqdm
!pip install -U -q tensorflow

In [319]:
import tensorflow as tf
import pandas as pd
import numpy as np
from collections import Counter
from geopy.geocoders import Nominatim
from tqdm import tqdm
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# CSV import from Google Drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Import and Format Data

## Data Import from Google Drive
Authenticate with GoogleAuth to retrieve the data .csv from Google Drive.
The retrieved .csv file is stored in a pandas dataframe.

In [320]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# ID of Google Drive .csv document
id = "1cPsSR9XfyqOl15KqGV3BEqaxmL1R7lqq"
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('Filename.csv')  
dataframe = pd.read_csv('Filename.csv')

In [321]:
dataframe.columns = ['timestamp', 'devtype', 'languages', 'years_experience','personal_projects', 'sex', 'remote','city_residence', 'city_work', 'company_size',  'supervisor', 'education', 'relevant', 'salary']

# Remove the timestamp as it is irelevant
dataframe = dataframe.drop(columns=['timestamp'])

print("The dataset contains {} salary entries".format( len(dataframe)))
dataframe.head(2)

The dataset contains 807 salary entries


Unnamed: 0,devtype,languages,years_experience,personal_projects,sex,remote,city_residence,city_work,company_size,supervisor,education,relevant,salary
0,"Backend, Frontend","JavaScript, PHP",13,Ναι,Άντρας,Και τα δύο,Αθήνα,Αθήνα,501+,Όχι,Bachelor's,Ναι,39361.0
1,"Backend, Frontend",JavaScript,7,Ναι,Άντρας,Απομακρυσμένα,Αθήνα,Αθήνα,201 - 500,Όχι,Bachelor's,Ναι,30142.0


## Remove entries that contain very rare developer types
Entries that occur under 5 times are considered rare.

In [322]:
UNK = 'unk'

def categorize_clean_columns(df, colname, threshold):
    df[colname] = df[colname].map(lambda x: x.replace(' ', '').split(','))
    c = Counter([item for sublist in df[colname].values for item in sublist])

    map_to_unk = set([el for el in c.elements() if c[el] <= threshold])

    def map_devtype_unk(arr):
      def x_or_unk(x):
        if x in map_to_unk:
          return UNK
        return x

      return [x_or_unk(x) for x in arr]

    df[colname] = df[colname].map(lambda x: map_devtype_unk(x))

    def is_sole_uknown(arr):
      return arr[0] == UNK and len(arr) == 1

    return df[df[colname].map(is_sole_uknown) == False]

In [323]:
dataframe = categorize_clean_columns(dataframe, 'devtype', 5)
dataframe = categorize_clean_columns(dataframe, 'languages', 5)

len(dataframe) # ~ 20 rows with few entries

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colname] = df[colname].map(lambda x: x.replace(' ', '').split(','))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colname] = df[colname].map(lambda x: map_devtype_unk(x))


738

In [324]:
def fix_salary(salary):
    salary = salary.replace('.', '')
    salary = salary.replace(',', '')
    salary = salary.replace('€', '')
    salary = int(salary)
    if salary < 4000:
      salary = salary * 14
    
    return salary

dataframe['salary'] = dataframe['salary'].map(lambda x: fix_salary(str(int(x)))).astype('int32')
dataframe.head(2)

Unnamed: 0,devtype,languages,years_experience,personal_projects,sex,remote,city_residence,city_work,company_size,supervisor,education,relevant,salary
0,"[Backend, Frontend]","[JavaScript, PHP]",13,Ναι,Άντρας,Και τα δύο,Αθήνα,Αθήνα,501+,Όχι,Bachelor's,Ναι,39361
1,"[Backend, Frontend]",[JavaScript],7,Ναι,Άντρας,Απομακρυσμένα,Αθήνα,Αθήνα,201 - 500,Όχι,Bachelor's,Ναι,30142


In [325]:
cities_residence, cities_work = pd.unique(dataframe['city_residence'].values), pd.unique(dataframe['city_work'].values)

In [326]:
geolocator = Nominatim(user_agent="test")

# TODO: experiment with country
with tqdm(total=len(cities_residence), desc='Formatting City Names') as city_progressbar:
  for city in cities_residence:
    try:
      area = geolocator.geocode(city).address.split(",")[0]
      country = geolocator.geocode(city).address.split(",")[-1].strip()
      dataframe['city_residence'] = dataframe['city_residence'].replace(city,country)

    except AttributeError:
      dataframe['city_residence'] = dataframe['city_residence'].replace(city,"UNK")
    city_progressbar.update(1)

dataframe = dataframe[dataframe.years_experience != "UNK"]
dataframe = dataframe[dataframe.city_residence == "Ελλάς"]
print("##### Unique Cities #####")
pd.unique(dataframe['city_residence'].values)

dataframe = dataframe.drop(columns=['city_residence', 'city_work'])
dataframe.head(2)

Formatting City Names: 100%|██████████| 83/83 [01:21<00:00,  1.01it/s]

##### Unique Cities #####





Unnamed: 0,devtype,languages,years_experience,personal_projects,sex,remote,company_size,supervisor,education,relevant,salary
0,"[Backend, Frontend]","[JavaScript, PHP]",13,Ναι,Άντρας,Και τα δύο,501+,Όχι,Bachelor's,Ναι,39361
1,"[Backend, Frontend]",[JavaScript],7,Ναι,Άντρας,Απομακρυσμένα,201 - 500,Όχι,Bachelor's,Ναι,30142


In [327]:
dataframe['relevant'] = dataframe['relevant'].fillna('Όχι')

In [328]:
category_columns = ['company_size', 'remote', 'supervisor', 'personal_projects', 'sex','education','relevant']

for col in category_columns:
    dataframe[f'{col}_xf'] = dataframe[col].astype('category')

dataframe = dataframe.drop(columns=category_columns)

In [329]:
dataframe['languages'] = dataframe['languages'].replace("Typescript","TypeScript")

def fix_languages(languages):
    for i in range(len(languages)):
      if(languages[i] == "Typescript"):
        languages[i] = "TypeScript"
      languages[i] = languages[i].strip()
    return languages
    
dataframe['languages'] = dataframe['languages'].map(lambda x: fix_languages(x))

temp = dataframe[dataframe.languages == "Typescript"]
temp.head(2)

Unnamed: 0,devtype,languages,years_experience,salary,company_size_xf,remote_xf,supervisor_xf,personal_projects_xf,sex_xf,education_xf,relevant_xf


In [330]:
multi_category_columns = ['devtype', 'languages']

def coltitle(col, word):
    return f'{col}_{word}'

for col in multi_category_columns:
    vocab = set([item for sublist in dataframe[col].values for item in sublist])
    for word in vocab:
        dataframe[coltitle(col, word)] = 0
        dataframe[coltitle(col, word)] = dataframe[coltitle(col, word)].astype('int32')
    print(vocab)

for index, row in dataframe.iterrows():
    for col in multi_category_columns:
        words = row[col]
        for word in words:
            dataframe.loc[index, coltitle(col, word)] = 1

bad_tf_scope_names = [('languages_C#', 'languages_Csharp'), ('languages_C++', 'languages_Cpp')]
for before, after in bad_tf_scope_names:
    dataframe[after] = dataframe[before]
    dataframe = dataframe.drop(columns=[before])


{'Gaming', 'Frontend', 'Cybersecurity', 'DevOps', 'Embedded', 'BI', 'unk', 'AI/ML', 'Desktopapps', 'Mobileapps', 'Backend'}
{'JavaScript', 'Python', 'TypeScript', 'C#', 'C++', 'SQL', 'C', 'Swift', 'unk', 'Bash', 'Ruby', 'Go', 'PHP', 'Kotlin', 'Java'}


In [331]:
dataframe = dataframe.drop(columns=multi_category_columns)

In [332]:
train, val = train_test_split(dataframe, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')

544 train examples
137 validation examples


In [333]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('salary')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
      ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

train_ds = df_to_dataset(train, batch_size=32)
eval_ds = df_to_dataset(train, batch_size=32, shuffle=True)

In [334]:
for feature_batch, label_batch in train_ds.take(1):
  feature_column_keys = list(feature_batch.keys())
  print('Every feature:', feature_column_keys)

Every feature: ['years_experience', 'company_size_xf', 'remote_xf', 'supervisor_xf', 'personal_projects_xf', 'sex_xf', 'education_xf', 'relevant_xf', 'devtype_Gaming', 'devtype_Frontend', 'devtype_Cybersecurity', 'devtype_DevOps', 'devtype_Embedded', 'devtype_BI', 'devtype_unk', 'devtype_AI/ML', 'devtype_Desktopapps', 'devtype_Mobileapps', 'devtype_Backend', 'languages_JavaScript', 'languages_Python', 'languages_TypeScript', 'languages_SQL', 'languages_C', 'languages_Swift', 'languages_unk', 'languages_Bash', 'languages_Ruby', 'languages_Go', 'languages_PHP', 'languages_Kotlin', 'languages_Java', 'languages_Csharp', 'languages_Cpp']


In [335]:
feature_columns = []

numeric_features = [x for x in feature_column_keys if ('devtype_' in x or 'languages_' in x or 'years_experience' in x)]
categorical_features = [x for x in feature_column_keys if '_xf' in x]

for feature in numeric_features:
    feature_columns.append(feature_column.numeric_column(feature))

for feature in categorical_features:
    categorical_column = feature_column.categorical_column_with_vocabulary_list(
      feature, pd.unique(dataframe[feature].values))

    feature_columns.append(feature_column.indicator_column(categorical_column))

print(feature_columns)

[NumericColumn(key='years_experience', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_Gaming', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_Frontend', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_Cybersecurity', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_DevOps', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_Embedded', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_BI', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_unk', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_AI/ML', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='devtype_Desk

In [341]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(8, activation='relu'),  
  layers.Dense(1)
])

model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
              loss='mae',
              metrics=['mae'])

callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2),
    tf.keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
]
print(train_ds)
model.fit(train_ds,
          validation_data=eval_ds,
          epochs=100,
          callbacks=callbacks)

<_BatchDataset element_spec=({'years_experience': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'company_size_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'remote_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'supervisor_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'personal_projects_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'sex_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'education_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'relevant_xf': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'devtype_Gaming': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_Frontend': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_Cybersecurity': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_DevOps': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_Embedded': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'devtype_BI': TensorSpec(s



 1/17 [>.............................] - ETA: 16s - loss: 17734.1445 - mae: 17734.1445



Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<keras.callbacks.History at 0x7efec6fa0eb0>

In [345]:
input = {
    'years_experience': 1,
    'company_size_xf': '11-50',
    'education_xf': "Bachelor's",
    'relevant_xf': "Ναι",
    'personal_projects_xf': 'Ναι',
    'remote_xf': 'Και τα δύο',
    'supervisor_xf': 'Ναι',
    'sex_xf': 'Άντρας',
    'devtype_Backend': 1 ,              
    'devtype_Desktopapps': 0,         
    'devtype_DevOps': 1,            
    'devtype_AI/ML': 0 ,       
    'devtype_BI': 0 ,
    'devtype_Cybersecurity': 0,
    'devtype_Embedded': 0,
    'devtype_Gaming': 0,                     
    'devtype_unk': 0,   
    'devtype_Frontend': 1,        
    'devtype_Mobileapps': 0,           
    'languages_C': 0 ,
    'languages_SQL': 0,              
    'languages_PHP': 0 ,             
    'languages_JavaScript': 1 ,         
    'languages_Kotlin' : 0 ,
    'languages_TypeScript' : 0,              
    'languages_Python': 1 ,            
    'languages_Ruby': 0 ,               
    'languages_Bash': 0,             
    'languages_Go': 0 ,             
    'languages_Java': 0 ,            
    'languages_Swift': 0 ,            
    'languages_Csharp': 0 ,         
    'languages_Cpp': 0 , 
    'languages_unk': 0 # any language that is not used above ?                
}

input = {k: [v] for k, v in input.items()}

prediction = model(input).numpy()[0][0]
f'Βγάζεις: {prediction} ευρώ το χρόνο'



'Βγάζεις: 14720.2705078125 ευρώ το χρόνο'