<a href="https://colab.research.google.com/github/GiorgosNik/dev-salary-estimator/blob/main/salary_estimator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install -U -q PyDrive
!pip install -U -q geocoder
!pip install -U -q tqdm
!pip install -U -q tensorflow

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from collections import Counter
from geopy.geocoders import Nominatim
from tqdm import tqdm
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# CSV import from Google Drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Import and Format Data

## Data Import from Google Drive
Authenticate with GoogleAuth to retrieve the data .csv from Google Drive.
The retrieved .csv file is stored in a pandas dataframe.

In [None]:
# Authenticate and create the PyDrive client.
# auth.authenticate_user() comes from google.colab, so this cell is expected
# to be run inside a Colab runtime.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# ID of Google Drive .csv document
# NOTE(review): the name `id` shadows the Python builtin `id` for every cell
# that runs after this one.
id = "1cPsSR9XfyqOl15KqGV3BEqaxmL1R7lqq"
downloaded = drive.CreateFile({'id':id}) 
# Save the Drive file locally as data.csv, then load it into a DataFrame.
downloaded.GetContentFile('data.csv')  
dataframe = pd.read_csv('data.csv')

In [None]:
# Replace the CSV headers with short identifiers. The assignment is positional:
# the 14 names below are applied to the columns in the order they appear in the file.
dataframe.columns = ['timestamp', 'devtype', 'languages', 'years_experience','personal_projects', 'sex', 'remote','city_residence', 'city_work', 'company_size',  'supervisor', 'education', 'relevant', 'salary']

# Remove the timestamp as it is not relevant
dataframe = dataframe.drop(columns=['timestamp'])

# Silence pandas' chained-assignment (SettingWithCopy) warning for the rest of the notebook.
pd.options.mode.chained_assignment = None

# Keep the initial row count so later cells can report how many rows were dropped.
dataset_size = len(dataframe)
print("The dataset contains {} salary entries".format( dataset_size))
dataframe.head(2)

## Remove entries that contain very rare developer types or languages
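Developer types or languages that occur 5 times or fewer are considered rare and are replaced with an "unknown" placeholder. An entry is removed only when nothing but rare values remain in that column.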

In [None]:
# Placeholder that replaces values considered too rare to keep.
UNK = 'unk'

def categorize_clean_columns(df, colname, threshold):
    """Split a comma-separated column into lists and mask rare values.

    Each cell of ``df[colname]`` is expected to be a string of comma-separated
    values. Spaces are removed, the string is split on ``,`` and every value
    that occurs ``threshold`` times or fewer across the whole column is
    replaced by ``UNK``.

    Args:
        df: DataFrame holding the column. It is not modified; the work is
            done on a copy.
        colname: Name of the comma-separated string column.
        threshold: Values with an occurrence count ``<= threshold`` are
            treated as rare.

    Returns:
        A new DataFrame where ``colname`` holds lists of values, without the
        rows whose list is exactly ``[UNK]``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df[colname] = df[colname].map(lambda raw: raw.replace(' ', '').split(','))

    counts = Counter(item for items in df[colname] for item in items)
    # Iterate over distinct values (items()), not over every occurrence (elements()).
    rare_values = {item for item, count in counts.items() if count <= threshold}

    df[colname] = df[colname].map(
        lambda items: [UNK if item in rare_values else item for item in items])

    # astype(bool) keeps the mask boolean even when the frame is empty.
    is_sole_unknown = df[colname].map(lambda items: items == [UNK]).astype(bool)

    return df[~is_sole_unknown]

In [None]:
def fix_devtype(devtypes):
    """Return the developer types as a new list with every UNK placeholder removed."""
    return [devtype for devtype in devtypes if devtype != UNK]

def fix_languages(languages):
    """Normalise language names and drop UNK placeholders.

    Each name is stripped of surrounding whitespace *before* the spelling
    check, so a padded "Typescript" is also normalised to "TypeScript".
    The input list is not modified.

    Args:
        languages: List of language name strings.

    Returns:
        A new list of cleaned names without any UNK entries.
    """
    cleaned = []
    for language in languages:
        language = language.strip()
        if language == "Typescript":
            language = "TypeScript"
        cleaned.append(language)
    return [language for language in cleaned if language != UNK]

In [None]:
# Replace rare developer types / languages with the unknown placeholder and
# drop rows that consist of nothing but that placeholder.
dataframe = categorize_clean_columns(dataframe, 'devtype', 5)
dataframe = categorize_clean_columns(dataframe, 'languages', 5)

# Remove the remaining placeholders from the per-row lists.
dataframe['languages'] = dataframe['languages'].map(fix_languages)
dataframe['devtype'] = dataframe['devtype'].map(fix_devtype)

# Drop rows left with an empty list in either column. The devtype filter
# result was previously discarded, so it never took effect.
dataframe = dataframe[dataframe['languages'].map(len) > 0]
dataframe = dataframe[dataframe['devtype'].map(len) > 0]

print("After removing rare entries, the dataset contains {} entries.".format(len(dataframe)))
print("This is {} less than the initial dataser.".format(dataset_size - len(dataframe)))

## Fix Salary
Normalize the salary input. Remove €/$ signs, commas and period signs.
For values under 4000, assume the respondent entered a monthly salary by mistake, and convert it to a yearly salary by multiplying by 14 (the number of salaries paid per year).  

In [None]:
def fix_salary(salary, monthly_threshold=4000, salaries_per_year=14):
    """Convert a free-form salary string into a yearly integer amount.

    Periods, commas and the ``€`` / ``$`` signs are removed before parsing,
    so ``.`` and ``,`` are both treated as thousands separators. Amounts
    below ``monthly_threshold`` are assumed to be monthly and are multiplied
    by ``salaries_per_year``.

    Args:
        salary: Salary as a string, e.g. ``'25.000€'``.
        monthly_threshold: Amounts strictly below this are treated as monthly.
        salaries_per_year: Multiplier used to annualise a monthly amount.

    Returns:
        The yearly salary as an ``int``.

    Raises:
        ValueError: If the cleaned string is not a valid integer.
    """
    for symbol in ('.', ',', '€', '$'):
        salary = salary.replace(symbol, '')
    salary = int(salary)
    if salary < monthly_threshold:
        salary = salary * salaries_per_year

    return salary

# int(x) drops any fractional part before str(), so fix_salary receives digits
# only (a float rendered as '1500.0' would otherwise lose its '.' and be read
# as 15000).
# NOTE(review): int(x) raises on NaN and on strings that still contain currency
# symbols or separators, so the raw column is presumably numeric — confirm.
dataframe['salary'] = dataframe['salary'].map(lambda x: fix_salary(str(int(x)))).astype('int32')
dataframe.head(5)

## Consider only results from Greece
Using geopy, use the given city names to obtain the country of each response.
Reject all values not relevant to Greece due to low sample size.

In [None]:
cities_residence, cities_work = pd.unique(dataframe['city_residence'].values), pd.unique(dataframe['city_work'].values)

# NOTE(review): Nominatim's usage policy asks for an identifying user agent
# and rate-limited requests; "test" is a generic value — consider changing it.
geolocator = Nominatim(user_agent="test")

# Resolve every distinct city once and collect the results in a mapping.
# Applying the mapping in a single pass avoids re-replacing values that an
# earlier iteration already turned into a country name.
country_by_city = {}
for city in tqdm(cities_residence, desc='Formatting City Names'):
    try:
        # The country is the last comma-separated component of the address.
        country_by_city[city] = geolocator.geocode(city).address.split(",")[-1].strip()
    except AttributeError:
        # geocode() returned None: the city name could not be resolved.
        country_by_city[city] = "UNK"

dataframe['city_residence'] = dataframe['city_residence'].map(country_by_city)

# Keep only respondents whose residence resolved to Greece. The comparison
# relies on the geocoder returning the country name in Greek.
dataframe = dataframe[dataframe.city_residence == "Ελλάς"]

dataframe = dataframe.drop(columns=['city_residence', 'city_work'])
dataframe.head(2)

## Replace NA values
Replace NA values regarding relevant projects with "No"

In [None]:
# Missing answers in the 'relevant' column are treated as 'Όχι' ("No").
dataframe['relevant'] = dataframe['relevant'].fillna('Όχι')

# Define Feature Columns

In [None]:
# Single-valued categorical answers. Each one is copied into a '<name>_xf'
# column with pandas' category dtype; a later cell selects categorical
# features by that '_xf' suffix.
category_columns = ['company_size', 'remote', 'supervisor', 'personal_projects', 'sex','education','relevant']

for col in category_columns:
    dataframe[f'{col}_xf'] = dataframe[col].astype('category')

# The original columns are no longer needed once the '_xf' copies exist.
dataframe = dataframe.drop(columns=category_columns)

In [None]:
# Columns whose cells hold a list of values (multi-select answers).
multi_category_columns = ['devtype', 'languages']

def coltitle(col, word):
    """Build the indicator-column name '<col>_<word>' for one value of a multi-valued column."""
    return '{}_{}'.format(col, word)

# Multi-hot encode the list-valued columns: one int32 0/1 column per value.
for col in multi_category_columns:
    vocab = set([item for sublist in dataframe[col].values for item in sublist])
    # sorted() makes the column order deterministic across runs.
    for word in sorted(vocab):
        # Vectorised membership test instead of iterrows() + per-cell .loc writes.
        dataframe[coltitle(col, word)] = (
            dataframe[col].map(lambda words, word=word: int(word in words)).astype('int32'))
    print(vocab)

# '#' and '+' are not valid in TensorFlow scope names, so rename those columns.
bad_tf_scope_names = [('languages_C#', 'languages_Csharp'), ('languages_C++', 'languages_Cpp')]
for before, after in bad_tf_scope_names:
    # Skip silently if the language was filtered out as rare and has no column.
    if before in dataframe.columns:
        dataframe[after] = dataframe[before]
        dataframe = dataframe.drop(columns=[before])

# The list-valued source columns are fully represented by the indicator columns.
dataframe = dataframe.drop(columns=multi_category_columns)


## Split the dataset into training and validation sets

In [None]:
# Hold out 20% of the rows for validation. A fixed random_state makes the
# split (and therefore the reported metrics) reproducible across runs.
train, val = train_test_split(dataframe, test_size=0.2, random_state=42)
print(len(train), 'train examples')
print(len(val), 'validation examples')

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Turn a DataFrame into a batched tf.data.Dataset of (features, salary) pairs.

    The 'salary' column is used as the label; every other column becomes an
    entry in the features dict. The input frame is copied, not modified.
    """
    features = dataframe.copy()
    target = features.pop('salary')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if shuffle:
        # Buffer covers the whole frame, so the shuffle spans all rows.
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size)

train_ds = df_to_dataset(train, batch_size=32)
# Evaluate on the held-out validation split (previously this was built from
# `train`, so validation metrics were measured on training data). Shuffling
# is unnecessary for evaluation.
eval_ds = df_to_dataset(val, batch_size=32, shuffle=False)

In [None]:
# Peek at a single batch to collect the feature names; feature_column_keys is
# read by the feature-encoding cell below.
for feature_batch, label_batch in train_ds.take(1):
  feature_column_keys = list(feature_batch.keys())
  print('Every feature:', feature_column_keys)

## Encode features into columns

In [None]:
# Features are classified by substring match on their name: the multi-hot
# indicator columns and years_experience are numeric, '_xf' columns are categorical.
numeric_markers = ('devtype_', 'languages_', 'years_experience')
numeric_features = [key for key in feature_column_keys
                    if any(marker in key for marker in numeric_markers)]
categorical_features = [key for key in feature_column_keys if '_xf' in key]

# Numeric features are passed through unchanged.
feature_columns = [feature_column.numeric_column(name) for name in numeric_features]

# Categorical features are one-hot encoded over the values seen in the full dataframe.
for name in categorical_features:
    vocabulary = pd.unique(dataframe[name].values)
    vocabulary_column = feature_column.categorical_column_with_vocabulary_list(name, vocabulary)
    feature_columns.append(feature_column.indicator_column(vocabulary_column))

print(feature_columns)

# Train the model

In [None]:
# DenseFeatures converts the dict of raw inputs into one dense tensor using
# the feature columns built above.
# NOTE(review): DenseFeatures / feature_column may be unavailable in recent
# TensorFlow releases; the install cell does not pin a version — confirm.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Small regression network: one hidden layer of 8 ReLU units and a single
# linear output for the salary.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(8, activation='relu'),  
  layers.Dense(1)
])

# Mean absolute error is used both as the training loss and as the reported metric.
model.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
              loss='mae',
              metrics=['mae'])

callbacks = [
    # EarlyStopping is configured with patience=2; no monitor is given here.
    tf.keras.callbacks.EarlyStopping(patience=2),
    # The checkpoint file name embeds the epoch number and val_loss.
    tf.keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
]
print(train_ds)
model.fit(train_ds,
          validation_data=eval_ds,
          epochs=100,
          callbacks=callbacks)

# Try your values

In [None]:
# Example respondent. The keys are feature names: '<name>_xf' entries carry a
# category value, 'devtype_*' / 'languages_*' entries are 0/1 indicators.
# NOTE(review): the indicator keys are hard-coded and presumably must match the
# columns produced by the encoding cell for the current data — confirm.
# NOTE(review): the name `input` shadows the Python builtin `input`.
input = {
    'years_experience': 1,
    'company_size_xf': '11-50',
    'education_xf': "Bachelor's",
    'relevant_xf': "Ναι",
    'personal_projects_xf': 'Ναι',
    'remote_xf': 'Και τα δύο',
    'supervisor_xf': 'Ναι',
    'sex_xf': 'Άντρας',
    'devtype_Backend': 1 ,              
    'devtype_Desktopapps': 0,         
    'devtype_DevOps': 1,            
    'devtype_AI/ML': 0 ,       
    'devtype_BI': 0 ,
    'devtype_Cybersecurity': 0,
    'devtype_Embedded': 0,
    'devtype_Gaming': 0,                     
    'devtype_Frontend': 1,        
    'devtype_Mobileapps': 0,           
    'languages_C': 0 ,
    'languages_SQL': 0,              
    'languages_PHP': 0 ,             
    'languages_JavaScript': 1 ,         
    'languages_Kotlin' : 0 ,
    'languages_TypeScript' : 0,              
    'languages_Python': 1 ,            
    'languages_Ruby': 0 ,               
    'languages_Bash': 0,             
    'languages_Go': 0 ,             
    'languages_Java': 0 ,            
    'languages_Swift': 0 ,            
    'languages_Csharp': 0 ,         
    'languages_Cpp': 0 , 
}

# Wrap every value in a one-element list so the model receives a batch of size 1.
input = {k: [v] for k, v in input.items()}

# The model output is indexed [0][0]: first (only) example, single output unit.
prediction = model(input).numpy()[0][0]
f'Βγάζεις: {prediction} ευρώ το χρόνο'