<a href="https://colab.research.google.com/github/GiorgosNik/dev-salary-estimator/blob/main/salary_estimator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [166]:
!pip install -U -q PyDrive
!pip install -U -q geocoder
!pip install -U -q tqdm

In [167]:
import tensorflow as tf
import pandas as pd
import numpy as np
from collections import Counter
from geopy.geocoders import Nominatim
from tqdm import tqdm

# CSV import from Google Drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Import and Format Data

## Data Import from Google Drive
Authenticate with GoogleAuth to retrieve the data .csv from Google Drive.
The retrieved .csv file is stored in a pandas dataframe.

In [168]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# ID of Google Drive .csv document
id = "1cPsSR9XfyqOl15KqGV3BEqaxmL1R7lqq"
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('Filename.csv')  
dataframe = pd.read_csv('Filename.csv')

In [169]:
dataframe.columns = ['timestamp', 'devtype', 'languages', 'years_experience','personal_projects', 'sex', 'remote','city_residence', 'city_work', 'company_size',  'supervisor', 'education', 'relevant', 'salary']

# Remove the timestamp as it is irelevant
dataframe = dataframe.drop(columns=['timestamp'])

print("The dataset contains {} salary entries".format( len(dataframe)))
dataframe.head(2)

The dataset contains 807 salary entries


Unnamed: 0,devtype,languages,years_experience,personal_projects,sex,remote,city_residence,city_work,company_size,supervisor,education,relevant,salary
0,"Backend, Frontend","JavaScript, PHP",13,Ναι,Άντρας,Και τα δύο,Αθήνα,Αθήνα,501+,Όχι,Bachelor's,Ναι,39361.0
1,"Backend, Frontend",JavaScript,7,Ναι,Άντρας,Απομακρυσμένα,Αθήνα,Αθήνα,201 - 500,Όχι,Bachelor's,Ναι,30142.0


## Remove entries that contain very rare developer types
Entries that occur under 5 times are considered rare.

In [170]:
UNK = 'unk'

def categorize_clean_columns(df, colname, threshold):
    df[colname] = df[colname].map(lambda x: x.replace(' ', '').split(','))
    c = Counter([item for sublist in df[colname].values for item in sublist])

    map_to_unk = set([el for el in c.elements() if c[el] <= threshold])

    def map_devtype_unk(arr):
      def x_or_unk(x):
        if x in map_to_unk:
          return UNK
        return x

      return [x_or_unk(x) for x in arr]

    df[colname] = df[colname].map(lambda x: map_devtype_unk(x))

    def is_sole_uknown(arr):
      return arr[0] == UNK and len(arr) == 1

    return df[df[colname].map(is_sole_uknown) == False]

In [171]:
dataframe = categorize_clean_columns(dataframe, 'devtype', 5)
dataframe = categorize_clean_columns(dataframe, 'languages', 5)

len(dataframe) # ~ 20 rows with few entries

738

In [172]:
def fix_salary(salary):
    salary = salary.replace('.', '')
    salary = salary.replace(',', '')
    salary = salary.replace('€', '')
    salary = int(salary)
    if salary < 4000:
      return salary * 14
    
    return salary

dataframe['salary'] = dataframe['salary'].map(lambda x: fix_salary(str(x))).astype('float32')
dataframe.head(2)

Unnamed: 0,devtype,languages,years_experience,personal_projects,sex,remote,city_residence,city_work,company_size,supervisor,education,relevant,salary
0,"[Backend, Frontend]","[JavaScript, PHP]",13,Ναι,Άντρας,Και τα δύο,Αθήνα,Αθήνα,501+,Όχι,Bachelor's,Ναι,393610.0
1,"[Backend, Frontend]",[JavaScript],7,Ναι,Άντρας,Απομακρυσμένα,Αθήνα,Αθήνα,201 - 500,Όχι,Bachelor's,Ναι,301420.0


In [173]:
cities_residence, cities_work = pd.unique(dataframe['city_residence'].values), pd.unique(dataframe['city_work'].values)

In [175]:
geolocator = Nominatim(user_agent="test")


with tqdm(total=len(cities_residence), desc='Formatting City Names') as city_progressbar:
  for city in cities_residence:
    try:
      area = geolocator.geocode(city).address.split(",")[0]
      dataframe['city_residence'] = dataframe['city_residence'].replace(city,area)

    except AttributeError:
      dataframe['city_residence'] = dataframe['city_residence'].replace(city,"UNK")
    city_progressbar.update(1)

dataframe[dataframe.years_experience != "UNK"]
print("##### Unique Cities #####")
pd.unique(dataframe['city_residence'].values)

Formatting City Names: 100%|██████████| 83/83 [00:41<00:00,  1.99it/s]

##### Unique Cities #####





array(['Αθήνα', 'Κέρκυρα', 'City of Edinburgh', 'Θεσσαλονίκη', 'Σέρρες',
       'Μαυρομμάτι', 'Μύρινα', 'Berlin', 'Πτολεμαΐδα', 'Serres',
       'Χαλκίδα', 'Χανιά', 'Κομοτηνή', 'Ιωάννινα', 'Ναύπλιο', 'Bolo',
       'Bratislava', 'Δήμος Βόλου', 'Ξάνθη', 'Δήμος Τρικκαίων',
       'Amsterdam', 'Κοζάνη', 'München', 'Δήμος Πατρέων', 'Graz',
       'Italia', 'Reading', 'Λευκωσία - Lefkoşa', 'کابل', 'Stockholm',
       'Utrecht', 'Oxford', 'Δήμος Καλαμάτας', 'Αγρίνιο', 'Κεφαλονιά',
       'Ηράκλειο', 'Genève', 'Ρόδος', 'Brno', 'København', 'Αμαλιάδα',
       'Πρέβεζα', 'London', 'Λεμεσός', 'LocalHost', 'Δράμα', 'Glasgow',
       'Σχηματάρι', 'Αρχαία Πίσα', 'Zürich', 'Λευκάδα', 'Άρτα', 'Λάρισα',
       'Wien', 'Λαμία', 'Δήμος Λάρνακας', 'Ζευγολατιό', 'Leeds', 'UNK',
       'استان کردستان', 'Bologna', '贵州省', 'Ρέθυμνο', 'Κύπρος - Kıbrıs',
       'Göteborg', 'Ηγουμενίτσα', 'Hlavní město Praha', 'Dublin',
       'Καστοριά', 'Tiranë', 'Αρεόπολη'], dtype=object)