In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("survey_results_public.csv")

In [3]:
df.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [4]:
df = df[["Country", "DevType", "EdLevel", "YearsCodePro", "Employment", "ConvertedComp"]]
df = df.rename({"ConvertedComp": "Salary"}, axis=1)
df.head()

Unnamed: 0,Country,DevType,EdLevel,YearsCodePro,Employment,Salary
0,Germany,"Developer, desktop or enterprise applications;...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",27.0,"Independent contractor, freelancer, or self-em...",
1,United Kingdom,"Developer, full-stack;Developer, mobile","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",4.0,Employed full-time,
2,Russian Federation,,,,,
3,Albania,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",4.0,,
4,United States,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",8.0,Employed full-time,


In [5]:
df = df[df['Salary'].notnull()]
df.head()

Unnamed: 0,Country,DevType,EdLevel,YearsCodePro,Employment,Salary
7,United States,"Developer, back-end;Developer, desktop or ente...","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",13,Employed full-time,116000.0
9,United Kingdom,"Database administrator;Developer, full-stack;D...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",4,Employed full-time,32315.0
10,United Kingdom,"Developer, back-end;Developer, desktop or ente...","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",2,Employed full-time,40070.0
11,Spain,"Designer;Developer, front-end",Some college/university study without earning ...,7,Employed full-time,14268.0
12,Netherlands,"Designer;Developer, back-end","Secondary school (e.g. American high school, G...",20,Employed full-time,38916.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34756 entries, 7 to 64154
Data columns (total 6 columns):
Country         34756 non-null object
DevType         34367 non-null object
EdLevel         34188 non-null object
YearsCodePro    34621 non-null object
Employment      34717 non-null object
Salary          34756 non-null float64
dtypes: float64(1), object(5)
memory usage: 1.9+ MB


In [7]:
df = df.dropna()
df.isnull().sum()

Country         0
DevType         0
EdLevel         0
YearsCodePro    0
Employment      0
Salary          0
dtype: int64

In [8]:
df['Employment'].unique()

array(['Employed full-time',
       'Independent contractor, freelancer, or self-employed',
       'Employed part-time'], dtype=object)

In [9]:
df['Employment'] = df['Employment'].str.split(";")
df['Employment'] = df['Employment'].str[0]

In [10]:
df['Employment'].unique()

array(['Employed full-time',
       'Independent contractor, freelancer, or self-employed',
       'Employed part-time'], dtype=object)

In [11]:
df = df[df['Employment'] == 'Employed, full-time']
df = df.drop('Employment', axis = 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 5 columns):
Country         0 non-null object
DevType         0 non-null object
EdLevel         0 non-null object
YearsCodePro    0 non-null object
Salary          0 non-null float64
dtypes: float64(1), object(4)
memory usage: 0.0+ bytes


In [12]:
df['DevType'].unique()

array([], dtype=object)

In [13]:
len(df['DevType'].unique())

0

In [14]:
df['DevType'] = df['DevType'].str.split(";")
df['DevType'] = df['DevType'].str[0]


In [15]:
len(df['DevType'].unique())

0

In [16]:
df['DevType'].value_counts()

Series([], Name: DevType, dtype: int64)

In [17]:
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)):
        if categories.values[i] >= cutoff:
            categorical_map[categories.index[i]] = categories.index[i]
        else:
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map

In [18]:
dev_map = shorten_categories(df['DevType'].value_counts(), 100)
df['DevType']  = df['DevType'].map(dev_map)
df['DevType'].value_counts()

Series([], Name: DevType, dtype: int64)

In [19]:
country_map = shorten_categories(df['Country'].value_counts(), 300)
df['Country']  = df['Country'].map(country_map)
df['Country'].value_counts()

Series([], Name: Country, dtype: int64)