In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [5]:
filepath = "data/IT_Salary_Survey_EU_2018.csv"
df = pd.read_csv(filepath)

In [6]:
# We can see all the information given with the .info()
df.info()
# or
# print(df.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 765 entries, 0 to 764
Data columns (total 14 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Timestamp                           765 non-null    object 
 1   Age                                 672 non-null    float64
 2   Gender                              751 non-null    object 
 3   City                                736 non-null    object 
 4   Position                            737 non-null    object 
 5   Years of experience                 732 non-null    float64
 6   Your level                          743 non-null    object 
 7   Current Salary                      750 non-null    float64
 8   Salary one year ago                 596 non-null    float64
 9   Salary two years ago                463 non-null    float64
 10  Are you getting any Stock Options?  742 non-null    object 
 11  Main language at work               750 non-n

In [7]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None) 
pd.set_option('display.expand_frame_repr', False)
df.head()
# The data type of the Compagny size column is not correct (hard to work with).
# The data type of the Zeitstempel column is not correct (hard to work with).
# a last value "0" is useless and undefined.

# many data are fulled with NaN, which mean that the Data is missing

Unnamed: 0,Timestamp,Age,Gender,City,Position,Years of experience,Your level,Current Salary,Salary one year ago,Salary two years ago,Are you getting any Stock Options?,Main language at work,Company size,Company type
0,14/12/2018 12:41:33,43.0,M,München,QA Ingenieur,11.0,Senior,77000.0,76200.0,68000.0,No,Deutsch,100-1000,Product
1,14/12/2018 12:42:09,33.0,F,München,Senior PHP Magento developer,8.0,Senior,65000.0,55000.0,55000.0,No,Deutsch,50-100,Product
2,14/12/2018 12:47:36,32.0,M,München,Software Engineer,10.0,Senior,88000.0,73000.0,54000.0,No,Deutsch,1000+,Product
3,14/12/2018 12:50:15,25.0,M,München,Senior Frontend Developer,6.0,Senior,78000.0,55000.0,45000.0,Yes,English,1000+,Product
4,14/12/2018 12:50:31,39.0,M,München,UX Designer,10.0,Senior,69000.0,60000.0,52000.0,No,English,100-1000,Ecom retailer


In [8]:
df.describe() 

Unnamed: 0,Age,Years of experience,Current Salary,Salary one year ago,Salary two years ago
count,672.0,732.0,750.0,596.0,463.0
mean,32.183036,8.548497,68381.765333,62187.278523,58013.475162
std,5.107268,4.729557,21196.306557,20163.008663,20413.048908
min,21.0,0.0,10300.0,10001.0,10001.0
25%,29.0,5.0,57000.0,52000.0,48000.0
50%,32.0,8.0,65000.0,60000.0,56000.0
75%,35.0,11.0,75000.0,70000.0,67000.0
max,60.0,38.0,200000.0,200000.0,150000.0


In [9]:
df_corrected = df.select_dtypes(include=['int', 'float'])
df_corrected = df_corrected.fillna(0)
df_corrected['Years of experience'] = df_corrected['Years of experience'].astype(float)
df_corrected.corr()

# the highest correlation is the Yearly Bonus 
# the yearly stocks has the lowest correlation

      Age  Years of experience  Current Salary  Salary one year ago  Salary two years ago
0    43.0                 11.0         77000.0              76200.0               68000.0
1    33.0                  8.0         65000.0              55000.0               55000.0
2    32.0                 10.0         88000.0              73000.0               54000.0
3    25.0                  6.0         78000.0              55000.0               45000.0
4    39.0                 10.0         69000.0              60000.0               52000.0
..    ...                  ...             ...                  ...                   ...
760  40.0                  1.0         44000.0              40000.0               40000.0
761   0.0                  1.0         45000.0              40000.0               40000.0
762   0.0                  1.0         45000.0              40000.0               40000.0
763   0.0                  0.0             0.0                  0.0                   0.0
764  31.0 

Unnamed: 0,Age,Years of experience,Current Salary,Salary one year ago,Salary two years ago
Age,1.0,0.352027,0.237973,0.199427,0.173775
Years of experience,0.352027,1.0,0.426481,0.355029,0.370178
Current Salary,0.237973,0.426481,1.0,0.613239,0.566807
Salary one year ago,0.199427,0.355029,0.613239,1.0,0.768826
Salary two years ago,0.173775,0.370178,0.566807,0.768826,1.0


In [10]:


def calculate_accuracy(X, y, model):
    prediction = model.predict(X)
    accuracy = round(100 * accuracy_score(y, prediction), 2)

    return accuracy

In [11]:
# we tried 5 different algorithms, we can now try to preprocesse the data
factor = 3
upper_lim = df_corrected.mean () + df_corrected.std () * factor
lower_lim = df_corrected.mean () - df_corrected.std () * factor

data = df_corrected[(df_corrected < upper_lim) & (df_corrected > lower_lim)]
print(df_corrected)



      Age  Years of experience  Current Salary  Salary one year ago  Salary two years ago
0    43.0                 11.0         77000.0              76200.0               68000.0
1    33.0                  8.0         65000.0              55000.0               55000.0
2    32.0                 10.0         88000.0              73000.0               54000.0
3    25.0                  6.0         78000.0              55000.0               45000.0
4    39.0                 10.0         69000.0              60000.0               52000.0
..    ...                  ...             ...                  ...                   ...
760  40.0                  1.0         44000.0              40000.0               40000.0
761   0.0                  1.0         45000.0              40000.0               40000.0
762   0.0                  1.0         45000.0              40000.0               40000.0
763   0.0                  0.0             0.0                  0.0                   0.0
764  31.0 

In [12]:

df_x = df_corrected.drop(df_corrected.columns[2], axis=1)
X, y = df_x.iloc[:,:], df_corrected.iloc[:, 3]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

models = {'KNeighborsClassifier': KNeighborsClassifier(),
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'SVC': SVC(),
              'RandomForestClassifier': RandomForestClassifier()}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    acc_train = calculate_accuracy(X_train, y_train, model)
    acc_test = calculate_accuracy(X_test, y_test, model)
    print(f"The in-sample accuracy of {model_name} is {acc_train} "
        f"and its estimated out-of-sample accuracy {acc_test}.")


The in-sample accuracy of KNeighborsClassifier is 73.2 and its estimated out-of-sample accuracy 67.97.
The in-sample accuracy of DecisionTreeClassifier is 100.0 and its estimated out-of-sample accuracy 87.58.
The in-sample accuracy of SVC is 35.46 and its estimated out-of-sample accuracy 33.99.
The in-sample accuracy of RandomForestClassifier is 100.0 and its estimated out-of-sample accuracy 73.2.


In [13]:
# we can also try to change the hyper-parameter in train_test_split
df_x = df_corrected.drop(df_corrected.columns[2], axis=1)
X, y = df_x.iloc[:,:], df_corrected.iloc[:, 3]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

models = {'KNeighborsClassifier': KNeighborsClassifier(),
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'SVC': SVC(),
              'RandomForestClassifier': RandomForestClassifier()}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    acc_train = calculate_accuracy(X_train, y_train, model)
    acc_test = calculate_accuracy(X_test, y_test, model)
    print(f"The in-sample accuracy of {model_name} is {acc_train} "
        f"and its estimated out-of-sample accuracy {acc_test}.")

The in-sample accuracy of KNeighborsClassifier is 73.84 and its estimated out-of-sample accuracy 74.03.
The in-sample accuracy of DecisionTreeClassifier is 100.0 and its estimated out-of-sample accuracy 85.71.
The in-sample accuracy of SVC is 35.47 and its estimated out-of-sample accuracy 45.45.
The in-sample accuracy of RandomForestClassifier is 100.0 and its estimated out-of-sample accuracy 80.52.


In [14]:
# reloading the data for the next exemple
filepath = "data/IT_Salary_Survey_EU_2018.csv"
df = pd.read_csv(filepath)
df_corrected = df.select_dtypes(include=['int', 'float'])
df_corrected = df_corrected.fillna(0)
df_corrected['Years of experience'] = df_corrected['Years of experience'].astype(float)


In [15]:
# we can add the data from 2019 and 2020 to train the algorithm
csv_file = ("data/IT_Salary_Survey_EU_2018.csv","data/IT_Salary_Survey_EU_2020.csv")
df_training = pd.DataFrame()
df_concat = pd.concat([pd.read_csv(f) for f in csv_file ], ignore_index=True)
# formating the Data to fill 
df_concat = df.select_dtypes(include=['int', 'float'])
df_concat = df_concat.fillna(0)
df_concat['Years of experience'] = df_corrected['Years of experience'].astype(float)
# getting X and y for the training
X_train = df_concat.drop(df_concat.columns[2], axis=1)
y_train =  df_concat.iloc[:, 3]
# getting x and y for the test
X_test = df_corrected.drop(df_corrected.columns[2], axis=1)
y_test = df_corrected.iloc[:, 3]

models = {'KNeighborsClassifier': KNeighborsClassifier(),
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'SVC': SVC(),
              'RandomForestClassifier': RandomForestClassifier()}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    acc_train = calculate_accuracy(X_train, y_train, model)
    acc_test = calculate_accuracy(X_test, y_test, model)
    print(f"The in-sample accuracy of {model_name} is {acc_train} "
        f"and its estimated out-of-sample accuracy {acc_test}.")

The in-sample accuracy of KNeighborsClassifier is 75.69 and its estimated out-of-sample accuracy 75.69.
The in-sample accuracy of DecisionTreeClassifier is 100.0 and its estimated out-of-sample accuracy 100.0.
The in-sample accuracy of SVC is 36.99 and its estimated out-of-sample accuracy 36.99.
The in-sample accuracy of RandomForestClassifier is 100.0 and its estimated out-of-sample accuracy 100.0.
