In [52]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA

In [5]:
filepath = "data/IT_Salary_Survey_EU_2018.csv"
df = pd.read_csv(filepath)

In [None]:
# We can see all the information given with the .info()
df.info()
# or
# print(df.dtypes)

In [None]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None) 
pd.set_option('display.expand_frame_repr', False)
df.head()
# The data type of the Compagny size column is not correct (hard to work with).
# The data type of the Zeitstempel column is not correct (hard to work with).
# a last value "0" is useless and undefined.

# many data are fulled with NaN, which mean that the Data is missing

In [None]:
df.describe() 

In [None]:
df_corrected = df.select_dtypes(include=['int', 'float'])
df_corrected = df_corrected.fillna(0)
df_corrected['Years of experience'] = df_corrected['Years of experience'].astype(float)
df_corrected.corr()

# the highest correlation is the Yearly Bonus 
# the yearly stocks has the lowest correlation

In [58]:
def calculate_mse(X, y, model):
    prediction = model.predict(X)
    mse = np.round(np.mean((prediction - y) ** 2))

    return mse


def calculate_accuracy(X, y, model):
    prediction = model.predict(X)
    accuracy = round(100 * accuracy_score(y, prediction), 2)

    return accuracy

In [None]:
models = {'KNeighborsClassifier': KNeighborsClassifier(),
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'SVC': SVC(),
              'RandomForestClassifier': RandomForestClassifier()}

filepath = "data/IT_Salary_Survey_EU_2018.csv"
df = pd.read_csv(filepath)
df_corrected = df.select_dtypes(include=['int', 'float'])
df_corrected = df_corrected.fillna(0)
df_corrected['Years of experience'] = df_corrected['Years of experience'].astype(float)
n_folds = 5
df_x = df_corrected.drop(df_corrected.columns[2], axis=1)
X, y = df_x.iloc[:,:], df_corrected.iloc[:, 3]

X_folds = [X.iloc[k::n_folds] for k in range(n_folds)]
y_folds = [y.iloc[k::n_folds] for k in range(n_folds)]


for model_name, model in models.items():
    mean_squared_errors = []
    for idx in range(n_folds):
        X_test, y_test = X_folds[idx], y_folds[idx]
        X_train = pd.concat(X_folds[:idx] + X_folds[idx+1:])
        y_train = pd.concat(y_folds[:idx] + y_folds[idx+1:])

        model.fit(X_train, y_train)
        mse = calculate_mse(X_test, y_test, model)
        mean_squared_errors.append(mse)

        avg_mse = np.round(np.mean(mean_squared_errors))

    print(f"{model_name} - MSE (CV): {avg_mse}.")


In [60]:
# reloading the data for the next exemple
filepath = "data/IT_Salary_Survey_EU_2018.csv"
df = pd.read_csv(filepath)
df_corrected = df.select_dtypes(include=['int', 'float'])
df_corrected = df_corrected.fillna(0)
df_corrected['Years of experience'] = df_corrected['Years of experience'].astype(float)

# we can add the data from 2019 and 2020 to train the algorithm
csv_file = ("data/IT_Salary_Survey_EU_2019.csv","data/IT_Salary_Survey_EU_2020.csv")
df_training = pd.DataFrame()
df_concat = pd.concat([pd.read_csv(f) for f in csv_file ], ignore_index=True)
# formating the Data to fill 
df_concat = df.select_dtypes(include=['int', 'float'])
df_concat = df_concat.fillna(0)
df_concat['Years of experience'] = df_corrected['Years of experience'].astype(float)
# getting X and y for the training
X_train = df_concat.drop(df_concat.columns[2], axis=1)
y_train =  df_concat.iloc[:, 3]
# getting x and y for the test
X_test = df_corrected.drop(df_corrected.columns[2], axis=1)
y_test = df_corrected.iloc[:, 3]

models = {'KNeighborsClassifier': KNeighborsClassifier(),
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'SVC': SVC(),
              'RandomForestClassifier': RandomForestClassifier()}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    mean_squared_errors = - cross_val_score(
        model, X, y, cv=5, scoring='neg_mean_squared_error')
    avg_mse = np.round(np.mean(mean_squared_errors), 2)
    acc_train = calculate_accuracy(X_train, y_train, model)
    acc_test = calculate_accuracy(X_test, y_test, model)
    mse = calculate_mse(X_test, y_test, model)
    print(f"The in-sample accuracy of {model_name} is {acc_train} "
        f"and its estimated out-of-sample accuracy {acc_test}")
    print(f"{model_name} - MSE (CV): {avg_mse}.")




The in-sample accuracy of KNeighborsClassifier is 75.69 and its estimated out-of-sample accuracy 75.69
KNeighborsClassifier - MSE (CV): 13742849.85.
The in-sample accuracy of DecisionTreeClassifier is 100.0 and its estimated out-of-sample accuracy 100.0
DecisionTreeClassifier - MSE (CV): 7221278.45.




The in-sample accuracy of SVC is 36.99 and its estimated out-of-sample accuracy 36.99
SVC - MSE (CV): 75957571.33.




The in-sample accuracy of RandomForestClassifier is 100.0 and its estimated out-of-sample accuracy 100.0
RandomForestClassifier - MSE (CV): 11328468.2.


In [55]:
models = {'KNeighborsClassifier': KNeighborsClassifier(),
              'DecisionTreeClassifier': DecisionTreeClassifier(),
              'SVC': SVC(),
              'RandomForestClassifier': RandomForestClassifier()}

filepath = "data/IT_Salary_Survey_EU_2018.csv"
df = pd.read_csv(filepath)
df_corrected = df.select_dtypes(include=['int', 'float'])
df_corrected = df_corrected.fillna(0)
df_corrected['Years of experience'] = df_corrected['Years of experience'].astype(float)
n_folds = 5
df_x = df_corrected.drop(df_corrected.columns[2], axis=1)
X, y = df_x.iloc[:,:], df_corrected.iloc[:, 3]

X = X.to_numpy()
mean = np.mean(X, axis=0, keepdims=True)
X_centered = X - mean
pca = PCA(n_components=2)
pca.fit(X_centered)
X_projected = pca.transform(X_centered)


for model_name, model in models.items():
    mean_squared_errors = - cross_val_score(
        model, X_projected, y, cv=5, scoring='neg_mean_squared_error')
    avg_mse = np.round(np.mean(mean_squared_errors), 2)


    print(f"{model_name} - MSE (CV): {avg_mse}.")




KNeighborsClassifier - MSE (CV): 13743751.81.
DecisionTreeClassifier - MSE (CV): 15009552.1.
SVC - MSE (CV): 59609793.56.




RandomForestClassifier - MSE (CV): 21110774.32.
