# Salary Prediction Based on Job Descriptions


In [None]:
'''
 This script uses salary data from a train set, builds and evaluates several
 predictive models, and uses the best model to predict salary on test data.
'''
__author__ = "Mahsa Shokouhi"
__email__ = "mahsa_shokouhi@yahoo.com"

## Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures


%matplotlib inline

##  Define Functions

In [None]:
def clean_data(df):
    '''
    Strip the identifier columns from df, keeping one row per jobId.
    The frame is modified in place and nothing is returned.
    '''
    df.drop(columns='companyId', inplace=True)
    # Duplicates are detected on jobId, so jobId can only be dropped afterwards
    df.drop_duplicates(subset=['jobId'], keep='first', inplace=True)
    df.drop(columns='jobId', inplace=True)


def valid_salary(df):
    ''' Return the rows of df whose salary is strictly greater than zero '''
    has_positive_salary = df['salary'].gt(0)
    return df.loc[has_positive_salary]


def encode_categoricals(features_df, le=True):
    '''
    Encode Categorical Features. Use label-encoding for tree-based models,
    and one-hot encoding for linear models.

    Parameters:
        features_df: DataFrame of features; columns of dtype object are
            treated as categorical, all other columns are passed through.
        le: if True, label-encode each categorical column; if False,
            one-hot encode them (the first level of each column is dropped).

    Returns:
        A new DataFrame with the non-object columns first, followed by the
        encoded categorical columns. Each input column contributes exactly
        once to the result.
    '''
    numeric_features = features_df.select_dtypes(exclude=['object'])
    categorical_features = features_df.select_dtypes(include=['object'])

    if le:  # label encoding
        encoder = preprocessing.LabelEncoder()
        # apply() calls fit_transform once per column, re-fitting each time
        categorical_features = categorical_features.apply(
            encoder.fit_transform)
    elif categorical_features.shape[1] > 0:  # one-hot encoding
        # Encode the categorical columns only: get_dummies passes non-object
        # columns through unchanged, so feeding it the full frame would make
        # every numeric column appear twice after the concat below.
        categorical_features = pd.get_dummies(categorical_features,
                                              drop_first=True)

    return pd.concat([numeric_features, categorical_features], axis=1)


def EDA_Visualize(df):
    '''
    Create plots for Exploratory Data Analysis: histograms, a scatter of
    salary against yearsExperience coloured by jobType, and bar plots of
    salary per jobType split by industry and by major.
    '''
    # Font sizes shared by all plots
    plt.rcParams.update({
        'axes.labelsize': 16,
        'xtick.labelsize': 14,
        'ytick.labelsize': 14,
    })

    # Histograms, laid out as one row of three panels
    df.hist(figsize=(12, 3), layout=(1, 3), bins=20)
    plt.suptitle('Distribution of numerical variables', y=1.2, fontsize=20)

    # All remaining plots use the rows ordered from highest to lowest salary
    by_salary = df.sort_values(['salary'], ascending=False)

    # Scatter only (fit_reg=False): salary vs years' experience per job type
    sns.lmplot(x='yearsExperience', y='salary', data=by_salary,
               fit_reg=False, hue='jobType', aspect=0.8, height=10,
               palette="Set1")
    plt.title('Changes in salary with years of experience for each job type',
              y=1.2, fontsize=20)

    # Salary by job type, once split by industry and once split by major
    bar_plots = (('industry', 'Salary for each industry'),
                 ('major', 'Salary for each major'))
    for hue_column, plot_title in bar_plots:
        plt.figure(figsize=(15, 5))
        sns.barplot(x='jobType', y='salary', hue=hue_column, data=by_salary)
        # Anchor the legend outside the axes, to the right of the plot
        plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
        plt.title(plot_title, y=1.2, fontsize=20)


def linear_models_():
    '''
    Create linear models.

    Returns a tuple (lr, lr_interaction) of unfitted pipelines. Both scale
    the inputs, apply PCA and fit a LinearRegression; the second one adds
    interaction-only polynomial features between the scaling and PCA steps.
    '''
    plain_steps = [StandardScaler(), PCA(), LinearRegression()]
    interaction_steps = [StandardScaler(),
                         PolynomialFeatures(interaction_only=True),
                         PCA(),
                         LinearRegression()]
    return make_pipeline(*plain_steps), make_pipeline(*interaction_steps)


def tree_models_():
    '''
    Create tree-based models.

    Returns a tuple (rf, gb): an unfitted random forest and an unfitted
    gradient boosting regressor, each configured with 200 estimators.
    '''
    n_trees = 200
    forest = RandomForestRegressor(n_estimators=n_trees, max_depth=15)
    boosting = GradientBoostingRegressor(n_estimators=n_trees, max_depth=5)
    return forest, boosting


def validate_models(x_train, y_train, models, k_cv=5,
                    score='neg_mean_squared_error'):
    '''
    Cross-validation for evaluating and comparing models.

    Runs cross_val_score with k_cv folds and the given scoring on every
    model, then returns two lists with one entry per model:
    the sign-flipped mean of the fold scores, and their standard deviation.
    '''
    fold_scores = [
        cross_val_score(estimator, x_train, y_train, cv=k_cv, scoring=score)
        for estimator in models
    ]
    # The scores are multiplied by -1 so the default negated-MSE scoring
    # comes out as a positive error
    mse_mean = [-1.0 * scores.mean() for scores in fold_scores]
    mse_stdev = [scores.std() for scores in fold_scores]
    return mse_mean, mse_stdev


def validation_summary(model, mse_mean, mse_stdev):
    '''
    Print the model followed by its mean squared error statistics,
    both formatted to two decimal places.
    '''
    stats_line = 'Mean Squared Error: Average = {:.2f} , Standard deviation = {:.2f}'.format(
        mse_mean, mse_stdev)
    # end='\n\n' leaves one blank line between the model and its statistics
    print('Model:\n', model, end='\n\n')
    print(stats_line, '\n\n')

## Load the datasets

In [None]:
# Feature tables for the train and test sets, plus the train-set salaries
train_features = pd.read_csv('data/train_features.csv', header=0)
test_features = pd.read_csv('data/test_features.csv')
target = pd.read_csv('data/train_salaries.csv')

# Attach each salary to its feature row; the inner join keeps only the
# jobIds that appear in both tables
train = train_features.merge(target, how='inner', on='jobId')

## Clean the data

In [None]:
# Both sets get the same in-place clean-up
for dataset in (train, test_features):
    clean_data(dataset)
# Only the train set carries a salary column to validate
train = valid_salary(train)

print(train.shape, test_features.shape)

## Exploratory Data Analysis

In [None]:
# Draw the exploratory plots for the cleaned train set
EDA_Visualize(train)

In [None]:
# Summary statistics: object-typed columns first, then the default describe()
categorical_summary = train.select_dtypes(include=['object']).describe()
overall_summary = train.describe()
print(categorical_summary)
print('\n')
print(overall_summary)

## Establish a baseline
#### Predict each salary as the average salary for its industry and jobType

In [None]:
# Baseline: every row is predicted to earn the mean salary of its
# (industry, jobType) group
BL_model = train.copy()
salary_by_group = BL_model.groupby(['industry', 'jobType'])['salary']
# transform('mean') broadcasts each group's mean back onto its rows
BL_model['predictedSalary'] = salary_by_group.transform('mean')

print(BL_model.head())

mse_BL_model = mean_squared_error(y_true=BL_model['salary'],
                                  y_pred=BL_model['predictedSalary'])
print('\n\nThe mean squared error of predicted salary  is: ', mse_BL_model)

## Encode Categorical Variables

In [None]:
# Model inputs: every train column except the target
features = train.drop(columns=['salary'])

# Tree-based methods use the label-encoded features
train_tree = encode_categoricals(features, le=True)
# Linear models use the one-hot encoded features
train_linear = encode_categoricals(features, le=False)

In [None]:
# Correlation plot of the one-hot encoded features
correlations = train_linear.corr()
column_labels = train_linear.columns

plt.figure(figsize=(15, 10))
sns.heatmap(correlations, xticklabels=column_labels,
            yticklabels=column_labels, cmap="RdBu_r")
plt.title('Correlation between variables', fontsize=20)

## Create and Validate Models

In [None]:
# Linear models are cross-validated on the one-hot encoded features
lm_models = linear_models_()
mse_mean_lr, mse_stdev_lr = validate_models(
    x_train=train_linear, y_train=train['salary'], models=lm_models)

# Tree-based models are cross-validated on the label-encoded features
tree_models = tree_models_()
mse_mean_tree, mse_stdev_tree = validate_models(
    x_train=train_tree, y_train=train['salary'], models=tree_models)

In [None]:
# Linear results come first and tree-based results second in all three
# sequences, so entries at the same position belong to the same model
models = lm_models + tree_models
mse_means = mse_mean_lr + mse_mean_tree
mse_stdevs = mse_stdev_lr + mse_stdev_tree

for model, mean_mse, stdev_mse in zip(models, mse_means, mse_stdevs):
    validation_summary(model, mean_mse, stdev_mse)

## Select the Best Model and Predict on Test Set


In [None]:
# mse_means and models are built in the same order, so the position of the
# smallest mean error is also the position of the corresponding model
idx = np.argmin(mse_means)  # index of the model with minimum mse
models[idx]  # Best model

In [None]:
# Use gradient boosting for test set (with similar results to the best model)
# models[-1] is the last entry returned by tree_models_()
selected_model = models[-1]

# NOTE(review): encode_categoricals fits a new LabelEncoder on whatever frame
# it is given, so the codes produced here match train_tree only if
# test_features holds the same category values as the train set -- confirm.
test = encode_categoricals(test_features, le=True)  # for tree-based model
selected_model.fit(train_tree, train.salary)

predictions = selected_model.predict(test)

In [None]:
# Feature importances of the fitted model, labelled with the feature names
importances = selected_model.feature_importances_

feature_importance_df = pd.DataFrame(
    {'Feature Importance': importances}, index=test.columns)

# Most important features first
feature_importance_df = feature_importance_df.sort_values(
    by='Feature Importance', ascending=False)

# Horizontal bar plot, one bar per feature
sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Feature Importance', y=feature_importance_df.index,
            data=feature_importance_df, color="b")

In [None]:
# Save results: the model's string representation, the predictions for the
# test set, and the sorted feature importances
model_description = str(selected_model)
with open('model.txt', 'w') as model_file:
    model_file.write(model_description)


np.savetxt('predictions.csv', predictions, delimiter=',')

feature_importance_df.to_csv('feature_importance.csv')