# Import

**Importing the libraries**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

In [3]:
%%capture
# this is github gist URL, there are all functions and imports I need to reuse often across many notebooks
!wget -O package.py https://gist.githubusercontent.com/Jankoetf/c36cc24ddd83d4194148a86f87efd397/raw/adf1e7c72dfe8db685ad936f8882d42ac85ae5b7/package1.py
import package

**importing the datasets**

In [4]:
dataset = pd.read_csv('jobfair_train.csv')
dataset = package.basic_preprocessing(dataset)
dataset = package.Feature_Selection(dataset)
dataset = package.Averaging_by_leagues(dataset)
print(dataset.shape)

(55314, 18)


# **Regresional** Neural Network using sci-keras, **basic** aproach

In [4]:
y = dataset['league_rank'].iloc[:].values
dataset = dataset.drop('league_rank', axis = 1)
X = dataset.iloc[:, :].values

print(y.shape)
print(X.shape)

(55314,)
(55314, 17)


## Split, standardization

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)

## Creating a model

In [None]:
!pip install scikeras
from scikeras.wrappers import KerasRegressor

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
import tensorflow as tf

def create_model(n_layers=1, n_neurons=32, learning_rate=0.001, l2_reg=0.0001):
    model = Sequential()
    model.add(Dense(n_neurons, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(l2_reg)))
    for _ in range(n_layers - 1):
        model.add(Dense(n_neurons, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mean_absolute_error', metrics=['mae'])
    return model

## Early stopping

In [20]:
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import GridSearchCV
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

## Architecture hyperparameters tunning

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model = KerasRegressor(model=create_model, epochs=25, batch_size=32, verbose=0, callbacks=[early_stopping])

param_grid_architecture = {
    'model__n_layers': [1, 2, 3],
    'model__n_neurons': [6, 10, 14]
}

grid_architecture = GridSearchCV(estimator=model, param_grid=param_grid_architecture,scoring='neg_mean_absolute_error', cv=3, n_jobs=-1)
grid_architecture.fit(X_train, y_train, validation_data=(X_val, y_val))

In [12]:
best_architecture = grid_architecture.best_params_
print(best_architecture)
best_accuracy = grid_architecture.best_score_
print(best_accuracy)

{'model__n_layers': 3, 'model__n_neurons': 10}
-1.94439122427821


## Regularization hyperparameters tunning

In [13]:
param_grid_hyperparameters = {
    #best architecture:
    'model__n_layers': [3],
    'model__n_neurons': [10],

    #regularization
    'model__learning_rate': [0.0001, 0.001, 0.01],
    'model__l2_reg': [0.0001, 0.001, 0.01]
}

In [14]:
model = KerasRegressor(model=create_model, epochs=25, batch_size=32, verbose=0, callbacks=[early_stopping])

grid_hyperparameters = GridSearchCV(estimator=model, param_grid=param_grid_hyperparameters,scoring='neg_mean_absolute_error', cv=3, n_jobs=-1)
grid_hyperparameters_result = grid_hyperparameters.fit(X_train, y_train, validation_data=(X_val, y_val))

In [17]:
best_regularizers = grid_hyperparameters_result.best_params_
print(best_regularizers)
best_accuracy = grid_hyperparameters_result.best_score_
print(best_accuracy)

{'model__l2_reg': 0.0001, 'model__learning_rate': 0.001, 'model__n_layers': 3, 'model__n_neurons': 10}
-1.9406360184295657


## Fune tunning

In [18]:
param_grid_fine = {
    #best architecture:
    'model__n_layers': [3],
    'model__n_neurons': [10],

    #regularization
    'model__learning_rate': [0.001],
    'model__l2_reg': [0.0001],
    'batch_size': [14, 16, 24, 32]
}

In [19]:
model = KerasRegressor(model=create_model, epochs=25, batch_size=32, verbose=0, callbacks=[early_stopping])

grid_fine = GridSearchCV(estimator=model, param_grid=param_grid_fine,scoring='neg_mean_absolute_error', cv=3, n_jobs=-1)
grid_fine_result = grid_fine.fit(X_train, y_train, validation_data=(X_val, y_val))

In [20]:
best_fine = grid_fine_result.best_params_
print(best_fine)
best_accuracy = grid_fine_result.best_score_
print(best_accuracy)

{'batch_size': 24, 'model__l2_reg': 0.0001, 'model__learning_rate': 0.001, 'model__n_layers': 3, 'model__n_neurons': 10}
-1.9503732110311371


## testing

In [21]:
#creating best model again for testing
best_model = Sequential()
best_model.add(Dense(10, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.0001)))
best_model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.0001)))
best_model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.0001)))
best_model.add(Dense(1, activation='linear'))
best_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_absolute_error', metrics=['mae'])

In [None]:
best_model.fit(X_train, y_train, batch_size = 24, epochs = 25, \
        validation_data=(X_val, y_val), callbacks=[early_stopping])

In [None]:
raw_predictions_train = best_model.predict(X_train)
raw_predictions_val = best_model.predict(X_val)
raw_predictions_test = best_model.predict(X_test)

In [14]:
from sklearn.metrics import mean_absolute_error
print("train")
mae_train = mean_absolute_error(y_train, raw_predictions_train)
print(f"Mean Absolute Error - predictions - val set: {mae_train}")
print("val")
mae_val = mean_absolute_error(y_val, raw_predictions_val)
print(f"Mean Absolute Error - predictions - val set: {mae_val}")
print("test")
mae_test = mean_absolute_error(y_test, raw_predictions_test)
print(f"Mean Absolute Error - predictions - val set: {mae_test}")

train
Mean Absolute Error - predictions - val set: 1.9397284713035656
val
Mean Absolute Error - predictions - val set: 1.9655652811861446
test
Mean Absolute Error - predictions - val set: 1.9843720348664553


In [15]:
predictions_train = package.post_processing_1(raw_predictions_train)
predictions_val = package.post_processing_1(raw_predictions_val)
predictions_test = package.post_processing_1(raw_predictions_test)

In [16]:
from sklearn.metrics import mean_absolute_error
print("train")
mae_train = mean_absolute_error(y_train, predictions_train)
print(f"Mean Absolute Error - predictions - val set: {mae_train}")
print("val")
mae_val = mean_absolute_error(y_val, predictions_val)
print(f"Mean Absolute Error - predictions - val set: {mae_val}")
print("test")
mae_test = mean_absolute_error(y_test, predictions_test)
print(f"Mean Absolute Error - predictions - val set: {mae_test}")

train
Mean Absolute Error - predictions - val set: 1.919780986079186
val
Mean Absolute Error - predictions - val set: 1.948415089791491
test
Mean Absolute Error - predictions - val set: 1.963364666184623


After tunning hyperparameters average MAE is around 1.95.

# **Regresional** Neural Network using sci-keras, **adapted** aproach

In [9]:
'''import'''
dataset = pd.read_csv('jobfair_train.csv')

'''sorting'''
dataset = dataset.sort_values(by='league_id')

'''preprocesing'''
dataset = package.basic_preprocessing(dataset)
dataset = package.Feature_Selection(dataset)
dataset = package.Averaging_by_leagues(dataset)
print(dataset.shape)

(55314, 18)


In [10]:
X_train, y_train, X_val, y_val, X_test, y_test = package.train_val_test_split_adapted_shuffled(dataset)

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_val_scaled = sc.transform(X_val)
X_test_scaled = sc.transform(X_test)

## Tuning hyperparameters

In [29]:
from sklearn.metrics import mean_absolute_error
def evaluate_network(model, X_val_test, y_true, verbose = 0):
    y_pred = model.predict(X_val_test)
    y_pred_sorted = package.post_processing_1(y_pred)

    mae_val_test = mean_absolute_error(y_true, y_pred)
    mae_val_test_sorted = mean_absolute_error(y_true, y_pred_sorted)
    if verbose:
        print(f"Mean Absolute Error - predictions: {mae_val_test}")
        print(f"Mean Absolute Error - sorted prediction: {mae_val_test_sorted}")
    else:
        return mae_val_test, mae_val_test_sorted

In [None]:
n_layers_values = [2,3,4]
m_neurons_in_layer = [6,8,10,12]
learning_rate_values = [0.0001,0.001,0.01]

params = {}
params_processed = {}
for lr in learning_rate_values:
    for n in n_layers_values:
        for m in m_neurons_in_layer:
            model = create_model(n_layers=n, n_neurons=m, learning_rate=lr, l2_reg=0.0001)
            model.fit(X_train_scaled, y_train, batch_size = 14, epochs = 25, validation_data=(X_val_scaled, y_val), callbacks=[early_stopping])
            t_val, t_val_sorted = evaluate_network(model, X_val_scaled, y_val, 0)
            t_test, t_test_sorted = evaluate_network(model, X_test_scaled, y_test, 0)
            params[(lr, n, m)] =  (t_val + t_test)/2
            params_processed[(lr, n, m)] =  (t_val_sorted + t_test_sorted)/2

params = dict(sorted(params.items(), key=lambda item: item[1], reverse=False))
params_sorted = dict(sorted(params_processed.items(), key=lambda item: item[1], reverse=False))

In [41]:
print(list(params.keys())[:3])
print(list(params_sorted.keys())[:3])
print(list(params.values())[:3])
print(list(params_sorted.values())[:3])

[(0.001, 3, 10), (0.001, 4, 10), (0.001, 3, 12)]
[(0.001, 3, 10), (0.001, 4, 10), (0.001, 4, 12)]
[1.9249524989016509, 1.926768621165516, 1.931922293720591]
[1.907652195003638, 1.9083798205190394, 1.9137157409653165]


## Testing

In [26]:
'''import'''
dataset = pd.read_csv('jobfair_train.csv')

'''sorting'''
dataset = dataset.sort_values(by='league_id')

'''preprocesing'''
dataset = package.basic_preprocessing(dataset)
dataset = package.Feature_Selection(dataset)
dataset = package.Averaging_by_leagues(dataset)
print(dataset.shape)

(55314, 18)


In [39]:
X_train, y_train, X_val, y_val, X_test, y_test = package.train_val_test_split_adapted_shuffled(dataset)

In [40]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_val_scaled = sc.transform(X_val)
X_test_scaled = sc.transform(X_test)

In [41]:
best_model = Sequential()
best_model.add(Dense(10, activation='relu', input_shape=(X_train_scaled.shape[1],), kernel_regularizer=l2(0.0001)))
best_model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.0001)))
best_model.add(Dense(10, activation='relu', kernel_regularizer=l2(0.0001)))
best_model.add(Dense(1, activation='linear'))
best_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_absolute_error', metrics=['mae'])

In [None]:
best_model.fit(X_train_scaled, y_train, batch_size = 14, epochs = 25, \
        validation_data=(X_val_scaled, y_val), callbacks=[early_stopping])

In [None]:
raw_predictions_train = best_model.predict(X_train_scaled)
raw_predictions_val = best_model.predict(X_val_scaled)
raw_predictions_test = best_model.predict(X_test_scaled)

In [51]:
from sklearn.metrics import mean_absolute_error
print("train")
mae_train = mean_absolute_error(y_train, raw_predictions_train)
print(f"Mean Absolute Error - predictions - val set: {mae_train}")
print("val")
mae_val = mean_absolute_error(y_val, raw_predictions_val)
print(f"Mean Absolute Error - predictions - val set: {mae_val}")
print("test")
mae_test = mean_absolute_error(y_test, raw_predictions_test)
print(f"Mean Absolute Error - predictions - val set: {mae_test}")

train
Mean Absolute Error - predictions - val set: 1.941983819287729
val
Mean Absolute Error - predictions - val set: 1.9330077300783053
test
Mean Absolute Error - predictions - val set: 1.9464478289764189


**Postprocessing**

In [52]:
predictions_train = package.post_processing_1(raw_predictions_train)
predictions_val = package.post_processing_1(raw_predictions_val)
predictions_test = package.post_processing_1(raw_predictions_test)

In [53]:
from sklearn.metrics import mean_absolute_error
print("train - sorted")
mae_train = mean_absolute_error(y_train, predictions_train)
print(f"Mean Absolute Error - predictions - val set: {mae_train}")
print("val - sorted")
mae_val = mean_absolute_error(y_val, predictions_val)
print(f"Mean Absolute Error - predictions - val set: {mae_val}")
print("test - sorted")
mae_test = mean_absolute_error(y_test, predictions_test)
print(f"Mean Absolute Error - predictions - val set: {mae_test}")

train - sorted
Mean Absolute Error - predictions - val set: 1.9236453201970443
val - sorted
Mean Absolute Error - predictions - val set: 1.9198482932996208
test - sorted
Mean Absolute Error - predictions - val set: 1.9271476032273374


This way MAE is around 1.91 - 1.92