# Test of decision tree regressor with multi-output
Builds on tutorial: https://towardsdatascience.com/recurrent-neural-networks-explained-with-a-real-life-example-and-python-code-e8403a45f5de

In [1]:
import sys
import os
workspace_path = os.path.join(os.path.abspath(".."))
sys.path.append(workspace_path)
workspace_path

'c:\\Users\\Kajsa\\VSCodeProjects\\job_discrimination_sandbox'

In [2]:
import pandas as pd
from pandas.core.frame import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import glob
import re
import warnings
warnings.simplefilter("ignore")
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.losses import BinaryCrossentropy
from keras.layers import Dense, LSTM, Dropout, TextVectorization
from keras.layers import GRUCell, Embedding, Bidirectional, Dense, RNN
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, RepeatedKFold


# Local imports
from src.helpers import *

In [3]:
pd.options.display.float_format = "{:.2f}".format

In [4]:
path_to_cleaned_data = os.path.join(workspace_path, "data", "cleaned_data")

In [5]:
# Reading application statistics into dataframe
file_path = os.path.join(path_to_cleaned_data, "bulletins_w_labels_and_content.csv")
df = pd.read_csv(file_path, dtype={"ID": str})

In [6]:
df

Unnamed: 0,ID,Job Description,Apps Received,Female,Male,Unknown_Gender,File Names,Label 60/40,Numeric label 60/40,Label 70/30,Numeric label 70/30,Cleaned text,Text
0,9206,311 DIRECTOR,54,20,31,3,311 DIRECTOR 9206 041814.txt,M,2,N,0,director class code open date annual salary du...,311 DIRECTOR Class Code: 9206 Open Date:...
1,1223,ACCOUNTING CLERK,648,488,152,8,ACCOUNTING CLERK 1223 071318.txt,W,1,W,1,accounting clerk class code open date exam ope...,ACCOUNTING CLERK Class Code: 1223 Open ...
2,7260,AIRPORT MANAGER,51,13,37,1,AIRPORT MANAGER 7260 120216.txt,M,2,M,2,airport manager class code open date exam open...,AIRPORT MANAGER Class Code: 7260 Open D...
3,3227,AIRPORT POLICE LIEUTENANT,48,9,38,1,AIRPORT POLICE LIEUTENANT 3227 091616.txt,M,2,M,2,airport police lieutenant class code open date...,AIRPORT POLICE LIEUTENANT ...
4,2400,AQUARIST,40,15,24,1,AQUARIST 2400 050214.txt,M,2,N,0,aquarist class code open date annual salary ca...,AQUARIST Class Code: 2400 Open Date: 05...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,7840,WASTEWATER TREATMENT LABORATORY MANAGER,16,6,9,1,WASTEWATER TREATMENT LABORATORY MANAGER 7840 1...,M,2,N,0,wastewater treatment laboratory manager class ...,WASTEWATER TREATMENT LABORATORY MANAGER Class...
173,4123,WASTEWATER TREATMENT OPERATOR,125,9,113,3,WASTEWATER TREATMENT OPERATOR 120718.txt,M,2,M,2,wastewater treatment operator class code open ...,WASTEWATER TREATMENT OPERATOR Class Code: ...
174,7857,WATER MICROBIOLOGIST,179,89,82,8,WATER MICROBIOLOGIST 7857 072514 rev073114.txt,N,0,N,0,water microbiologist class code open date revi...,WATER MICROBIOLOGIST Class Code: 7857...
175,3912,WATER UTILITY WORKER,96,2,92,2,WATER UTILITY WORKER 3912 120817.txt,M,2,M,2,water utility worker class code open date exam...,WATER UTILITY WORKER Class Code: 3912 Op...


In [7]:
df.drop(columns=["Label 60/40", "Numeric label 60/40", "Label 70/30", "Numeric label 70/30", "File Names"], inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               177 non-null    object
 1   Job Description  177 non-null    object
 2   Apps Received    177 non-null    int64 
 3   Female           177 non-null    int64 
 4   Male             177 non-null    int64 
 5   Unknown_Gender   177 non-null    int64 
 6   Cleaned text     177 non-null    object
 7   Text             177 non-null    object
dtypes: int64(4), object(4)
memory usage: 11.2+ KB


In [9]:
df["Male share"] = df["Male"] / df["Apps Received"]
df["Female share"] = df["Female"] / df["Apps Received"]
df["Unknown share"] = df["Unknown_Gender"] / df["Apps Received"]

In [10]:
df

Unnamed: 0,ID,Job Description,Apps Received,Female,Male,Unknown_Gender,Cleaned text,Text,Male share,Female share,Unknown share
0,9206,311 DIRECTOR,54,20,31,3,director class code open date annual salary du...,311 DIRECTOR Class Code: 9206 Open Date:...,0.57,0.37,0.06
1,1223,ACCOUNTING CLERK,648,488,152,8,accounting clerk class code open date exam ope...,ACCOUNTING CLERK Class Code: 1223 Open ...,0.23,0.75,0.01
2,7260,AIRPORT MANAGER,51,13,37,1,airport manager class code open date exam open...,AIRPORT MANAGER Class Code: 7260 Open D...,0.73,0.25,0.02
3,3227,AIRPORT POLICE LIEUTENANT,48,9,38,1,airport police lieutenant class code open date...,AIRPORT POLICE LIEUTENANT ...,0.79,0.19,0.02
4,2400,AQUARIST,40,15,24,1,aquarist class code open date annual salary ca...,AQUARIST Class Code: 2400 Open Date: 05...,0.60,0.38,0.03
...,...,...,...,...,...,...,...,...,...,...,...
172,7840,WASTEWATER TREATMENT LABORATORY MANAGER,16,6,9,1,wastewater treatment laboratory manager class ...,WASTEWATER TREATMENT LABORATORY MANAGER Class...,0.56,0.38,0.06
173,4123,WASTEWATER TREATMENT OPERATOR,125,9,113,3,wastewater treatment operator class code open ...,WASTEWATER TREATMENT OPERATOR Class Code: ...,0.90,0.07,0.02
174,7857,WATER MICROBIOLOGIST,179,89,82,8,water microbiologist class code open date revi...,WATER MICROBIOLOGIST Class Code: 7857...,0.46,0.50,0.04
175,3912,WATER UTILITY WORKER,96,2,92,2,water utility worker class code open date exam...,WATER UTILITY WORKER Class Code: 3912 Op...,0.96,0.02,0.02


In [11]:
X = df["Text"]
y = df["Male share"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1000)

In [13]:

# Example training data, of dtype `string`.
training_data = np.array(X_train)
test_data = np.array(X_test)

# Create a TextVectorization layer instance. It can be configured to either
# return integer token indices, or a dense token representation (e.g. multi-hot
# or TF-IDF). The text standardization and text splitting algorithms are fully
# configurable.
vectorizer = TextVectorization(output_mode="int")

# Calling `adapt` on an array or dataset makes the layer generate a vocabulary
# index for the data, which can then be reused when seeing new data.
vectorizer.adapt(training_data)

# After calling adapt, the layer is able to encode any n-gram it has seen before
# in the `adapt()` data. Unknown n-grams are encoded via an "out-of-vocabulary"
# token.
integer_data = vectorizer(training_data)
integer_test_data = vectorizer(test_data)
print(integer_test_data)

tf.Tensor(
[[433  56 684 ...   0   0   0]
 [657   4 493 ...   0   0   0]
 [107  56 373 ...   0   0   0]
 ...
 [107 171 426 ...   0   0   0]
 [972 377 624 ...   0   0   0]
 [330   1  65 ...   0   0   0]], shape=(45, 2826), dtype=int64)


In [14]:
integer_data.shape

TensorShape([132, 2442])

In [15]:
integer_test_data.shape

TensorShape([45, 2826])

In [16]:
vocabulary = vectorizer.get_vocabulary()

In [17]:
len(vocabulary)

5451

In [19]:
length = len(integer_data[0])

## Initializing model, training and prediction

### Building the RNN
(using GRU cells and Hyperbolic tangent as activation function)

In [20]:
cell = GRUCell(30, recurrent_activation="tanh")

In [22]:
model = Sequential([
    vectorizer,
    Embedding(
        input_dim=length,
        output_dim=64,
        mask_zero=True  # mask_zero is to handle the variable sequence lengths
    ),
    Bidirectional(RNN(cell)),
    Dense(60, activation="tanh"),
    Dense(1)
])

In [23]:
# Compile the model and use the algorithm Adam as optimization function
model.compile(
    loss=BinaryCrossentropy(from_logits=True),
    optimizer=Adam(1e-2),
    metrics="mean_absolute_error"
    )

In [28]:
# Fit the RNN to the training set
model.fit(integer_data, np.array(y_train), epochs=3)

Epoch 1/3


ValueError: in user code:

    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\layers\preprocessing\text_vectorization.py", line 564, in _preprocess
        raise ValueError(

    ValueError: Exception encountered when calling layer "text_vectorization" "                 f"(type TextVectorization).
    
    When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 2442) with rank=2
    
    Call arguments received by layer "text_vectorization" "                 f"(type TextVectorization):
      • inputs=tf.Tensor(shape=(None, 2442), dtype=string)


## Investigate results

In [None]:
y_pred

In [None]:
# Checking if predictions sum to 1, as expected
for line in y_pred:
    print(sum(line))

In [None]:
resultsdf = y_test

resultsdf["Pred Male share"] = [line[0] for line in y_pred]
resultsdf["Pred Female share"] = [line[1] for line in y_pred]
resultsdf["Pred Unknown share"] = [line[2] for line in y_pred]

In [None]:
resultsdf[resultsdf["Pred Male share"] < 0.6]

In [None]:
fig, ax = plt.subplots()
resultsdf.plot(kind="scatter", y="Male share", x="Pred Male share", ax=ax)
ax.plot([0, 1], [0, 1], "--", label="Perfect model")
ax.legend()
plt.show()

In [None]:
fig, ax = plt.subplots()
resultsdf.plot(kind="scatter", y="Female share", x="Pred Female share", ax=ax)
ax.plot([0, 1], [0, 1], "--", label="Perfect model")
ax.legend()
plt.show()

In [None]:
resultsdf.describe()

##### Both plots and statistics show that the male share is under-estimated by the model.

## Cross-validation with MAE and RMSE

In [None]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
MAE_scores = cross_val_score(model, X, y, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)

In [None]:
MAE_scores = np.absolute(MAE_scores)
print('MAE: %.3f (%.3f)' % (np.mean(MAE_scores), np.std(MAE_scores)))

In [None]:
# Comparing MAE to average prediction of male share
np.mean(MAE_scores) / 0.67

In [None]:
RMSE_scores = cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=cv, n_jobs=-1)

In [None]:
RMSE_scores = np.absolute(RMSE_scores)
print('RMSE: %.3f (%.3f)' % (np.mean(RMSE_scores), np.std(RMSE_scores)))