# Test of RNN for classification, to be converted to regression
Builds on tutorial: https://keras.io/examples/nlp/text_classification_from_scratch/

In [1]:
import sys
import os
workspace_path = os.path.join(os.path.abspath(".."))
sys.path.append(workspace_path)
workspace_path

'c:\\Users\\Kajsa\\VSCodeProjects\\job_discrimination_sandbox'

In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, TextVectorization, Embedding, LSTM, Bidirectional, Dropout, Conv1D, GlobalMaxPooling1D
from keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
pd.options.display.float_format = "{:.2f}".format

In [4]:
path_to_cleaned_data = os.path.join(workspace_path, "data", "cleaned_data")

In [5]:
# Reading application statistics into dataframe
file_path = os.path.join(path_to_cleaned_data, "bulletins_w_labels_and_content.csv")
df = pd.read_csv(file_path, dtype={"ID": str})

In [6]:
df

Unnamed: 0,ID,Job Description,Apps Received,Female,Male,Unknown_Gender,File Names,Label 60/40,Numeric label 60/40,Label 70/30,Numeric label 70/30,Cleaned text,Text
0,9206,311 DIRECTOR,54,20,31,3,311 DIRECTOR 9206 041814.txt,M,2,N,0,director class code open date annual salary du...,311 DIRECTOR Class Code: 9206 Open Date:...
1,1223,ACCOUNTING CLERK,648,488,152,8,ACCOUNTING CLERK 1223 071318.txt,W,1,W,1,accounting clerk class code open date exam ope...,ACCOUNTING CLERK Class Code: 1223 Open ...
2,7260,AIRPORT MANAGER,51,13,37,1,AIRPORT MANAGER 7260 120216.txt,M,2,M,2,airport manager class code open date exam open...,AIRPORT MANAGER Class Code: 7260 Open D...
3,3227,AIRPORT POLICE LIEUTENANT,48,9,38,1,AIRPORT POLICE LIEUTENANT 3227 091616.txt,M,2,M,2,airport police lieutenant class code open date...,AIRPORT POLICE LIEUTENANT ...
4,2400,AQUARIST,40,15,24,1,AQUARIST 2400 050214.txt,M,2,N,0,aquarist class code open date annual salary ca...,AQUARIST Class Code: 2400 Open Date: 05...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,7840,WASTEWATER TREATMENT LABORATORY MANAGER,16,6,9,1,WASTEWATER TREATMENT LABORATORY MANAGER 7840 1...,M,2,N,0,wastewater treatment laboratory manager class ...,WASTEWATER TREATMENT LABORATORY MANAGER Class...
173,4123,WASTEWATER TREATMENT OPERATOR,125,9,113,3,WASTEWATER TREATMENT OPERATOR 120718.txt,M,2,M,2,wastewater treatment operator class code open ...,WASTEWATER TREATMENT OPERATOR Class Code: ...
174,7857,WATER MICROBIOLOGIST,179,89,82,8,WATER MICROBIOLOGIST 7857 072514 rev073114.txt,N,0,N,0,water microbiologist class code open date revi...,WATER MICROBIOLOGIST Class Code: 7857...
175,3912,WATER UTILITY WORKER,96,2,92,2,WATER UTILITY WORKER 3912 120817.txt,M,2,M,2,water utility worker class code open date exam...,WATER UTILITY WORKER Class Code: 3912 Op...


In [7]:
# df.drop(columns=["Label 60/40", "Numeric label 60/40", "Label 70/30", "Numeric label 70/30", "File Names"], inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   177 non-null    object
 1   Job Description      177 non-null    object
 2   Apps Received        177 non-null    int64 
 3   Female               177 non-null    int64 
 4   Male                 177 non-null    int64 
 5   Unknown_Gender       177 non-null    int64 
 6   File Names           177 non-null    object
 7   Label 60/40          177 non-null    object
 8   Numeric label 60/40  177 non-null    int64 
 9   Label 70/30          177 non-null    object
 10  Numeric label 70/30  177 non-null    int64 
 11  Cleaned text         177 non-null    object
 12  Text                 177 non-null    object
dtypes: int64(6), object(7)
memory usage: 18.1+ KB


In [9]:
df["Male share"] = df["Male"] / df["Apps Received"]
df["Female share"] = df["Female"] / df["Apps Received"]
df["Unknown share"] = df["Unknown_Gender"] / df["Apps Received"]

In [10]:
df

Unnamed: 0,ID,Job Description,Apps Received,Female,Male,Unknown_Gender,File Names,Label 60/40,Numeric label 60/40,Label 70/30,Numeric label 70/30,Cleaned text,Text,Male share,Female share,Unknown share
0,9206,311 DIRECTOR,54,20,31,3,311 DIRECTOR 9206 041814.txt,M,2,N,0,director class code open date annual salary du...,311 DIRECTOR Class Code: 9206 Open Date:...,0.57,0.37,0.06
1,1223,ACCOUNTING CLERK,648,488,152,8,ACCOUNTING CLERK 1223 071318.txt,W,1,W,1,accounting clerk class code open date exam ope...,ACCOUNTING CLERK Class Code: 1223 Open ...,0.23,0.75,0.01
2,7260,AIRPORT MANAGER,51,13,37,1,AIRPORT MANAGER 7260 120216.txt,M,2,M,2,airport manager class code open date exam open...,AIRPORT MANAGER Class Code: 7260 Open D...,0.73,0.25,0.02
3,3227,AIRPORT POLICE LIEUTENANT,48,9,38,1,AIRPORT POLICE LIEUTENANT 3227 091616.txt,M,2,M,2,airport police lieutenant class code open date...,AIRPORT POLICE LIEUTENANT ...,0.79,0.19,0.02
4,2400,AQUARIST,40,15,24,1,AQUARIST 2400 050214.txt,M,2,N,0,aquarist class code open date annual salary ca...,AQUARIST Class Code: 2400 Open Date: 05...,0.60,0.38,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,7840,WASTEWATER TREATMENT LABORATORY MANAGER,16,6,9,1,WASTEWATER TREATMENT LABORATORY MANAGER 7840 1...,M,2,N,0,wastewater treatment laboratory manager class ...,WASTEWATER TREATMENT LABORATORY MANAGER Class...,0.56,0.38,0.06
173,4123,WASTEWATER TREATMENT OPERATOR,125,9,113,3,WASTEWATER TREATMENT OPERATOR 120718.txt,M,2,M,2,wastewater treatment operator class code open ...,WASTEWATER TREATMENT OPERATOR Class Code: ...,0.90,0.07,0.02
174,7857,WATER MICROBIOLOGIST,179,89,82,8,WATER MICROBIOLOGIST 7857 072514 rev073114.txt,N,0,N,0,water microbiologist class code open date revi...,WATER MICROBIOLOGIST Class Code: 7857...,0.46,0.50,0.04
175,3912,WATER UTILITY WORKER,96,2,92,2,WATER UTILITY WORKER 3912 120817.txt,M,2,M,2,water utility worker class code open date exam...,WATER UTILITY WORKER Class Code: 3912 Op...,0.96,0.02,0.02


In [11]:
X = df["Cleaned text"]
y = df["Numeric label 70/30"]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1000)

In [13]:
max_features = 5000
embedding_dim = 128
sequence_length = 500

In [14]:
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [15]:
vectorize_layer.adapt(X_train)

In [16]:
len(vectorize_layer.get_vocabulary())

3371

In [17]:
def format_dataset(text, label):
    sentence = vectorize_layer(text)
    return ({"input_ids": text}, label)


train_ds = format_dataset(X_train, y_train)
test_ds = format_dataset(X_test, y_test)

In [21]:
train_ds[1]

143    0
70     2
103    0
4      0
132    2
      ..
1      1
92     2
94     2
71     2
87     2
Name: Numeric label 70/30, Length: 132, dtype: int64

### Building the RNN

In [None]:
# input_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids")

# x = keras_nlp.layers.TokenAndPositionEmbedding(
#     vocabulary_size=VOCAB_SIZE,
#     sequence_length=MAX_SEQUENCE_LENGTH,
#     embedding_dim=EMBED_DIM,
#     mask_zero=True,
# )(input_ids)

# x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
# x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)
# x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)


# x = keras.layers.GlobalAveragePooling1D()(x)
# x = keras.layers.Dropout(0.1)(x)
# outputs = keras.layers.Dense(1, activation="sigmoid")(x)

# fnet_classifier = keras.Model(input_ids, outputs, name="fnet_classifier")

In [25]:
# An integer input for vocab indices.
input_ids = tf.keras.Input(shape=(None,), dtype="int64", name="input_ids")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = Embedding(max_features, embedding_dim)(input_ids)
x = Dropout(0.5)(x)

# Conv1D + global max pooling
x = Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = Dense(128, activation="relu")(x)
x = Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(input_ids)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [26]:
epochs = 3

# Fit the model using the train and test datasets.
model.fit(train_ds, epochs=epochs)

Epoch 1/3


NotImplementedError: in user code:

    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\Kajsa\VSCodeProjects\job_discrimination_sandbox\venv\lib\site-packages\keras\engine\training.py", line 584, in call
        raise NotImplementedError(

    NotImplementedError: Exception encountered when calling layer "model_1" "                 f"(type Model).
    
    Unimplemented `tf.keras.Model.call()`: if you intend to create a `Model` with the Functional API, please provide `inputs` and `outputs` arguments. Otherwise, subclass `Model` with an overridden `call()` method.
    
    Call arguments received by layer "model_1" "                 f"(type Model):
      • inputs=({'input_ids': 'tf.Tensor(shape=(None, 1), dtype=string)'}, 'tf.Tensor(shape=(None, 1), dtype=int64)')
      • training=True
      • mask=None


## Investigate results

In [None]:
y_pred

In [None]:
# Checking if predictions sum to 1, as expected
for line in y_pred:
    print(sum(line))

In [None]:
resultsdf = y_test

resultsdf["Pred Male share"] = [line[0] for line in y_pred]
resultsdf["Pred Female share"] = [line[1] for line in y_pred]
resultsdf["Pred Unknown share"] = [line[2] for line in y_pred]

In [None]:
resultsdf[resultsdf["Pred Male share"] < 0.6]

In [None]:
fig, ax = plt.subplots()
resultsdf.plot(kind="scatter", y="Male share", x="Pred Male share", ax=ax)
ax.plot([0, 1], [0, 1], "--", label="Perfect model")
ax.legend()
plt.show()

In [None]:
fig, ax = plt.subplots()
resultsdf.plot(kind="scatter", y="Female share", x="Pred Female share", ax=ax)
ax.plot([0, 1], [0, 1], "--", label="Perfect model")
ax.legend()
plt.show()

In [None]:
resultsdf.describe()

##### Both plots and statistics show that the male share is under-estimated by the model.

## Cross-validation with MAE and RMSE

In [None]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
MAE_scores = cross_val_score(model, X, y, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)

In [None]:
MAE_scores = np.absolute(MAE_scores)
print('MAE: %.3f (%.3f)' % (np.mean(MAE_scores), np.std(MAE_scores)))

In [None]:
# Comparing MAE to average prediction of male share
np.mean(MAE_scores) / 0.67

In [None]:
RMSE_scores = cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=cv, n_jobs=-1)

In [None]:
RMSE_scores = np.absolute(RMSE_scores)
print('RMSE: %.3f (%.3f)' % (np.mean(RMSE_scores), np.std(RMSE_scores)))