<a href="https://colab.research.google.com/github/Hridoy1750/Developer-Portfolio/blob/main/Fake_Job_Posting_Detection_Capstone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Real & Fake Job Postings
Dataset obtained from Kaggle: https://www.kaggle.com/shivamb/real-or-fake-fake-jobposting-prediction

## Importing Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.python.ops.math_ops import reduce_prod
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
jobs_file = tf.keras.utils.get_file(
    fname="fraudulent_jobs.csv",
    origin="https://uofi.box.com/shared/static/sfw0eqvj7q49vmexpztqke7xzhspevnb.csv"
)

df=pd.read_csv(jobs_file)

df.head()

In [None]:
#drop jobs_id column and update 'work_remote' to 'work_remote's
df.drop(columns=['job_id'], inplace=True)
df.rename(columns={'telecommuting': 'work_remote'}, inplace=True)
df.columns

## Exploring the Dataset

In [None]:
df.info()

In [None]:
df.nunique() #get # of unique values in dataset

In [None]:
df.isna().sum()

In [None]:
df['fraudulent'].value_counts()
#data is very imbalanced

* Text columns are title, location, department, company profile, description, requirements, and benefits, industry, function.
* Categorical columns are employment type, required experience, and required education
* Numeric variables are salary (low/high), work_remote, has company logo, and has questions
* Column to Predict is fraudulent

In [None]:
#separate columns based on their type
text_cols = ['title', 'location', 'department', 'company_profile', 'description', 'requirements', 'benefits']
categorical_cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function']
numeric_cols = ['work_remote', 'has_company_logo', 'has_questions', 'salary_low', 'salary_high']
col_to_predict = ['fraudulent']

## Cleaning Dataset


### Text Data

In [None]:
def move_column(df, col):
  df['Temp_Col'] = df[col]
  df.drop(columns=[col], inplace=True)
  df.rename(columns={'Temp_Col': col}, inplace=True)
  return df

In [None]:
df['employment_type'].fillna('No Data', inplace=True)
df['required_experience'].fillna('No Data', inplace=True)
df['required_education'].fillna('No Data', inplace=True)
df['industry'].fillna('No Data', inplace=True)
df['function'].fillna('No Data', inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.notna().sum()

In [None]:
df[df['description'].isna()] #only 1 row with an empty description -- the post only has title and location and it is a fraud post --> keeping it

In [None]:
#rows that are entirely null in the text columns
null_text = df[(df['location'].isna()) & (df['department'].isna()) & (df['company_profile'].isna()) & (df['requirements'].isna()) & (df['benefits'].isna())]
# 'title', 'location', 'department', 'company_profile', 'description', 'requirements', 'benefits'
print(f"There are {null_text.shape[0]} rows where all text columns aside from 'title' and 'description' are empty. Of these rows, {null_text['fraudulent'].sum()} are fraudulent posts.")

In [None]:
for col in text_cols:
  df[col].fillna(value=" ", inplace=True)
  print(f"Max length of '{col}': {df[col].map(len).max()}") #get maximum number of characters in each column

In [None]:
#combine all text into 1 column - full_text
df['full_text'] = df['title'] + " " + df['location'] + " " + df['department']  + " " + df['company_profile']  + " " + df['description']  + " " + df['requirements']  + " "  + df['benefits']
df['full_text'][0]

In [None]:
print(f"Min length of 'full text': {df['full_text'].map(len).min()}")
print(f"Max length of 'full text': {df['full_text'].map(len).max()}")

In [None]:
def update_text(new_df):
  new_df['full_text'] = new_df['full_text'].str.replace(pat='US', repl='USA', case=True)
  new_df['location'] = new_df['location'].str.replace(pat='US', repl='USA', case=True)
  return new_df

In [None]:
# update US with USA so it does not get mixed up with the word 'us'
df = update_text(df)

#replace urls, email, phone numbers (contact details) ?? maybe presence/absence could indicate a fake post?
# df['full_text'] = df['full_text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', ' ', regex=True).replace(r'#PHONE\S+', '', regex=True).replace(r'#EMAIL\S+', '', regex=True)

In [None]:
#separate 2 separate words that have been put together (i.e. PinterestLoves -> Pinterest Loves)
def space_words(all_text):
  import re
  # print('start', all_text)
  result = re.sub('(?<=[A-Za-z])(?=[A-Z][a-z])', '~', all_text)
  result = re.split('~', result)
  result = ' '.join(result)
  all_text = result
  # print('end',all_text)
  return all_text

In [None]:
texts = ['title', 'location', 'department', 'company_profile', 'description',
       'requirements', 'benefits', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function', 'full_text']

for col in texts:
  df[col] = df[col].apply(space_words)

In [None]:
df.head(1)

In [None]:
def get_unique_words(new_df, col):
  unique_words = []
  idx = list(new_df.index.values)
  for i in idx:
    x = list(new_df[col][i].split())
    unique_words += x
  return unique_words

In [None]:
for col in texts:
  print(f'Number of total words in {col}: {len(get_unique_words(df, col))}')
  print(f'Number of unique words in {col}: {len(set(get_unique_words(df, col)))}\n')

### Salary Range Column


In [None]:
salary_df = df['salary_range'].str.split(pat='-', n=-1, expand=True) #separate salary range into two columns
salary_df[0].unique()[90:100] #months (strings) are included as salaries due to csv interpretation of the salary range

In [None]:
#convert the months in the salary columns into integers
months_to_int = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

for i in range(salary_df[0].shape[0]):
  if salary_df[0][i] in months_to_int.keys():
    salary_df[0][i] = months_to_int[salary_df[0][i]]

for i in range(salary_df[1].shape[0]):
  if salary_df[1][i] in months_to_int.keys():
    salary_df[1][i] = months_to_int[salary_df[1][i]]

In [None]:
#verify conversion of string months to integers
salary_df[0].unique()[90:100]

In [None]:
salary_df.fillna(value=-1, inplace=True)
salary_df[0] = np.array(salary_df[0], dtype='int64')
salary_df[1] = np.array(salary_df[1], dtype='int64')

#move the min/max salary columns to the main dataframe
df['salary_low'] = salary_df[0]
df['salary_high'] = salary_df[1]
df.drop(columns=['salary_range'], inplace=True)

df['salary_low'].replace(-1, np.nan, inplace=True)
df['salary_high'].replace(-1, np.nan, inplace=True)

df['salary_low'] = df['salary_low'].astype('Int64')
df['salary_high'] = df['salary_high'].astype('Int64')

In [None]:
df[df['salary_low'].notnull() & df['salary_high'].notnull()][:3] #some examples showing that the salary was converted back to an int value

#### Replace Missing Values

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

#split df into train/val/test so I can use the mean of salary_low & salary_high from the training data
print('dataframe shape:',df.shape)

train, test_x = train_test_split(df, train_size=0.8, random_state=1, shuffle=True, stratify=df.fraudulent.values)
print("train shape:",train.shape)
print('test shape:',test_x.shape)

train_x, val_x = train_test_split(train, train_size = 0.8, random_state=1, shuffle = True, stratify=train.fraudulent.values)
print("train_x shape:",train_x.shape)
print('val shape:',val_x.shape)

In [None]:
train_x['employment_type'].isna().sum()

In [None]:
# Assuming 'numeric_cols' is a list containing the names of the numeric columns:
train_x[numeric_cols].groupby(train_x['employment_type']).mean()

In [None]:
salary_means = {}
employment_types = list(train_x['employment_type'].unique())
employment_types.sort()
employment_types

In [None]:
low_means = list(train_x.groupby('employment_type')['salary_low'].mean())
high_means = list(train_x.groupby('employment_type')['salary_high'].mean())
salary_means = {}
for i in range(len(employment_types)):
  salary_means[employment_types[i]] = [int(low_means[i]), int(high_means[i])]

print("List of Means (from train_x dataset):", salary_means)

In [None]:
for et in employment_types:
    temp = df[df['employment_type'] == et] #separate the group
    temp['salary_low'].fillna(salary_means[et][0], inplace=True)
    temp['salary_high'].fillna(salary_means[et][1], inplace=True)
    df.update(temp, overwrite=False)

df['salary_low'] = df['salary_low'].astype('int64')
df['salary_high'] = df['salary_high'].astype('int64')
print(df.shape)
df.isna().sum()

In [None]:
df.info() #confirms that the salary columns are int64 and not float64

### Additional Text Processing

In [None]:
df_preproc_done = df.copy()
df_preproc_done.columns

In [None]:
# combine all text columns into the title column
# drop all text columns except full_text
# rename 'title' column to 'full_text'
df['title'] = df['full_text']
df.drop(columns=['location', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'full_text'], inplace=True)
df.rename(columns={'title': 'full_text'}, inplace=True)

#put fraud column at the end of the dataframe
df = move_column(df, 'fraudulent')
df.head(1)

Completed:
* replaced NA values in categorical columns with 'No Data'
* created 'full_text' column with all textual columns combined
* split the 'salary_range' column into 2 for the range --> salary_low, salary_high
* replace 'months' in salary columns with integers
* replace NA values in salary columns with -1, create new columns in main dataframe for salary (low/high), replace -1 with NaN and cast to type Integer64
* move full_text to the front of the dataframe, move fraudulent to the end of the dataframe; and rename both columns

## Create real and fraud datasets (visualization)

In [None]:
df['fraudulent'].value_counts() #data is very imbalanced towards real posts

In [None]:
real_df = df[df['fraudulent'] == 0]
fraud_df = df[df['fraudulent'] == 1]

In [None]:
real_df.info()

In [None]:
fraud_df.info()

In [None]:
print("Min and Max Lengths of 'full_text' in train, val, test ----")
print(f"train - min: {train_x['full_text'].map(len).min()}, max: {train_x['full_text'].map(len).max()}")
print(f"val - min: {val_x['full_text'].map(len).min()}, max: {val_x['full_text'].map(len).max()}")
print(f"test - min: {test_x['full_text'].map(len).min()}, max: {test_x['full_text'].map(len).max()}")

## Visualizing the Dataset

In [None]:
import collections
from collections import Counter
import seaborn as sns

In [None]:
sns.countplot(x='fraudulent', data=df, palette='hls')
plt.show()

In [None]:
df['employment_type'].value_counts()

In [None]:
table=pd.crosstab(df.employment_type, df.fraudulent)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Employment Type vs Fraud Posts')
plt.xlabel('Employment Type')
plt.ylabel('Fraud Posts')
plt.show()

In [None]:
df['required_experience'].value_counts()

In [None]:
%matplotlib inline

table=pd.crosstab(df.required_experience, df.fraudulent)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Fraud Posts Based on Required Experience')
plt.xlabel('Required Experience')
plt.ylabel('Fraud Posts')
plt.show()

In [None]:
df['required_education'].value_counts()

In [None]:
table=pd.crosstab(df.required_education, df.fraudulent)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Required Education vs Fraud Posts')
plt.xlabel('Required Education')
plt.ylabel('Fraud Posts')
plt.show()

In [None]:
df[(df['required_education'] == 'Some High School Coursework') & (df['fraudulent'] == 1)].shape[0] #20 fraud posts out of 27 total posts

In [None]:
#top 20 most common industries - ALL POSTS
industry=list(df['industry'])
counts = Counter(industry).most_common(20)
counts_df = pd.DataFrame(counts)
counts_df.columns=['Industry', 'Number of Posts']
fig, ax = plt.subplots(figsize=(12,12))
ax = sns.barplot(y='Industry', x='Number of Posts', ax=ax, data=counts_df)
plt.title("Top 20 Most Common Industries Listed in All Job Postings")
plt.show()
#top 5: no data, information technology&services, computer software, internet, education management

In [None]:
#top 20 most common industries - FRAUD POSTS
industry=list(fraud_df['industry'])
counts = Counter(industry).most_common(20)
counts_df = pd.DataFrame(counts)
counts_df.columns=['Industry', 'Number of Posts']
fig, ax = plt.subplots(figsize=(12,12))
ax = sns.barplot(y='Industry', x='Number of Posts', ax=ax, data=counts_df)
plt.title("Top 20 Most Common Industries Listed in Fake Posts")
plt.show()
#top 5: no data, oil&energy, accounting, hospital&health care, marketing&advertising

In [None]:
#top 20 most common industries - REAL POSTS
industry=list(real_df['industry'])
counts = Counter(industry).most_common(20)
counts_df = pd.DataFrame(counts)
counts_df.columns=['Industry', 'Number of Posts']
fig, ax = plt.subplots(figsize=(12,12))
ax = sns.barplot(y='Industry', x='Number of Posts', ax=ax, data=counts_df)
plt.title("Top 20 Most Common Industries Listed in Real Posts")
plt.show()
#top 5: no data, information technology&services, computer software, internet, education management

In [None]:
#top 20 most common functions in ALL JOB POSTINGS
function=list(df['function'])
counts = Counter(function).most_common(20)
counts_df = pd.DataFrame(counts)
counts_df.columns=['Function', 'Number of Posts']
fig, ax = plt.subplots(figsize=(12,12))
ax = sns.barplot(y='Function', x='Number of Posts', ax=ax, data=counts_df)
plt.title('Top 20 Most Common Functions among All Job Postings')
plt.show()
#top 5 = no data, administrative, engineering, customer service, sales

In [None]:
#top 20 most common functions in fraud posts
function=list(fraud_df['function'])
counts = Counter(function).most_common(20)
counts_df = pd.DataFrame(counts)
counts_df.columns=['Function', 'Number of Posts']
fig, ax = plt.subplots(figsize=(12,12))
ax = sns.barplot(y='Function', x='Number of Posts', ax=ax, data=counts_df)
plt.title('Top 20 Most Common Functions in Fraud Posts')
plt.show()
#top 5 = no data, administrative, engineering, customer service, sales

In [None]:
#top 20 most common functions in real posts
function=list(real_df['function'])
counts = Counter(function).most_common(20)
counts_df = pd.DataFrame(counts)
counts_df.columns=['Function', 'Number of Posts']
fig, ax = plt.subplots(figsize=(12,12))
ax = sns.barplot(y='Function', x='Number of Posts', ax=ax, data=counts_df)
plt.title('Top 20 Most Common Functions in Real Posts')
plt.show()
#top 5 = no data, information technology, sales, engineering, customer service

In [None]:
table=pd.crosstab(df.work_remote, df.fraudulent)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar')
plt.title('Stacked Bar Chart of "Work_Remote"')
plt.xlabel('Remote Work')
plt.ylabel('Fraud Posts')
plt.show()

In [None]:
table=pd.crosstab(df.has_company_logo, df.fraudulent)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar')
plt.title('Stacked Bar Chart of "Has Company Logo"')
plt.xlabel('Has Company Logo')
plt.ylabel('Fraud Posts')
plt.show()

In [None]:
table=pd.crosstab(df.has_questions, df.fraudulent)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar')
plt.title('Stacked Bar Chart of "Has Questions"')
plt.xlabel('Has Questions')
plt.ylabel('Fraud Posts')
plt.show()

## One-Hot Encode Categorical Variables

In [None]:
print("Shape before one-hot-encoding:", df.shape)

In [None]:
df = pd.get_dummies(df, columns=['employment_type', 'required_experience', 'required_education', 'industry', 'function']) #categorical_cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function']
print('Shape after one-hot-encoding:',df.shape) #(17880, 205)
df.head(1)

## Split Data into Train, Val, and Test Sets

The data had already been split to get the salary mean for the training set, but now I will re-split it again (random_state = 1 will ensure that results were same as before) now that there are no NA values.

In [None]:
print('dataframe shape:',df.shape)
train, test_x = train_test_split(df, train_size=0.8, random_state=1, shuffle=True, stratify=df.fraudulent.values)
# print("train shape:",train.shape)

train_x, val_x= train_test_split(train, train_size = 0.8, random_state=1, shuffle = True, stratify=train.fraudulent.values)
print("train_x shape:",train_x.shape)
print('val shape:',val_x.shape)
print('test shape:',test_x.shape)

In [None]:
train_x.nunique() #can't see all unique values so going to break it down into 5 sets of 41 columns

In [None]:
train_x.nunique()[:41]

In [None]:
train_x.nunique()[41:82]

In [None]:
train_x.nunique()[82:123]

In [None]:
train_x.nunique()[123:164]

In [None]:
train_x.nunique()[164:]

There are 5 industries where the training data only has 1 option:
* industry_Libraries
* industry_Military
* industry_Package/Freight Delivery
* industry_Shipbuilding
* industry_Wine and Spirits

This means that the models are only learning 1 answer from the training data  for these five columns.


In [None]:
print('industry_Libraries')
print('Train:\n',train_x['industry_Libraries'].value_counts())
print('Val:\n', val_x['industry_Libraries'].value_counts())
print('Test:\n', test_x['industry_Libraries'].value_counts())


In [None]:
print('industry_Military')
print('Train:\n',train_x['industry_Military'].value_counts())
print('Val:\n', val_x['industry_Military'].value_counts())
print('Test:\n', test_x['industry_Military'].value_counts())

In [None]:
print('industry_Package/Freight Delivery')
print('Train:\n',train_x['industry_Package/Freight Delivery'].value_counts())
print('Val:\n', val_x['industry_Package/Freight Delivery'].value_counts())
print('Test:\n', test_x['industry_Package/Freight Delivery'].value_counts())

In [None]:
print('industry_Shipbuilding')
print('Train:\n',train_x['industry_Shipbuilding'].value_counts())
print('Val:\n', val_x['industry_Shipbuilding'].value_counts())
print('Test:\n', test_x['industry_Shipbuilding'].value_counts())

In [None]:
print('industry_Wine and Spirits')
print('Train:\n',train_x['industry_Wine and Spirits'].value_counts())
print('Val:\n', val_x['industry_Wine and Spirits'].value_counts())
print('Test:\n', test_x['industry_Wine and Spirits'].value_counts())

In [None]:
print(val_x.nunique()[:41])
print(val_x.nunique()[41:82])
print(val_x.nunique()[82:123])
print(val_x.nunique()[123:164])
print(val_x.nunique()[164:])

In [None]:
print(test_x.nunique()[:41])
print(test_x.nunique()[41:82])
print(test_x.nunique()[82:123])
print(test_x.nunique()[123:164])
print(test_x.nunique()[164:])

In [None]:
train_x['fraudulent'].value_counts()

In [None]:
sns.countplot(x='fraudulent', data=train_x, palette='hls')
plt.show()

## Reducing the Training Dataset



In [None]:
train_x['fraudulent'].value_counts() #I will keep all 554 fraudulent posts and instead just subsample the real posts using the sample(frac=0.5, random_state=1)

In [None]:
def undersample_data(new_df, col_name, val, fraction, seed):
  undersampled = new_df[new_df[col_name] == val]
  undersampled=undersampled.sample(frac=fraction, random_state=seed)
  # print(undersampled.shape)
  # print(undersampled['fraudulent'].value_counts()) #making sure only real data has been subsampled
  return undersampled

In [None]:
fake_train = train_x[train_x['fraudulent'] == 1]
fake_train.shape

In [None]:
under_train_x = pd.concat([undersampled, fake_train], ignore_index=True) # use pd.concat to combine DataFrames

In [None]:
undersampled = undersample_data(train_x, 'fraudulent', 0, 0.5, 1)
under_train_x = pd.concat([undersampled, fake_train], ignore_index=True) # use pd.concat to combine DataFrames
under_train_x['fraudulent'].value_counts()


In [None]:
under_train_x_labels = under_train_x['fraudulent']

In [None]:
sns.countplot(x='fraudulent', data=under_train_x, palette='hls') #approximately a 90/10 ratio
plt.show()

## Base Model - RNN with Embedding Layer

### Numeric + Text Inputs

#### Building the Model with Numeric and Text Data

In [None]:
#TextVectorization layer:
#turns raw strings into an encoded representation that can be read by an Embedding layer or Dense layer
#TextVectorization(max_tokens, standardize, split, ngrams, output_mode, output_sequence_length, pad_to_max_tokens)

def vectorize(text):
  vectorize_layer = TextVectorization(
      output_mode='int',
      max_tokens=128)
      #output_sequence_length=512 gave 125,764 total vocabulary size

  vectorize_layer.adapt(np.asarray(text)) #builds vocabulary
  return vectorize_layer

In [None]:
text=train_x['full_text'].values
text[:10]

In [None]:
vocab = vectorize(text)
vocab = list(vocab.get_vocabulary())
print(f"Total Length of vocab: {len(vocab)}\nTop 20 words in vocab: {vocab[:20]}\nLast 20 words in vocab: {vocab[-20:]}")

In [None]:
#Embedding Layer:
#Turns positive integers (indexes) into dense vectors of fixed size
# This layer can only be used as the first layer in a model

# tf.keras.layers.Embedding(
#     input_dim,
#     output_dim,
#     embeddings_initializer="uniform",
#     embeddings_regularizer=None,
#     activity_regularizer=None,
#     embeddings_constraint=None,
#     mask_zero=False,
#     input_length=None,
#     **kwargs
# )

In [None]:
train_text = train_x['full_text'].to_numpy()
val_text=val_x['full_text'].to_numpy()
test_text = test_x['full_text'].to_numpy()
undersampled_text = under_train_x['full_text'].to_numpy()

print(train_text.shape, val_text.shape, test_text.shape, undersampled_text.shape)

In [None]:
train_numeric = train_x.drop(['full_text', 'fraudulent'], axis=1).to_numpy()
val_numeric=val_x.drop(['full_text', 'fraudulent'], axis=1).to_numpy()
test_numeric = test_x.drop(['full_text', 'fraudulent'], axis=1).to_numpy()
undersampled_numeric = under_train_x.drop(['full_text', 'fraudulent'], axis=1).to_numpy()

print(train_numeric.shape, val_numeric.shape, test_numeric.shape, undersampled_numeric)

In [None]:
train_labels = train_x['fraudulent'].to_numpy()
val_labels = val_x['fraudulent'].to_numpy()
test_labels = test_x['fraudulent'].to_numpy()
undersampled_labels = under_train_x['fraudulent'].to_numpy()

print(train_labels.shape, val_labels.shape, test_labels.shape, undersampled_labels.shape)

In [None]:
print('Text:',train_text[0])
print('Numeric Variables:\n',train_numeric[0])
print('Label:',train_labels[0])

In [None]:
metrics_list = [
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc')
  ]

def build_rnn_model():
  text_inputs=tf.keras.Input(shape=(1,), dtype=tf.string, name='text_inputs')
  text_outputs = vectorize(text_inputs) #text_outputs.dtype = 'tf.int64'
  i = layers.Embedding(len(vectorize.get_vocabulary()), 128, input_length=128, mask_zero=True)(text_outputs) #changed from input_length=512
  i=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(i)

  numeric_inputs = tf.keras.Input(shape=(203,), name='numeric_inputs')
  j=layers.Dense(128, activation='relu', name='dense_j1',
                kernel_initializer=tf.keras.initializers.he_normal(seed=1))(numeric_inputs)
  j=layers.Dropout(0.1)(j)
  j=layers.Dense(128, activation='relu', name='dense_j2',
                kernel_initializer=tf.keras.initializers.he_normal(seed=1))(j) #j.dtype='float32'

  inputs = keras.layers.concatenate([i, j])
  x = layers.Dense(32, activation='relu', name='layer1')(inputs)
  # x=layers.Dropout(0.2, name='dropout1')(x)
  # x = layers.Dense(32, activation='relu', name='layer2')(x)
  # x=layers.Dropout(0.2, name='dropout2')(x)
  output=layers.Dense(1, activation='sigmoid', name='final_output')(x)

  rnn_model = Model(inputs=[text_inputs, numeric_inputs], outputs=[output])
  return rnn_model


In [None]:

model_rnn = build_rnn_model()
print(model_rnn.summary())

In [None]:
keras.utils.plot_model(model_rnn)

#### Training the model with text + numeric variables

In [None]:
from statistics import mean

In [None]:
model_rnn.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                    metrics=metrics_list)

In [None]:
# Convert all numeric columns in train_numeric and val_numeric to float32
train_numeric = train_x.drop(['full_text', 'fraudulent'], axis=1).astype(np.float32).to_numpy()
val_numeric = val_x.drop(['full_text', 'fraudulent'], axis=1).astype(np.float32).to_numpy()
# In your model.fit call:
history = model_rnn.fit([train_text, train_numeric], train_labels, epochs=5, validation_data=([val_text, val_numeric], val_labels))

In [None]:
history.history

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.figure()
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss, label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
auc = history.history['auc']
val_auc = history.history['val_auc']
epochs = range(1, len(auc) + 1)
plt.figure()
plt.plot(epochs, auc, label='Training AUC')
plt.plot(epochs, val_auc, label='Validation AUC')
plt.title('Training and validation AUC')
plt.legend()
plt.show()

In [None]:
fp = history.history['fp']
val_fp = history.history['val_fp']
epochs = range(1, len(loss) + 1)
plt.figure()
plt.plot(epochs, fp, label='Training False Positives')
plt.plot(epochs, val_fp, label='Validation False Positives')
plt.title('Training and validation False Positives')
plt.legend()
plt.show()

In [None]:
fn = history.history['fn']
val_fn = history.history['val_fn']
epochs = range(1, len(loss) + 1)
plt.figure()
plt.plot(epochs, loss, label='Training False Negatives')
plt.plot(epochs, val_loss, label='Validation False Negatives')
plt.title('Training and validation False Negatives')
plt.legend()
plt.show()

#### Comparing these results with training the model on undersampled training data (validation data remains the same)

In [None]:
model_rnn.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                    metrics=metrics_list)

In [None]:
# Convert all numeric columns in train_numeric and val_numeric to float32
undersampled_numeric = under_train_x.drop(['full_text', 'fraudulent'], axis=1).astype(np.float32).to_numpy()
val_numeric = val_x.drop(['full_text', 'fraudulent'], axis=1).astype(np.float32).to_numpy()
# In your model.fit call:
history = model_rnn.fit([undersampled_text, undersampled_numeric], undersampled_labels, epochs=5, validation_data=([val_text, val_numeric], val_labels))

In [None]:
history.history

In [None]:
# Convert all numeric columns in train_numeric and val_numeric to float32
undersampled_numeric = under_train_x.drop(['full_text', 'fraudulent'], axis=1).astype(np.float32).to_numpy()
val_numeric = val_x.drop(['full_text', 'fraudulent'], axis=1).astype(np.float32).to_numpy()
#undersampled_text and val_text should already be strings, but confirming here
undersampled_text = under_train_x['full_text'].astype(str).to_numpy()
val_text = val_x['full_text'].astype(str).to_numpy()

# In your model.fit call:
history = model_rnn.fit([undersampled_text, undersampled_numeric], undersampled_labels, epochs=5, validation_data=([val_text, val_numeric], val_labels))

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.figure()
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss, label='Validation loss')
plt.title('Training and validation loss on Undersampled Training Data')
plt.legend()
plt.show()

In [None]:
auc = history.history['auc']
val_auc = history.history['val_auc']
epochs = range(1, len(auc) + 1)
plt.figure()
plt.plot(epochs, auc, label='Training AUC')
plt.plot(epochs, val_auc, label='Validation AUC')
plt.title('Training and validation AUC on Undersampled Training Data')
plt.legend()
plt.show()

In [None]:
fp = history.history['fp']
val_fp = history.history['val_fp']
epochs = range(1, len(loss) + 1)
plt.figure()
plt.plot(epochs, fp, label='Training False Positives')
plt.plot(epochs, val_fp, label='Validation False Positives')
plt.title('Training and validation False Positives on Undersampled Training Data')
plt.legend()
plt.show()

In [None]:
fn = history.history['fn']
val_fn = history.history['val_fn']
epochs = range(1, len(loss) + 1)
plt.figure()
plt.plot(epochs, loss, label='Training False Negatives')
plt.plot(epochs, val_loss, label='Validation False Negatives')
plt.title('Training and validation False Negatives on Undersampled Training Data')
plt.legend()
plt.show()

The mean undersampled validation AUC was only barely lower than the mean validation AUC for the entire dataset:
* All Training Data: 0.5054
* Undersampled Training Data: 0.5010

That being said, an AUC of roughly 50% is not good at all.

#### Retraining the entire model for the test set with text + numeric variables
Note: I ran this section initially but due to the low AUC values during the training phase, I know that the text-only data significantly improved the AUC so I will not rerun this section. All code was left but it has been commented out.

In [None]:
# total_train_numeric=np.concatenate((train_numeric,val_numeric))
# total_train_text=np.concatenate((train_text,val_text))
# total_train_labels= np.concatenate((train_labels, val_labels))

# model_rnn2 = build_rnn_model()
# model_rnn2.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
#                     loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#                     metrics=metrics_list)
# print(model_rnn2.summary())

In [None]:
# keras.utils.plot_model(model_rnn2)

In [None]:
# history2 = model_rnn2.fit([total_train_text, total_train_numeric], total_train_labels, epochs=5, validation_data=([test_text, test_numeric], test_labels))

In [None]:
# history2.history

In [None]:
# combo_auc = history2.history['auc']
# test_auc = history2.history['val_auc']
# print(f"Train+Val Average Mean: {mean(combo_auc)}; Test Average Mean: {mean(test_auc)}")

In [None]:
# loss2 = history2.history['loss']
# val_loss2 = history2.history['val_loss']
# epochs2 = range(1, len(loss2) + 1)
# plt.figure()
# plt.plot(epochs2, loss2, label='Training loss')
# plt.plot(epochs2, val_loss2, label='Validation loss')
# plt.title('Train+Validation and Testing loss (Text Only Data)')
# plt.legend()
# plt.show()

In [None]:
# auc2 = history2.history['auc']
# val_auc2 = history2.history['val_auc']
# epochs2 = range(1, len(auc2) + 1)
# plt.figure()
# plt.plot(epochs2, auc2, label='Training AUC')
# plt.plot(epochs2, val_auc2, label='Validation AUC')
# plt.title('Train+Validation and Testing AUC (Text Only Data)')
# plt.legend()
# plt.show()

### Text-Only Inputs

#### Create Text-Only Data \--

In [None]:
df_text_only = df_preproc_done.copy()
df_text_only.drop(columns=['work_remote', 'has_company_logo', 'has_questions', 'salary_low', 'salary_high'],inplace=True)
df_text_only.head(3)
df_text = df_text_only.copy()

In [None]:
df_text.title = df_text['title'] + ' ' + df_text['location']  + ' ' + df_text['description']
df_text.department = df_text['department'] + ' ' + df_text['employment_type'] + ' ' + df_text['required_experience'] + ' ' + df_text['industry'] + ' ' + df_text['function'] + ' ' + df_text['required_education']
df_text.drop(columns=['location', 'description', 'employment_type', 'required_experience', 'industry', 'function', 'required_education'], inplace=True)

In [None]:
#replace 'No Data' with empty space ??
df_text.rename(columns={'title': 'title_loc_description', 'department': 'cat_vars'}, inplace=True)
df_text.head()

In [None]:
#RECREATE TRAIN/VAL/TEST SPLIT WITH TEXT-ONLY DATA

print('dataframe shape:',df_text.shape)
train2, test_t = train_test_split(df_text, train_size=0.8, random_state=1, shuffle=True, stratify=df_text.fraudulent.values)
# print("train shape:",train.shape)

train_t, val_t= train_test_split(train2, train_size = 0.8, random_state=1, shuffle = True, stratify=train2.fraudulent.values)
print("train_t shape:",train_t.shape)
print('val_t shape:',val_t.shape)
print('test_t shape:',test_t.shape)

In [None]:
text_combos = df_text.columns[:-2] #removes full_text and fraudulent columns
text_combos

In [None]:
#title_loc_description,	cat_vars,	company_profile,	requirements,	benefits
train_text1 = train_t['title_loc_description'].to_numpy()
train_text2 = train_t['cat_vars'].to_numpy()
train_text3 = train_t['company_profile'].to_numpy()
train_text4 = train_t['requirements'].to_numpy()
train_text5 = train_t['benefits'].to_numpy()
train_labels = train_t['fraudulent'].to_numpy()

val_text1 = val_t['title_loc_description'].to_numpy()
val_text2 = val_t['cat_vars'].to_numpy()
val_text3 = val_t['company_profile'].to_numpy()
val_text4 = val_t['requirements'].to_numpy()
val_text5 = val_t['benefits'].to_numpy()
val_labels = val_t['fraudulent'].to_numpy()

test_text1 = test_t['title_loc_description'].to_numpy()
test_text2 = test_t['cat_vars'].to_numpy()
test_text3 = test_t['company_profile'].to_numpy()
test_text4 = test_t['requirements'].to_numpy()
test_text5 = test_t['benefits'].to_numpy()
test_labels = test_t['fraudulent'].to_numpy()

#### Building Text-Only RNN Model


In [None]:
text = df_text['title_loc_description'].values
vectorize1=vectorize(text)

text = df_text['cat_vars'].values
vectorize2=vectorize(text)

text = df_text['company_profile'].values
vectorize3=vectorize(text)

text = df_text['requirements'].values
vectorize4=vectorize(text)

text = df_text['benefits'].values
vectorize5=vectorize(text)


In [None]:
def vectorize(text):
  vectorize_layer = TextVectorization(
      output_mode='int',
      max_tokens=128)
  #builds vocabulary
  vectorize_layer.adapt(np.asarray(text))
  return vectorize_layer #return the layer instance, not the result of calling it


#title_loc_description,	cat_vars,	company_profile,	requirements,	benefits
text_data = [df_text['title_loc_description'].values, df_text['cat_vars'].values, df_text['company_profile'].values, df_text['requirements'].values, df_text['benefits'].values]
vectorize1=vectorize(text_data[0])
vectorize2=vectorize(text_data[1])
vectorize3=vectorize(text_data[2])
vectorize4=vectorize(text_data[3])
vectorize5=vectorize(text_data[4])

vects = [vectorize1, vectorize2, vectorize3, vectorize4, vectorize5]
for i, v in enumerate(vects):
  #The adapt call should already be performed within the vectorize function
  #v.adapt(text_data[i]) #Use the appropriate text data for each vectorizer
  vocab = v.get_vocabulary() #v is now a TextVectorization layer
  print(f"Total Length of vocab: {len(vocab)}\nTop 20 words in vocab: {vocab[:20]}\nLast 20 words in vocab: {vocab[-20:]}\n")

check 301

In [None]:
metrics_list = [
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc')
  ]

def build_text_rnn():
  inputs1=tf.keras.Input(shape=(1,), dtype=tf.string, name='inputs1')
  outputs1 = vectorize1(inputs1) #text_outputs.dtype = 'tf.int64'
  a = layers.Embedding(len(vectorize1.get_vocabulary()), 128, input_length=128, mask_zero=True)(outputs1)
  a=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(a)

  inputs2=tf.keras.Input(shape=(1,), dtype=tf.string, name='inputs2')
  outputs2 = vectorize2(inputs2) #text_outputs.dtype = 'tf.int64'
  b = layers.Embedding(len(vectorize2.get_vocabulary()), 128, input_length=128, mask_zero=True)(outputs2)
  b=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(b)

  inputs3=tf.keras.Input(shape=(1,), dtype=tf.string, name='inputs3')
  outputs3 = vectorize3(inputs3) #text_outputs.dtype = 'tf.int64'
  c = layers.Embedding(len(vectorize3.get_vocabulary()), 128, input_length=128, mask_zero=True)(outputs3)
  c=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(c)

  inputs4=tf.keras.Input(shape=(1,), dtype=tf.string, name='inputs4')
  outputs4 = vectorize4(inputs4) #text_outputs.dtype = 'tf.int64'
  d = layers.Embedding(len(vectorize4.get_vocabulary()), 128, input_length=128, mask_zero=True)(outputs4)
  d=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(d)

  inputs5=tf.keras.Input(shape=(1,), dtype=tf.string, name='inputs5')
  outputs5 = vectorize5(inputs5) #text_outputs.dtype = 'tf.int64'
  e = layers.Embedding(len(vectorize5.get_vocabulary()), 128, input_length=128, mask_zero=True)(outputs5)
  e=tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(e)


  inputs = keras.layers.concatenate([a,b,c,d,e])
  x = layers.Dense(32, activation='relu', name='layer1')(inputs)
  # x=layers.Dropout(0.2, name='dropout1')(x)
  # x = layers.Dense(32, activation='relu', name='layer2')(x)
  # x=layers.Dropout(0.2, name='dropout2')(x)
  output=layers.Dense(1, activation='sigmoid', name='final_output')(x)

  rnn_model = Model(inputs=[inputs1, inputs2, inputs3, inputs4, inputs5], outputs=[output])
  return rnn_model

model_rnn = build_text_rnn()
print(model_rnn.summary())
model_rnn.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                    metrics=metrics_list)

In [None]:
keras.utils.plot_model(model_rnn)

#### Training the Text-Only Model

In [None]:
history = model_rnn.fit([train_text1, train_text2,train_text3,train_text4,train_text5], train_labels, epochs=5, validation_data=([val_text1, val_text2,val_text3,val_text4,val_text5], val_labels))

In [None]:
history.history

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.figure()
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss, label='Validation loss')
plt.title('Training and validation loss (Text Only Data)')
plt.legend()
plt.show()

In [None]:
auc = history.history['auc']
val_auc = history.history['val_auc']
epochs = range(1, len(auc) + 1)
plt.figure()
plt.plot(epochs, auc, label='Training AUC')
plt.plot(epochs, val_auc, label='Validation AUC')
plt.title('Training and validation AUC (Text Only Data)')
plt.legend()
plt.show()

In [None]:
fp = history.history['fp']
val_fp = history.history['val_fp']
epochs = range(1, len(fp) + 1)
plt.figure()
plt.plot(epochs, fp, label='Training False Positives')
plt.plot(epochs, val_fp, label='Validation False Positives')
plt.title('Training and validation False Positives (Text Only Data)')
plt.legend()
plt.show()

In [None]:
fn = history.history['fn']
val_fn = history.history['val_fn']
epochs = range(1, len(fn) + 1)
plt.figure()
plt.plot(epochs, fn, label='Training False Negatives')
plt.plot(epochs,  val_fn, label='Validation False Negatives')
plt.title('Training and validation False Negatives (Text Only Data)')
plt.legend()
plt.show()

#### Applying RNN on Test Set (Text-Only)

In [None]:
all_text1=np.concatenate((train_text1, val_text1))
all_text2=np.concatenate((train_text2, val_text2))
all_text3=np.concatenate((train_text3, val_text3))
all_text4=np.concatenate((train_text4, val_text4))
all_text5=np.concatenate((train_text5, val_text5))

all_labels= np.concatenate((train_labels, val_labels))

model_rnn2 = build_text_rnn()
model_rnn2.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                    metrics=metrics_list)
print(model_rnn2.summary())

In [None]:
history2 = model_rnn2.fit([all_text1, all_text2, all_text3, all_text4, all_text5], all_labels, epochs=5, validation_data=([test_text1, test_text2, test_text3, test_text4, test_text5], test_labels))

In [None]:
history2.history

In [None]:
loss2 = history2.history['loss']
val_loss2 = history2.history['val_loss']
epochs2 = range(1, len(loss2) + 1)
plt.figure()
plt.plot(epochs2, loss2, label='Training loss')
plt.plot(epochs2, val_loss2, label='Validation loss')
plt.title('Train+Validation and Testing loss (Text Only Data)')
plt.legend()
plt.show()

In [None]:
auc2 = history2.history['auc']
val_auc2 = history2.history['val_auc']
epochs2 = range(1, len(auc2) + 1)
plt.figure()
plt.plot(epochs2, auc2, label='Training AUC')
plt.plot(epochs2, val_auc2, label='Validation AUC')
plt.title('Train+Validation and Testing AUC (Text Only Data)')
plt.legend()
plt.show()

In [None]:
fp2 = history.history['fp']
val_fp2 = history.history['val_fp']
epochs = range(1, len(fp2) + 1)
plt.figure()
plt.plot(epochs, fp2, label='Training False Positives')
plt.plot(epochs, val_fp2, label='Validation False Positives')
plt.title('Training+Validation / Testing False Positives (Text Only Data)')
plt.legend()
plt.show()

In [None]:
fn2 = history.history['fn']
val_fn2 = history.history['val_fn']
epochs = range(1, len(fn2) + 1)
plt.figure()
plt.plot(epochs, fn2, label='Training False Negatives')
plt.plot(epochs,  val_fn2, label='Validation False Negatives')
plt.title('Training+Validation / Testing False Negatives (Text Only Data)')
plt.legend()
plt.show()

## Model with BERT  

### Installing Dependencies & Exploring BERT \--

In [None]:
#BERT dependencies
!pip install -q tensorflow-text
!pip install -q tf-models-official

In [None]:
import tensorflow_hub as hub
import tensorflow_text as text # A dependency of the preprocessing model
from official.nlp import optimization  # to create AdamW optmizer

tf.get_logger().setLevel('ERROR') #filters out all messages

In [None]:
# classic_bert_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4' #bert_en_uncased_L-12_H-768_A-12
small_bert_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2' ##considered small bert
# small_bert2_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2' #considered tiny bert

preprocess_bert = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3" #same preprocess layer for all bert models that I will use

# print('Classic BERT model selected            :', classic_bert_encoder)
print('Small BERT model selected           :', small_bert_encoder)
# print('Small BERT model #2 selected           :', small_bert2_encoder)
print('Preprocessing model auto-selected      :', preprocess_bert)

In [None]:
text_test1 = [train_text1[0][:100]]

In [None]:
bert_preprocess_model = hub.KerasLayer(preprocess_bert)
text_preprocessed = bert_preprocess_model(text_test1)

print(f'Keys            : {list(text_preprocessed.keys())}')
print(f'Word Ids Shape  : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids        : {text_preprocessed["input_word_ids"][0, :30]}')
print(f'Input Mask Shape: {text_preprocessed["input_mask"].shape}')
print(f'Input Mask      : {text_preprocessed["input_mask"][0, :30]}')
print(f'Type Ids Shape  : {text_preprocessed["input_type_ids"].shape}')
print(f'Type Ids        : {text_preprocessed["input_type_ids"][0, :30]}')

In [None]:
bert_encoders = [small_bert_encoder] #small_bert2_encoder, classic_bert_encoder

for be in bert_encoders:
  bert_model = hub.KerasLayer(be) #The BERT models return a map with 3 important keys: pooled_output, sequence_output, encoder_outputs
  bert_results = bert_model(text_preprocessed)

  print(f'Loaded BERT: {be}')
  print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}') #The shape is [batch_size, H]. You can think of this as an embedding for the entire movie review.
  print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}') #For the fine-tuning you are going to use the pooled_output array
  print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
  print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}\n')

### Defining the Model \--

In [None]:
def make_bert_preprocess_model(sentence_features, seq_length=128):
  """Returns Model mapping string features to BERT inputs.

  Args:
    sentence_features: a list with the names of string-valued features.
    seq_length: an integer that defines the sequence length of BERT inputs.

  Returns:
    A Keras Model that can be called on a list or dict of string Tensors
    (with the order or names, resp., given by sentence_features) and
    returns a dict of tensors for input to BERT.
  """

  input_segments = [tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft) for ft in sentence_features]

  # Tokenize the text to word pieces.
  bert_preprocess = hub.load(preprocess_bert)
  tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
  segments = [tokenizer(s) for s in input_segments]

  # Optional: Trim segments in a smart way to fit seq_length.
  # Simple cases (like this example) can skip this step and let
  # the next step apply a default truncation to approximately equal lengths.
  truncated_segments = segments

  # Pack inputs. The details (start/end token ids, dict of output tensors)
  # are model-dependent, so this gets loaded from the SavedModel.
  packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs,
                          arguments=dict(seq_length=seq_length),
                          name='packer')
  model_inputs = packer(truncated_segments)
  return tf.keras.Model(input_segments, model_inputs)

In [None]:
#PREPROCESSING FOR BERT - TRAIN DATA
preproc_model1 = make_bert_preprocess_model(['title_loc_description'])
text = [train_text1]
preproc1 = preproc_model1(text)

preproc_model2 = make_bert_preprocess_model(['cat_vars'])
text = [train_text2]
preproc2 = preproc_model2(text)

preproc_model3 = make_bert_preprocess_model(['company_profile'])
text = [train_text3]
preproc3 = preproc_model3(text)

preproc_model4 = make_bert_preprocess_model(['requirements'])
text = [train_text4]
preproc4 = preproc_model4(text)

preproc_model5 = make_bert_preprocess_model(['benefits'])
text = [train_text5]
preproc5 = preproc_model5(text)

In [None]:
print('preproc1 - training')
print('Keys           : ', list(preproc1.keys()))
print('Shape Word Ids : ', preproc1['input_word_ids'].shape)
print('Word Ids       : ', preproc1['input_word_ids'][0, :20])
print('Shape Mask     : ', preproc1['input_mask'].shape)
print('Input Mask     : ', preproc1['input_mask'][0, :20])
print('Shape Type Ids : ', preproc1['input_type_ids'].shape)
print('Type Ids       : ', preproc1['input_type_ids'][0, :20])

# print()
# print('preproc2 - training')
# print('Keys           : ', list(preproc2.keys()))
# print('Shape Word Ids : ', preproc2['input_word_ids'].shape)
# print('Word Ids       : ', preproc2['input_word_ids'][0, :16])
# print('Shape Mask     : ', preproc2['input_mask'].shape)
# print('Input Mask     : ', preproc2['input_mask'][0, :16])
# print('Shape Type Ids : ', preproc2['input_type_ids'].shape)
# print('Type Ids       : ', preproc2['input_type_ids'][0, :16])

# print()
# print('preproc3 - training')
# print('Keys           : ', list(preproc3.keys()))
# print('Shape Word Ids : ', preproc3['input_word_ids'].shape)
# print('Word Ids       : ', preproc3['input_word_ids'][0, :16])
# print('Shape Mask     : ', preproc3['input_mask'].shape)
# print('Input Mask     : ', preproc3['input_mask'][0, :16])
# print('Shape Type Ids : ', preproc3['input_type_ids'].shape)
# print('Type Ids       : ', preproc3['input_type_ids'][0, :16])

# print()
# print('preproc4 - training')
# print('Keys           : ', list(preproc4.keys()))
# print('Shape Word Ids : ', preproc4['input_word_ids'].shape)
# print('Word Ids       : ', preproc4['input_word_ids'][0, :16])
# print('Shape Mask     : ', preproc4['input_mask'].shape)
# print('Input Mask     : ', preproc4['input_mask'][0, :16])
# print('Shape Type Ids : ', preproc4['input_type_ids'].shape)
# print('Type Ids       : ', preproc4['input_type_ids'][0, :16])

# print()
# print('preproc5 - training')
# print('Keys           : ', list(preproc5.keys()))
# print('Shape Word Ids : ', preproc5['input_word_ids'].shape)
# print('Word Ids       : ', preproc5['input_word_ids'][0, :16])
# print('Shape Mask     : ', preproc5['input_mask'].shape)
# print('Input Mask     : ', preproc5['input_mask'][0, :16])
# print('Shape Type Ids : ', preproc5['input_type_ids'].shape)
# print('Type Ids       : ', preproc5['input_type_ids'][0, :16])

In [None]:
tf.keras.utils.plot_model(preproc_model1)

In [None]:
tf.keras.utils.plot_model(preproc_model2)

In [None]:
tf.keras.utils.plot_model(preproc_model3)

In [None]:
tf.keras.utils.plot_model(preproc_model4)

In [None]:
tf.keras.utils.plot_model(preproc_model5)

In [None]:
#PREPROCESSING FOR BERT - VAL DATA
val_texts=[val_text1, val_text2, val_text3, val_text4, val_text5]
val_preproc1 = preproc_model1([val_texts[0]]) #title_loc_description
val_preproc2 = preproc_model2([val_texts[1]]) #cat_vars
val_preproc3 = preproc_model3([val_texts[2]]) #company_profile
val_preproc4 = preproc_model4([val_texts[3]]) #requirements
val_preproc5 = preproc_model5([val_texts[4]]) #benefits

#PREPROCESSING FOR BERT - TEST DATA
test_texts=[test_text1, test_text2, test_text3, test_text4, test_text5]
test_preproc1 = preproc_model1([test_texts[0]]) #title_loc_description
test_preproc2 = preproc_model2([test_texts[1]]) #cat_vars
test_preproc3 = preproc_model3([test_texts[2]]) #company_profile
test_preproc4 = preproc_model4([test_texts[3]]) #requirements
test_preproc5 = preproc_model5([test_texts[4]]) #benefits

In [None]:
def build_bert_model(encoder):
  inputs1 = dict(
        input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids1'),
        input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask1'),
        input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids1')
    )
  encoder1 = hub.KerasLayer(encoder, trainable=True, name='BERT_encoder1')
  net1=encoder1(inputs1)['pooled_output']

  inputs2 = dict(
        input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids2'),
        input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask2'),
        input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids2')
    )
  encoder2 = hub.KerasLayer(encoder, trainable=True, name='BERT_encoder2')
  net2=encoder2(inputs2)['pooled_output']

  inputs3 = dict(
        input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids3'),
        input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask3'),
        input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids3')
    )
  encoder3 = hub.KerasLayer(encoder, trainable=True, name='BERT_encoder3')
  net3=encoder3(inputs3)['pooled_output']

  inputs4 = dict(
        input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids4'),
        input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask4'),
        input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids4')
    )
  encoder4 = hub.KerasLayer(encoder, trainable=True, name='BERT_encoder4')
  net4=encoder4(inputs4)['pooled_output']

  inputs5 = dict(
        input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_word_ids5'),
        input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_mask5'),
        input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_type_ids5')
    )
  encoder5 = hub.KerasLayer(encoder, trainable=True, name='BERT_encoder5')
  net5=encoder5(inputs5)['pooled_output']

  inputs = tf.keras.layers.concatenate([net1, net2, net3, net4, net5])
  x = tf.keras.layers.Dropout(0.1)(inputs)
  output = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(x)
  return tf.keras.Model(inputs=[inputs1, inputs2, inputs3, inputs4, inputs5], outputs=[output])

In [None]:
small_bert_model = build_bert_model(small_bert_encoder)
print(small_bert_model.summary())

In [None]:
tf.keras.utils.plot_model(small_bert_model)

### Fine-Tuning BERT Model \--

In [None]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

metrics_list = [
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.AUC(name='auc')
  ]

metrics = metrics_list

In [None]:
epochs = 1
steps_per_epoch = len(train_t)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
print(steps_per_epoch)
print(num_train_steps)
print(num_warmup_steps)

### Small BERT model (small_bert/bert_en_uncased_L-4_H-512_A-4)

In [None]:
small_bert_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
#training model with small bert
history = small_bert_model.fit([preproc1, preproc2, preproc3, preproc4, preproc5], train_labels, epochs=1, validation_data=([val_preproc1, val_preproc2, val_preproc3, val_preproc4, val_preproc5], val_labels))

In [None]:
history.history

### Classic BERT model (bert_en_uncased_L-12_H-768_A-12) --> CANNOT RUN ON COLAB CPU

In [None]:
# bert_model = build_bert_model(classic_bert_encoder)
# print(bert_model.summary())

In [None]:
# bert_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# CANNOT RUN -- BERT MODEL IS TOO LARGE FOR COLAB
# history = bert_model.fit([preproc1, preproc2, preproc3, preproc4, preproc5], train_labels, epochs=1, validation_data=([val_preproc1, val_preproc2, val_preproc3, val_preproc4, val_preproc5], val_labels))

In [None]:
# history.history

### Applying the Small-BERT model to the testing set

In [None]:
small_bert_model.evaluate([test_preproc1, test_preproc2, test_preproc3, test_preproc4, test_preproc5], test_labels)