In [106]:
import numpy as np
import pandas as pd
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix


# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): this seeds NumPy only - confirm whether the Keras backend in
# use needs its own seed for the training runs below to be repeatable.
np.random.seed(7)


from lxml import etree
from lxml import html
import os
from collections import OrderedDict
# Move the working directory up to the project root ('Labor_Discrimination')
# so that the relative paths used by later cells resolve regardless of which
# sub-directory the notebook was started from.
#
# The search is done on a path string first and the process only changes
# directory once the root has been found. Blindly calling os.chdir('..') never
# terminates when the project folder is not an ancestor, because '..' at the
# filesystem root is the root itself.
_search_dir = os.getcwd()
while os.path.basename(_search_dir) != 'Labor_Discrimination':
    _parent_dir = os.path.dirname(_search_dir)
    if _parent_dir == _search_dir:
        # Reached the filesystem root without finding the project folder.
        raise RuntimeError(
            "Could not find a 'Labor_Discrimination' directory above {!r}".format(os.getcwd())
        )
    _search_dir = _parent_dir
os.chdir(_search_dir)

## Formulating our Representation

In [2]:
# Folder with the job-ad pages to parse and folder receiving the extracted
# text; both are resolved against the current working directory.
job_ads_dir, out_dir = (
    os.path.join(os.getcwd(), folder) for folder in ('Job-Ads', 'Job-Text')
)
# Single lxml HTML parser instance, reused for every ad that gets parsed.
parser = etree.HTMLParser()

In [4]:
# Label table, indexed by the `clid` column. Later cells look rows up by the
# integer ad id and read the `indigenous_discrimination`,
# `reverse_discrimination` and `discrimination` columns from it.
df = pd.read_csv('labor_discrim_clid_label.csv', index_col='clid')

In [23]:
# Extract the posting body of every job-ad page in `job_ads_dir` and store it
# as plain text in `out_dir`, one file per ad, named after the source file
# without its extension.
#
# NOTE(review): the text is written with the platform's default encoding; the
# cell that reads these files back has to use the same one.

# Without the output folder every single write below would fail.
os.makedirs(out_dir, exist_ok=True)

for file in os.listdir(job_ads_dir):
    html_doc = os.path.join(job_ads_dir, file)
    try:
        tree = etree.parse(html_doc, parser)
        # All text nodes underneath the element with id="postingbody".
        body_text = tree.xpath('//*[@id="postingbody"]//text()')
        body_text_str = ' '.join(body_text)
        # Remove the boilerplate caption that appears inside the posting body.
        body_text_str = body_text_str.replace('QR Code Link to This Post', '')
        body_text_clean = body_text_str.rstrip().replace("\n", "")
        file_name, ext = os.path.splitext(file)
        out_loc = os.path.join(out_dir, file_name)
        with open(out_loc, 'w') as f:
            f.write(body_text_clean)
    except Exception as e:
        # Best effort: one unreadable ad must not stop the export, but the
        # message has to say which ad was skipped so it can be investigated.
        print('{}: {}'.format(file, e))

In [9]:
# Collect the extracted text of every ad that has a row in the label table
# `df`, keyed by the ad id taken from the file name (kept as a string), in
# directory-listing order.
texts = OrderedDict()
for file in os.listdir(out_dir):
    path = os.path.join(out_dir, file)
    clid, ext = os.path.splitext(file)
    try:
        ad_id = int(clid)
    except ValueError:
        # Stray entry whose name is not an ad id (hidden/system files,
        # checkpoint folders, ...): skip it instead of aborting the cell.
        continue
    # Check membership before touching the file so unlabeled ads are not read.
    if ad_id not in df.index or not os.path.isfile(path):
        continue
    with open(path, 'r') as f:
        texts[clid] = f.read()

# Keep only the 1000 most frequent words when converting text to sequences.
# `num_words` is the Keras 2 name of the argument formerly called `nb_words`.
corpus = list(texts.values())
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
# `word_index` covers every word seen, not just the 1000 kept above.
word_index = tokenizer.word_index
print("Found {} unique tokens".format(len(word_index)))
# Pad / truncate every ad to exactly 200 token ids.
data = pad_sequences(sequences, maxlen=200)



Found 21120 unique tokens


In [49]:
# Gather the three label columns for each ad, in the same order as `texts`,
# so that position i in every list belongs to the i-th ad.
labels_indig, labels_reverse, labels_all = [], [], []
for key in texts:
    # `texts` is keyed by strings while `df` is looked up by integer ad id.
    label_row = df.loc[int(key)]
    labels_indig.append(label_row['indigenous_discrimination'])
    labels_reverse.append(label_row['reverse_discrimination'])
    labels_all.append(label_row['discrimination'])

In [50]:
# Sanity check: the token matrix and the label lists must describe the same
# ads. The label shape is taken from `labels_all`; the name `labels` that was
# printed here before is not defined anywhere in this notebook and raises a
# NameError on a fresh kernel.
print(data.shape)
print(np.asarray(labels_all).shape)
assert len(texts) == len(data) == len(labels_all), "texts, tokens and labels are out of sync"

# Map each ad id to its row of padded token ids; `data` rows are in the same
# order as the keys of `texts`.
word_tokens = OrderedDict(zip(texts.keys(), data))

(4623, 200)
(4623,)


In [76]:
# orient='index' turns every dict key (ad id) into a row and every element of
# its token array into a column.
df_tokens = pd.DataFrame.from_dict(word_tokens, orient='index')

In [77]:
# Append the three label columns after the token columns; the label lists were
# built in the same ad order as the rows of `df_tokens`.
for column_name, column_values in (
    ('indigenous_discrimination', labels_indig),
    ('reverse_discrimination', labels_reverse),
    ('discrimination', labels_all),
):
    df_tokens[column_name] = column_values

In [79]:
# Persist tokens and labels, keeping the ad id index as the first CSV column.
# NOTE(review): the file name appears to encode the tokenizer settings
# (1000 words, padding to 200) - keep it in sync if those change.
df_tokens.to_csv('tokens_1000num_200pad.csv', index=True)

### Load the DataFrame of Tokenized Words

In [84]:
# Reload the token/label table from disk, using the first CSV column as the
# row index.
df_tokens = pd.read_csv('tokens_1000num_200pad.csv', index_col=0)

In [93]:
# Features: every column except the last three (the label columns).
# Target: the `indigenous_discrimination` label.
# `.values` is used instead of `.as_matrix()`, which was deprecated and later
# removed from pandas; both return the underlying NumPy array.
X = df_tokens.iloc[:, :-3].values
y = df_tokens['indigenous_discrimination'].values

In [94]:
# Stratified 67/33 train/test split. Two splits are generated but only the
# final one is kept, so it is selected directly rather than by overwriting the
# variables inside a loop.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.33, random_state=0)
train_index, test_index = list(sss.split(X, y))[-1]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

### Preparing Word Embeddings

In [10]:
# Parse the pre-trained GloVe file into {word: coefficient vector}.
#
# embeddings_index_float  - successfully parsed vectors.
# embeddings_index_broken - lines that could not be parsed, mapped to their
#                           raw (string) fields for inspection.
GLOVE_DIM = 300  # vector length, as stated by the '300d' in the file name

embeddings_index_float = {}
embeddings_index_broken = {}
glove_model = os.path.join(os.getcwd(), 'Glove', 'glove.840B.300d.txt')

# `with` guarantees the (very large) file is closed even if parsing fails.
with open(glove_model, encoding='utf-8') as f:
    for line in f:
        # Split from the right so that the last GLOVE_DIM fields are always the
        # coefficients and everything before them is the token. A plain
        # whitespace split raised ValueError on some lines - presumably tokens
        # that themselves contain spaces.
        values = line.rstrip().rsplit(' ', GLOVE_DIM)
        if len(values) < 2:
            continue  # blank or truncated line: no token/vector pair to store
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype=float)
        except ValueError:
            # Record the raw fields of this line; never reuse `coefs` here, it
            # would still hold the previous word's vector (or be undefined).
            embeddings_index_broken[word] = values[1:]
            continue
        if coefs.shape[0] != GLOVE_DIM:
            embeddings_index_broken[word] = values[1:]
            continue
        embeddings_index_float[word] = coefs

In [24]:
embeddings_matrix = np.zeros((len(word_index)+1, 32 ))
for word, i in word_index.items():
    embedding_vector = embeddings_index_float.get(word)
    if embedding_vector is not None:
        embeddings_index_float[i] = embedding_vector

In [27]:
from keras.layers import Embedding

In [30]:
embedding_layer = Embedding(len(word_index)+1, 32, weights=[embeddings_matrix],
                            input_length=200, trainable=False)

In [61]:
# Binary classifier: embedding -> 1-D convolution -> max pooling -> LSTM ->
# single sigmoid unit, trained with binary cross-entropy and Adam.
model = Sequential([
    embedding_layer,
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [122]:
# Train for 3 epochs in batches of 64. `class_weight` weights samples of
# class 1 ten times as heavily as samples of class 0 in the loss.
# NOTE(review): the 1:10 ratio looks hand-picked - confirm against the actual
# class balance of y_train.
model.fit(X_train, y_train, epochs=3, batch_size=64, verbose=1, class_weight={0: 1, 1: 10})

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7efe88514ef0>

In [123]:
# Evaluate on the held-out split. The model was compiled with one loss and the
# 'accuracy' metric, so `scores` should hold [loss, accuracy].
# NOTE(review): `scores` is never displayed in the cells shown here.
scores=model.evaluate(X_test, y_test, verbose=1)



In [124]:
# Raw outputs of the final sigmoid unit for every test sample.
predictions=model.predict(X_test)

In [130]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class predictions.
y_pred = (predictions > 0.5).astype(int)

In [132]:
# Rows are true classes, columns are predicted classes. In the recorded output
# below the second column is all zeros: no test sample was predicted as
# class 1, so none of the 75 positive samples were detected.
confusion_matrix(y_test, y_pred)

array([[1451,    0],
       [  75,    0]])

In [115]:
# Smallest predicted value. The built-in min() iterates over the rows of
# `predictions`, which is why the result is shown as a one-element array.
# NOTE(review): this cell's execution count (115) is lower than that of the
# training cell (122), so the recorded value may come from an earlier model.
min(predictions)

array([ 0.05461046], dtype=float32)