## Importing libraries 

In [2]:
import numpy as np 
import pandas as pd 
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping


Using TensorFlow backend.


## Read CSV file

In [3]:
data = pd.read_csv('jobs.csv')
data.head()

Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor - co...,IT
1,senior technical support engineer,IT
2,head of it services,IT
3,js front end engineer,IT
4,network and telephony controller,IT


## Checking the numbers of each class of industry 

In [4]:
# Counting industry.
data.groupby('industry').count()

Unnamed: 0_level_0,job title
industry,Unnamed: 1_level_1
Accountancy,374
Education,1435
IT,4746
Marketing,2031


# Balancing the dataset
### After checking the dataset, it seems like it's not balanced between the four classes. 
### so I undersample all classes to the least one (374 sample per each industry)

In [5]:
num_of_categories = 374

shuffled = data.reindex(np.random.permutation(data.index))

Accountancy = shuffled[shuffled['industry'] == 'Accountancy'][:num_of_categories]

Education = shuffled[shuffled['industry'] == 'Education'][:num_of_categories]

IT = shuffled[shuffled['industry'] == 'IT'][:num_of_categories]

Marketing = shuffled[shuffled['industry'] == 'Marketing'][:num_of_categories]

concated = pd.concat([Accountancy,Education,IT,Marketing], ignore_index=True)

#Shuffle the dataset
concated = concated.reindex(np.random.permutation(concated.index))

In [6]:
concated

Unnamed: 0,job title,industry
304,trainee accountant assistant,Accountancy
1052,head of business centre,IT
1266,sales and marketing assistant,Marketing
1264,marketing assistant,Marketing
585,trainee mfl teacher,Education
641,ks2 teaching assistant,Education
1379,pr executive,Marketing
658,math teacher,Education
1029,support analyst,IT
1133,statistical marketing analyst,Marketing


# One hot encode the four classes 
## Created a new column (Label) to store the values of one hot encoded classes

In [7]:
concated.loc[concated['industry'] == 'Accountancy', 'LABEL'] = 0
concated.loc[concated['industry'] == 'Education', 'LABEL'] = 1
concated.loc[concated['industry'] == 'IT', 'LABEL'] = 2
concated.loc[concated['industry'] == 'Marketing', 'LABEL'] = 3
print(concated['LABEL'][:10])
labels = to_categorical(concated['LABEL'], num_classes=4)
print(labels[:10])
concated.drop(['industry'], axis=1)
concated    

304     0.0
1052    2.0
1266    3.0
1264    3.0
585     1.0
641     1.0
1379    3.0
658     1.0
1029    2.0
1133    3.0
Name: LABEL, dtype: float64
[[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


Unnamed: 0,job title,industry,LABEL
304,trainee accountant assistant,Accountancy,0.0
1052,head of business centre,IT,2.0
1266,sales and marketing assistant,Marketing,3.0
1264,marketing assistant,Marketing,3.0
585,trainee mfl teacher,Education,1.0
641,ks2 teaching assistant,Education,1.0
1379,pr executive,Marketing,3.0
658,math teacher,Education,1.0
1029,support analyst,IT,2.0
1133,statistical marketing analyst,Marketing,3.0


# Coverting text to numbers  

In [8]:
n_most_common_words = 500
max_len = 7
tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(concated['job title'].values)
sequences = tokenizer.texts_to_sequences(concated['job title'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = pad_sequences(sequences, maxlen=max_len)
print(X)

Found 1059 unique tokens.
[[  0   0   0 ...  13  12   1]
 [  0   0   0 ...  41  23 174]
 [  0   0   0 ...  22   2   1]
 ...
 [  0   0   0 ... 103  47   1]
 [ 40  32 187 ... 154  67 197]
 [  0   0   0 ...  10  46  70]]


# Spliting dataset to train and test

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X , labels, test_size=0.2, random_state=42)

# Setting Hyperparameters 

In [10]:
epochs = 55
emb_dim = 128
batch_size = 100
labels[:4]

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

# Building the NN

In [11]:
print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))

model = Sequential()
model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

((1196, 7), (1196, 4), (300, 7), (300, 4))
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 7, 128)            64000     
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 7, 128)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 260       
Total params: 113,668
Trainable params: 113,668
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf

# Testing accuracy

In [12]:
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.254
  Accuracy: 0.920


# Prediect job titles by putting the job title between " " in txt

In [None]:
txt = ["play"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)
label = ['Accountancy', 'Education', 'IT', 'Marketing']
print(pred, label[np.argmax(pred)])