In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, LSTM, Flatten
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy

In [2]:
# Freeze the randomness
from numpy.random import seed
seed(1337)
import tensorflow as tf
tf.random.set_seed(42)

In [3]:
globalStrategy = "retrain"

In [4]:
def scoring(data, predict):
    f1 = []
    jaccard = []
    threshold = []
    for i in np.arange(0.30, 0.99, 0.01):
        predict_ = np.where(predict >= i, 1, 0)
        f1_micro = metrics.f1_score(data, predict_, average = 'micro')
        jaccard_micro = metrics.jaccard_score(data, predict_, average = 'micro')
        f1.append(f1_micro)
        jaccard.append(jaccard_micro)
        threshold.append(i)
    
    results = pd.DataFrame()
    results['Threshold'] = threshold
    results['F1_micro'] = f1
    results['Jaccard_micro'] = jaccard
    results = results[results['F1_micro'] == results['F1_micro'].max()]
    return results

In [5]:
data = pd.read_csv('data_cleaned.csv',
                   converters={"preprocessedTags": lambda x: x.strip("[]").replace("'","").split(", ")})
#data = pd.read_csv('data_cleaned2.csv',
#                   converters={"preprocessedTags": lambda x: x.strip("[]").replace("'","").split(", ")})

#data = data[['desc', 'unstemmed_desc', 'preprocessedTags', 'Tag1']].dropna()
data

Unnamed: 0,Id,desc,unstemmed_desc,preprocessedTags,Tag1
0,591865,vs cakephp vs zend vs cakephp vs zend cakephp ...,vs cakephp vs zend vs cakephp vs zend cakephp ...,[php],php
1,591892,tool generat mock data tool generat mock data ...,tools generating mock data tools generating mo...,[testing],testing
2,41441462,laravel use statement non name cach effect lar...,laravel use statement non name cache effect la...,"[php, laravel]",php
3,9552725,add client authent add client authent server r...,add client authentication add client authentic...,[java],java
4,33014984,variabl error variabl error system namespac cl...,variable error variable error system namespace...,[c#],c#
...,...,...,...,...,...
47050,18363295,jqueri event child jqueri event child anchor w...,jquery event child jquery event child anchor w...,"[javascript, jquery]",javascript
47051,18363308,best way remov word best way remov word strong...,best way remove words best way remove words st...,"[c#, winforms]",c#
47052,8625692,array output array output includ main 3 b 6 pr...,array output array output include main 3 b 6 p...,[c],c
47053,19723361,ok pointer invalid locat use ok pointer invali...,ok pointer invalid location use ok pointer inv...,[c],c


In [6]:
data = data[data.groupby('Tag1').Tag1.transform(len) > 200]

In [7]:
data = data.groupby('Tag1')
data = data.apply(lambda x: x.sample(data.size().min())).reset_index(drop=True)

In [8]:
data

Unnamed: 0,Id,desc,unstemmed_desc,preprocessedTags,Tag1
0,2214914,net api googl talk net api googl talk look net...,net api google talk net api google talk lookin...,"[.net, api, open-source]",.net
1,371851,websit web app architectur advic websit web ap...,website web app architecture advice website we...,"[.net, html, css]",.net
2,2184884,xna game tutori xna game tutori code want lear...,xna game tutorial xna game tutorial coding wan...,[.net],.net
3,3516203,script languag net base ide script languag net...,scripting language net based ide scripting lan...,[.net],.net
4,166744,best linux distribut run mono best linux distr...,best linux distribution running mono best linu...,"[.net, linux]",.net
...,...,...,...,...,...
6261,25429904,file file creat filesystem f destin anoth e so...,file file created filesystem f destination ano...,[windows],windows
6262,33901173,remov certain charact certain file remov certa...,removing certain characters certain file remov...,"[windows, batch-file, command-line]",windows
6263,35541334,oracl mac virtualbox window 10 work oracl mac ...,oracle mac virtualbox windows 10 working oracl...,"[windows, oracle, macos]",windows
6264,8412792,smallest partit smallest partit need creat sma...,smallest partition smallest partition need cre...,"[windows, linux]",windows


In [9]:
x = data['desc']
x_unstemmed = data['unstemmed_desc']
y = data['preprocessedTags']
# y_tag = data['Tag1']
mb = MultiLabelBinarizer()
y_encoded = mb.fit_transform(y)
# y_encoded = mb.fit_transform(y_tag)

In [10]:
# x_train, x_test split
x_train, x_test, y_train, y_test = train_test_split(x_unstemmed,
                                                    y_encoded,
                                                    test_size=0.2,
                                                    random_state=42)

# Tokenizer + Bag of Words

In [11]:
from tensorflow.keras.preprocessing import text

max_length = 500

tokenizer_t = text.Tokenizer(num_words=max_length)
tokenizer_t.fit_on_texts(x_train)

bag_of_words_train = tokenizer_t.texts_to_matrix(x_train)
bag_of_words_test = tokenizer_t.texts_to_matrix(x_test)

In [12]:
# Define the model
xInput = Input(shape=(max_length))
x_ = Dense(500, activation='relu')(xInput)
x_ = Dense(500, activation='relu')(x_)
output = Dense(len(y_encoded[0]), activation='sigmoid')(x_)

model_toke = Model(inputs=xInput, outputs=output, name='Toke')

# Compile the model
model_toke.compile(loss=CategoricalCrossentropy(from_logits=True, label_smoothing=0.2),
                   optimizer=Adam(learning_rate=0.000001),
                   metrics=[CategoricalAccuracy('accuracy')])

In [13]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5,
                                         mode='max',
                                         restore_best_weights=True)

In [14]:
model_toke.summary()

Model: "Toke"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 500)]             0         
_________________________________________________________________
dense (Dense)                (None, 500)               250500    
_________________________________________________________________
dense_1 (Dense)              (None, 500)               250500    
_________________________________________________________________
dense_2 (Dense)              (None, 205)               102705    
Total params: 603,705
Trainable params: 603,705
Non-trainable params: 0
_________________________________________________________________


In [15]:
epochs = 200
batch_size = 64

In [16]:
# Load the model, if does not exist then train one
if globalStrategy == 'retrain' or globalStrategy == 'retrainToke':
    epochs = epochs
    batch_size=batch_size
    history = model_toke.fit(bag_of_words_train, y_train,
                             epochs=epochs,
                             validation_split=0.1,
                             callbacks=[callback],
                             verbose=1)

    model_toke.save('/home/mlmaster/Code/Ing_ml_P7/SimpleTokenizer/')
    model_toke = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/SimpleTokenizer/')

else:
    try:
        model_toke = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/SimpleTokenizer/')
    except OSError:
        epochs = epochs
        batch_size = batch_size
        history = model_toke.fit(bag_of_words_train, y_train,
                                 epochs=epochs,
                                 validation_split=0.1,
                                 callbacks=[callback],
                                 verbose=1)

        model_toke.save('/home/mlmaster/Code/Ing_ml_P7/SimpleTokenizer/')
        model_toke = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/SimpleTokenizer/')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200


Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
INFO:tensorflow:Assets written to: /home/mlmaster/Code/Ing_ml_P7/SimpleTokenizer/assets


In [17]:
model_toke.evaluate(bag_of_words_test, y_test, verbose=1)



[7.610741138458252, 0.13795852661132812]

In [18]:
predict = model_toke.predict(bag_of_words_test)
#predict_ = np.where(predict > 0.98, 1, 0)
scoring(y_test, predict)

Unnamed: 0,Threshold,F1_micro,Jaccard_micro
51,0.81,0.106168,0.05606


In [19]:
predict[0]

array([0.74313533, 0.3654136 , 0.5684666 , 0.5244302 , 0.65976286,
       0.41830975, 0.437737  , 0.36241424, 0.5013205 , 0.4975043 ,
       0.52363724, 0.39109153, 0.40014058, 0.612706  , 0.7281524 ,
       0.5659703 , 0.42216212, 0.36713135, 0.74554884, 0.45231068,
       0.35666895, 0.7056698 , 0.6885657 , 0.70913434, 0.36504552,
       0.553618  , 0.5204188 , 0.54793274, 0.51010895, 0.42141083,
       0.45259464, 0.48266152, 0.45075354, 0.47270927, 0.74714434,
       0.46310806, 0.4334555 , 0.5502271 , 0.537752  , 0.536212  ,
       0.5222036 , 0.5166796 , 0.45106396, 0.47247455, 0.44078785,
       0.40066326, 0.44969013, 0.5308471 , 0.4176167 , 0.5237626 ,
       0.441598  , 0.4753249 , 0.3969262 , 0.38052827, 0.40066883,
       0.42699608, 0.43161687, 0.53866893, 0.54916024, 0.4401371 ,
       0.3632746 , 0.4749255 , 0.4986868 , 0.44291022, 0.40933096,
       0.5051384 , 0.52244663, 0.4206973 , 0.7399657 , 0.5653693 ,
       0.38986912, 0.39922598, 0.4645747 , 0.7615176 , 0.42001

# GPT-2 Tokenizer

In [20]:
# Load tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

# set token for padding
tokenizer.pad_token = 0

# Load model
model = TFOpenAIGPTModel.from_pretrained('openai-gpt')

All model checkpoint layers were used when initializing TFOpenAIGPTModel.

All the layers of TFOpenAIGPTModel were initialized from the model checkpoint at openai-gpt.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFOpenAIGPTModel for predictions without further training.


In [21]:
# Ready output data for the model
max_length=250

x_train_toke = tokenizer(text=x_train.to_list(),
                         add_special_tokens=True,
                         max_length=max_length,
                         truncation=True,
                         padding=True, 
                         return_tensors='tf',
                         return_token_type_ids=False,
                         return_attention_mask=False,
                         verbose=True)

x_test_toke = tokenizer(text=x_test.to_list(),
                        add_special_tokens=True,
                        max_length=max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='tf',
                        return_token_type_ids=False,
                        return_attention_mask=False,
                        verbose=True)

x_train_toke = x_train_toke['input_ids']
x_test_toke = x_test_toke['input_ids']

In [30]:
# Define the model
xInput = Input(shape=(max_length))
x_ = Dense(500, activation='relu')(xInput)
x_ = Dense(500, activation='relu')(x_)
output = Dense(len(y_encoded[0]), activation='sigmoid')(x_)

model_toke = Model(inputs=xInput, outputs=output, name='Toke')

# Compile the model
model_toke.compile(loss=CategoricalCrossentropy(from_logits=True, label_smoothing=0.1),
                   optimizer=Adam(learning_rate=0.0000001),
                   metrics=[CategoricalAccuracy('accuracy')])

In [31]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5,
                                         mode='max',
                                         restore_best_weights=True)

In [32]:
model_toke.summary()

Model: "Toke"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 250)]             0         
_________________________________________________________________
dense_6 (Dense)              (None, 500)               125500    
_________________________________________________________________
dense_7 (Dense)              (None, 500)               250500    
_________________________________________________________________
dense_8 (Dense)              (None, 205)               102705    
Total params: 478,705
Trainable params: 478,705
Non-trainable params: 0
_________________________________________________________________


In [33]:
epochs = 200
batch_size = 64

In [34]:
# Load the model, if does not exist then train one
if globalStrategy == 'retrain' or globalStrategy == 'retrainToke':
    epochs = epochs
    batch_size=batch_size
    history = model_toke.fit(x_train_toke, y_train,
                             epochs=epochs,
                             validation_split=0.1,
                             callbacks=[callback],
                             verbose=1)

    model_toke.save('/home/mlmaster/Code/Ing_ml_P7/GPTTokenizer/')
    model_toke = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/GPTTokenizer/')

else:
    try:
        model_toke = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/GPTTokenizer/')
    except OSError:
        epochs = epochs
        batch_size = batch_size
        history = model_toke.fit(x_train_toke, y_train,
                                 epochs=epochs,
                                 validation_split=0.1,
                                 callbacks=[callback],
                                 verbose=1)

        model_toke.save('/home/mlmaster/Code/Ing_ml_P7/GPTTokenizer/')
        model_toke = tf.keras.models.load_model('/home/mlmaster/Code/Ing_ml_P7/GPTTokenizer/')

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
INFO:tensorflow:Assets written to: /home/mlmaster/Code/Ing_ml_P7/GPTTokenizer/assets


In [35]:
model_toke.evaluate(x_test_toke, y_test, verbose=1)



[11520.4560546875, 0.007177033461630344]

In [36]:
predict = model_toke.predict(x_test_toke)
#predict_ = np.where(predict > 0.98, 1, 0)
scoring(y_test, predict)

Unnamed: 0,Threshold,F1_micro,Jaccard_micro
67,0.97,0.01942,0.009805


In [37]:
predict[0]

array([1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 0.0000000e+00, 1.0000000e+00, 0.0000000e