In [1]:
import pandas as pd
from numpy import asarray
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf

In [2]:
def get_keras_model(input_dim=512, num_classes=6):
    """Build, compile and summarise the feed-forward question classifier.

    Architecture, in order:
    ``Dense(128, relu)`` -> ``Dropout(0.1)`` -> ``Dense(64, relu)`` with an
    L1 kernel penalty and an L2 activity penalty (both 0.01) ->
    ``Dense(num_classes, softmax)``.

    The model is compiled with the Adam optimizer, the
    ``sparse_categorical_crossentropy`` loss and an ``accuracy`` metric.

    Args:
        input_dim: Length of each input feature vector. Defaults to 512,
            the value that used to be hard-coded.
        num_classes: Number of units in the softmax output layer. Defaults
            to 6, the value that used to be hard-coded.

    Returns:
        The compiled ``Sequential`` model. ``model.summary()`` is printed as
        a side effect.

    Raises:
        ValueError: If ``input_dim`` or ``num_classes`` is smaller than 1.
    """
    # Fail early with a readable message instead of a deep Keras shape error.
    if input_dim < 1:
        raise ValueError(f"input_dim must be >= 1, got {input_dim}")
    if num_classes < 1:
        raise ValueError(f"num_classes must be >= 1, got {num_classes}")

    model = Sequential()
    model.add(Dense(128, input_shape=[input_dim], activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(
        64,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L1(0.01),
        activity_regularizer=tf.keras.regularizers.L2(0.01),
    ))
    # One probability per class; paired with the sparse cross-entropy loss
    # below, so targets are class ids rather than one-hot vectors.
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [3]:
# Read only the two columns the classifier needs: the question text and
# its numeric type label.
DATA_PATH = "wikidata.csv"
data = pd.read_csv(DATA_PATH, usecols=["questions", "types"])

# The label column is the classification target.
categories = data["types"]

# Quick look at the first rows to confirm the file parsed as expected.
print(data.head())

                                           questions  types
0  what is the maximum age of stomach cancer pati...      1
1  get me the average number of stomach cancer de...      5
2      how many times is the fuel propulsion is cng?      3
3               how many times is the model ge40lfr?      3
4  how many times is the fleet series (quantity) ...      3


In [4]:
# Hold out a test set from the questions and their labels. The shuffle is
# pinned with random_state so the split (and every metric computed from it)
# is identical after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    data["questions"],
    categories,
    shuffle=True,
    random_state=42,
)

In [5]:
import tensorflow_hub as hub

In [7]:
# Load the Universal Sentence Encoder (version 4) module from TF Hub.
USE_MODULE_URL = 'https://tfhub.dev/google/universal-sentence-encoder/4'
embed = hub.load(USE_MODULE_URL)

In [8]:
def get_embeddings(x):
    """Embed ``x`` with the module-level ``embed`` model.

    Args:
        x: Input handed unchanged to ``embed``; this notebook passes a list
            of question strings.

    Returns:
        The encoder output converted with ``numpy.asarray``.
    """
    return asarray(embed(x))


In [9]:
# Turn the question text of each split into sentence embeddings. Both
# splits go through the same Series -> list -> get_embeddings path.
train_encodings, test_encodings = (
    get_embeddings(questions.tolist()) for questions in (x_train, x_test)
)

In [10]:
# Convert the label containers of both splits to float32 NumPy arrays.
# NOTE(review): the classifier ends in a 6-unit softmax trained with
# sparse_categorical_crossentropy, which presumably requires labels in 0..5 --
# confirm the "types" column never contains the value 6.
y_train, y_test = (
    asarray(labels, dtype="float32") for labels in (y_train, y_test)
)


In [11]:
# Build the classifier and print the feature-matrix shape as a sanity check
# before training.
model = get_keras_model()
print(train_encodings.shape)

# Train for up to 50 epochs, with 20% of the training rows held out by Keras
# for validation. Left as the cell's last expression so the History object
# is displayed.
model.fit(
    x=train_encodings,
    y=y_train,
    epochs=50,
    validation_split=0.2,
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               65664     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 6)                 390       
                                                                 
Total params: 74,310
Trainable params: 74,310
Non-trainable params: 0
_________________________________________________________________
(16243, 512)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


<keras.callbacks.History at 0x281aae6bcd0>

In [12]:
# Persist the trained network to an .h5 file.
model.save("Question_Classifier.h5")

# Evaluate on the held-out test split. The model was compiled with a single
# 'accuracy' metric, so evaluate() yields the loss (bound to `score`)
# followed by the accuracy (bound to `acc`).
score, acc = model.evaluate(x=test_encodings, y=y_test)

