In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import keras

Using TensorFlow backend.


In [2]:
labeled = pd.read_csv('train_bigger.csv', dtype={"from_2": object, "type": object, "category": object})

In [3]:
labeled.dropna(subset=['category'], inplace=True)

In [4]:
unlabeled = pd.read_csv('labeled_output.csv')

In [5]:
unlabeled.dropna(subset=['text'], inplace=True)

In [6]:
unlabeled.drop(columns=['text_type'], axis=1, inplace=True)

In [7]:
labeled['text_2']=labeled['from_2'].astype(str) + " " + labeled['text'].astype(str)
unlabeled['text_2']=unlabeled['from_2'].astype(str) + " " + unlabeled['text'].astype(str)

In [8]:
print("Categories:", np.unique(labeled['category']))
print("Number of unique words:", len(np.unique(np.hstack(labeled['text']))))

length = [len(i) for i in labeled['text']]
print("Average length:", np.mean(length))
print("max length:", np.max(length))
print("Standard Deviation:", round(np.std(length)))

Categories: ['conv' 'end' 'mcq' 'non mcq' 'setup' 'start']
Number of unique words: 492
Average length: 45.246688741721854
max length: 821
Standard Deviation: 74.0


In [9]:
labels=labeled['category']
unlabeled['category']=np.nan
val_labels=unlabeled['category']

In [10]:
docs=labeled['text_2']
val_docs=unlabeled['text_2']

In [11]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
labels=le.fit_transform(labels)

In [12]:
print(labels.shape, val_labels.shape)

(604,) (1084520,)


In [13]:
# num_classes=6
# labels = keras.utils.to_categorical(labels,num_classes)
# val_labels = keras.utils.to_categorical(val_labels, num_classes)

In [14]:
from keras.preprocessing import sequence
from keras.preprocessing import text

tokenizer = text.Tokenizer(num_words=492, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',)
tokenizer.fit_on_texts(docs)

In [15]:
docs = tokenizer.texts_to_sequences(docs)
val_docs = tokenizer.texts_to_sequences(val_docs)

In [16]:
docs = sequence.pad_sequences(docs, maxlen=821)
val_docs = sequence.pad_sequences(val_docs, maxlen=821)

In [17]:
docs.shape

(604, 821)

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(docs, labels, random_state =42, test_size=0.2)

In [19]:
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import Embedding
# from keras.layers import Flatten

# model=Sequential()
# model.add(Embedding(input_dim=492,
#                     output_dim=128,
#                     input_length=821))
# model.add(Flatten())
# model.add(Dense(604, activation='relu'))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(6, activation='softmax'))
# model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam',metrics=['acc'] )

In [20]:
# model.fit(x_train, y_train, epochs=20, verbose=0)

In [21]:
#loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

In [22]:
#print(f'test loss:{loss} \n test accuracy:{accuracy}')
f'test loss:0.7352383644127649' 
f'test accuracy:0.77685950216183'

'test accuracy:0.77685950216183'

In [23]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV as RS
from scipy import stats
from keras.constraints import maxnorm

def create_model(dropout_rate=0.0, weight_constraint=0, hidden_layers=1, neurons=1, input_neurons=1, dropout_layers=1, embedding=1):
    model=Sequential()
    model.add(Embedding(input_dim=492, 
                       output_dim=embedding, 
                       input_length=821))
    model.add(Flatten())
    model.add(Dense(input_neurons, activation='relu'))
    for i in range(hidden_layers, dropout_layers):
        model.add(Dense(neurons, activation='relu', kernel_constraint=maxnorm(weight_constraint)))
        model.add(Dropout(dropout_rate))
    model.add(Dense(6, activation='softmax'))
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['acc'])
    return model

In [24]:
params={'input_neurons': stats.randint(1,128),
        'neurons': stats.randint(1,128),
        'hidden_layers': stats.randint(1,16),
        'dropout_layers': stats.randint(1,16),
        'dropout_rate': stats.uniform(0,0.9),
        'weight_constraint': stats.uniform(1,5),
        'embedding': stats.randint(1,256)
       }
n_iter=128

In [25]:
#model = KerasClassifier(build_fn=create_model, verbose=0, shuffle=True)

In [26]:
#rand = RS(estimator=model, param_distributions=params, n_jobs=-1, cv=4, n_iter=n_iter)

In [27]:
#rand_search = rand.fit(x_train, y_train)

In [28]:
# Utility function to report best scores- from kaggle:https://www.kaggle.com/ksjpswaroop/parameter-tuning-rf-randomized-search
def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [30]:
#report(rand_search.cv_results_)

In [31]:
def create_model_2(dropout_rate=0.216, weight_constraint=1.1489, hidden_layers=11, neurons=64, input_neurons=98, dropout_layers=5, embedding=168, batch_size=1, epochs=1):
    model2=Sequential()
    model2.add(Embedding(input_dim=492, 
                       output_dim=embedding, 
                       input_length=821))
    model2.add(Flatten())
    model2.add(Dense(input_neurons, activation='relu'))
    for i in range(hidden_layers, dropout_layers):
        model2.add(Dense(neurons, activation='relu', kernel_constraint=maxnorm(weight_constraint)))
        model2.add(Dropout(dropout_rate))
    model2.add(Dense(6, activation='softmax'))
    
    model2.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['acc'])
    return model2

In [32]:
params2={'batch_size': stats.randint(1,128),
         'epochs': stats.randint(1,64)}
n_iter=128

In [33]:
model2 = KerasClassifier(build_fn=create_model_2, verbose=0, shuffle=True)

In [34]:
rand2 = RS(estimator=model2, param_distributions=params2, n_jobs=-1, cv=4, n_iter=n_iter)

In [None]:
rand_search2 = rand2.fit(x_train, y_train)

In [None]:
report(rand_search2.cv_results_)

In [None]:
def create_model_3(dropout_rate=0.0747, weight_constraint=5.926, hidden_layers=15, neurons=1, input_neurons=31, dropout_layers=12, embedding=19):
    model3=Sequential()
    model3.add(Embedding(input_dim=492, 
                       output_dim=embedding, 
                       input_length=821))
    model3.add(Flatten())
    model3.add(Dense(input_neurons, activation='relu'))
    for i in range(hidden_layers, dropout_layers):
        model3.add(Dense(neurons, activation='relu', kernel_constraint=maxnorm(weight_constraint)))
        model3.add(Dropout(dropout_rate))
    model3.add(Dense(6, activation='softmax'))
    
    model3.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['acc'])
    return model3

In [None]:
model3 = KerasClassifier(build_fn=create_model_3, verbose=0, shuffle=True)

In [None]:
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

In [None]:
history = model.fit(
            x_train,
            y_train,
            batch_size=54, 
            epochs=62,
            callbacks=callbacks,
            validation_data=(x_test, y_test),
            verbose=2,  # Logs once per epoch.
            )

In [None]:
history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(acc=history['val_acc'][-1], loss=history['val_loss'][-1]))