## read in data

<i>format as test and train

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import keras

Using TensorFlow backend.


In [2]:
train = pd.read_csv('train_bigger.csv', dtype={"from_2": object, "type": object, "category": object})

In [3]:
train.dropna(subset=['category'], inplace=True)

In [4]:
test = pd.read_csv('labeled_output.csv')

In [5]:
test.dropna(subset=['text'], inplace=True)

In [6]:
test.drop(columns=['text_type'], axis=1, inplace=True)

In [7]:
test['category']=np.nan

In [8]:
X = train['text']
y = train['category']

In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state = 2, test_size=0.2)

## prepare data

<i> label encode and one-hot encode categories

In [10]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_train=le.fit_transform(y_train)
y_test=le.fit_transform(y_test)

In [11]:
num_classes=6
y_train = keras.utils.to_categorical(y_train,num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

<i> use sklearn's vectorizer to format text data at word level

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_vec = TfidfVectorizer(analyzer='word',
                         min_df=2,
                         strip_accents='unicode',
                         token_pattern=r'\w{1,}',
                         ngram_range=(2,3)
                         )
x_train_vec = tf_vec.fit_transform(x_train)
x_test_vec = tf_vec.transform(x_test)
print (x_train_vec.shape)
print (x_test_vec.shape)

(483, 673)
(121, 673)


## hyperparameter training

In [13]:
# Utility function to report best scores- from kaggle:https://www.kaggle.com/ksjpswaroop/parameter-tuning-rf-randomized-search
def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [14]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn.model_selection import RandomizedSearchCV as RS
from scipy import stats
from scipy.stats import randint
from scipy.stats import uniform

In [15]:
from keras.constraints import maxnorm
from keras.layers import Dropout

def create_model_2(dropout_rate=0.0, weight_constraint=0, hidden_layers=1, neurons=1):
    model2 = Sequential()
    model2.add(Dense(neurons, activation='relu', input_shape=(673,)))
    for i in np.arange(hidden_layers):
        model2.add(Dense(neurons, activation='relu'))
        model2.add(Dropout(dropout_rate))
    model2.add(Dense(6, activation='softmax'))
    model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model2

In [16]:
model2 = KerasClassifier(build_fn=create_model_2, verbose=0, shuffle=True)

In [17]:
params = {'neurons': stats.randint(2, 52),
         'hidden_layers': stats.randint(1, 5),
         'dropout_rate': stats.uniform(0.0, 0.9),
         'weight_constraint': stats.uniform(1, 5)
         }
n_iter = 64

In [18]:
rand2 = RS(estimator=model2, param_distributions=params, n_jobs=-1, cv=4, n_iter=n_iter)

In [19]:
rand_search2 = rand2.fit(x_train_vec, y_train)

In [20]:
report(rand_search2.cv_results_)

Model with rank: 1
Mean validation score: 0.547 (std: 0.069)
Parameters: {'dropout_rate': 0.17448185249263476, 'hidden_layers': 1, 'neurons': 47, 'weight_constraint': 4.91121310218517}

Model with rank: 2
Mean validation score: 0.536 (std: 0.072)
Parameters: {'dropout_rate': 0.3721020759554231, 'hidden_layers': 2, 'neurons': 19, 'weight_constraint': 5.643586364015265}

Model with rank: 3
Mean validation score: 0.522 (std: 0.084)
Parameters: {'dropout_rate': 0.24349017520726524, 'hidden_layers': 1, 'neurons': 33, 'weight_constraint': 2.928270933187144}

Model with rank: 4
Mean validation score: 0.520 (std: 0.068)
Parameters: {'dropout_rate': 0.08400763927433745, 'hidden_layers': 2, 'neurons': 27, 'weight_constraint': 4.428423941867987}

Model with rank: 5
Mean validation score: 0.520 (std: 0.078)
Parameters: {'dropout_rate': 0.24344114994887692, 'hidden_layers': 1, 'neurons': 50, 'weight_constraint': 1.8740943874140523}



from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=create_model, verbose=0)

from sklearn.model_selection import GridSearchCV

batch_size= [4,8,16,32,64,128,256]
epochs=[4,8,16,32,64,128]

param_grid = dict(batch_size=batch_size, epochs=epochs)

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)

grid_pred = grid.fit(x_train_vec, y_train);

print(grid_pred.best_score_)
print(grid_pred.best_params_)

model.fit(x=x_train_vec, y=y_train, batch_size=?, epochs=?)
#test_loss, test_acc = model.evaluate(x_test_vec, y_test)
#test_acc

def create_model_2(optimizer='Adam'):
    inputs = layers.Input(shape=(2163,))
    hidden_1 = layers.Dense(units=32, activation='relu')(inputs)
    dropout1 = layers.Dropout(0.5)(hidden_1)
    hidden_2 = layers.Dense(units=32, activation='relu')(hidden_1)
    dropout_2 = layers.Dropout(0.5)(hidden_2)
    hidden_3 = layers.Dense(units=32, activation='relu')(hidden_2)
    dropout_3 = layers.Dropout(0.5)(hidden_3)
    hidden_4 = layers.Dense(units=32, activation='relu')(hidden_3)
    dropout_4 = layers.Dropout(0.5)(hidden_3)
    hidden_5 = layers.Dense(units=32, activation='relu')(hidden_4)
    dropout_5 = layers.Dropout(0.5)(hidden_4)
    outputs = layers.Dense(6, activation='softmax')(hidden_5)

    model2= models.Model(inputs=inputs, outputs=outputs)
    #model.summary()
    model2.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=optimizer)
    return model2

from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=create_model_2, verbose=0, batch_size=32, epochs=64)

from sklearn.model_selection import GridSearchCV

optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)

grid_pred = grid.fit(x_train_vec, y_train);

print(grid_pred.best_score_)
print(grid_pred.best_params_)