## read in data

<i>format as train and test sets</i>

In [23]:
import pandas as pd
import numpy as np
import sklearn as sk
import keras

In [24]:
# Load the training CSV, forcing the listed columns to generic object dtype.
train = pd.read_csv(
    'train_small.csv',
    dtype=dict.fromkeys(("from_2", "type", "category"), object),
)

In [25]:
# Keep only training rows that have a 'type' value; the frame is filtered in place.
# NOTE(review): the labels used later come from 'category', not 'type' -- confirm
# that 'category' has no missing values after this filter.
train.dropna(axis=0, subset=['type'], inplace=True)

In [26]:
# Load the second CSV, which the notebook treats as the test frame.
test = pd.read_csv(filepath_or_buffer='labeled_output.csv')

In [27]:
# Remove, in place, every test row that has no 'text' value.
test.dropna(axis=0, subset=['text'], inplace=True)

In [28]:
# Drop the 'text_type' column from the test frame.
# `columns=` already selects the axis, so the redundant `axis=1` is removed.
# Rebinding with errors='ignore' keeps the cell re-runnable: the in-place
# version raised KeyError on a second execution because the column was gone.
test = test.drop(columns=['text_type'], errors='ignore')

In [29]:
# Add a 'category' column to the test frame, filled entirely with NaN.
test["category"] = float("nan")

In [30]:
# Features are the 'text' column; targets are the 'category' column.
X, y = train['text'], train['category']

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

## prepare data

<i>label encode and one-hot encode categories</i>

In [32]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder once, on the full label column, so every category maps to a
# single stable integer, then apply that same mapping to both splits.
# Refitting on y_test (the previous behaviour) could assign different integers
# to the same category whenever the two splits did not contain an identical
# set of classes, silently corrupting the evaluation labels.
le = LabelEncoder()
le.fit(y)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [33]:
# Expand the integer class ids of both splits into one-hot rows with
# `num_classes` columns each.
num_classes = 6
y_train, y_test = [
    keras.utils.to_categorical(encoded, num_classes)
    for encoded in (y_train, y_test)
]

<i>use sklearn's TF-IDF vectorizer to encode the text as word-level n-grams (bigrams and trigrams)</i>

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over word bigrams and trigrams; accents are stripped and any run of
# one or more word characters counts as a token.
tf_vec = TfidfVectorizer(
    ngram_range=(2,3),
    token_pattern=r'\w{1,}',
    strip_accents='unicode',
    analyzer='word',
    min_df=1,
)

# The vocabulary is learned from the training texts only and then reused,
# unchanged, to project the held-out texts.
x_train_vec = tf_vec.fit_transform(x_train)
x_test_vec = tf_vec.transform(x_test)

# Show both matrix shapes, one per line.
print(x_train_vec.shape, x_test_vec.shape, sep='\n')

(207, 2277)
(52, 2277)


## hyperparameter tuning

In [35]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn.model_selection import RandomizedSearchCV as RS
from scipy import stats
from scipy.stats import randint
from scipy.stats import uniform

In [36]:
def create_model(neurons=1, hidden_layers=1, input_dim=2277, num_classes=6):
    """Build and compile a fully connected softmax classifier.

    The network is: a 16-unit ReLU input layer, `hidden_layers` ReLU layers
    of `neurons` units each, and a `num_classes`-unit softmax output. It is
    compiled with categorical cross-entropy, the Adam optimizer and an
    accuracy metric.

    Parameters
    ----------
    neurons : int
        Number of units in each hidden layer.
    hidden_layers : int or float
        Number of hidden layers. Fractional values are rounded up, which is
        the count the previous `np.arange(hidden_layers)` loop produced.
    input_dim : int
        Width of one input row. Defaults to 2277, the value that was
        previously hard-coded.
    num_classes : int
        Width of the softmax output. Defaults to 6, the value that was
        previously hard-coded.

    Returns
    -------
    A compiled keras `Sequential` model.
    """
    # Make the layer count an explicit non-negative integer instead of
    # relying on np.arange's handling of float stops.
    n_hidden = max(int(np.ceil(hidden_layers)), 0)

    model = Sequential()
    model.add(Dense(16, activation='relu', input_shape=(input_dim,)))
    for _ in range(n_hidden):
        model.add(Dense(neurons, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

In [37]:
# Wrap the model factory in the scikit-learn compatible classifier so the
# search utilities below can drive it.
model = KerasClassifier(
    build_fn=create_model,
    shuffle=True,
    verbose=0,
)

In [38]:
# Search space for the randomized search. Both hyperparameters are counts, so
# both are drawn from discrete integer distributions. `hidden_layers` used to
# come from the continuous stats.uniform(1, 10), which yields floats in
# [1, 11] rather than whole layer counts.
params = {'neurons': stats.randint(2, 128),        # integers 2..127
          'hidden_layers': stats.randint(1, 11),   # integers 1..10
          }
# Number of parameter settings sampled by the search.
n_iter = 20

In [39]:
# Randomized search over `params` with 4-fold cross-validation on all cores.
# A fixed random_state makes the sampled candidates the same on every run;
# without it each execution explored a different set of settings.
grid = RS(
    estimator=model,
    param_distributions=params,
    n_iter=n_iter,
    cv=4,
    n_jobs=-1,
    random_state=2,
)

In [None]:
# Run the search on the vectorized training texts and their one-hot labels.
grid_result = grid.fit(X=x_train_vec, y=y_train)

In [None]:
# Report the best cross-validated score and the settings that achieved it.
print('best score:{} \n best params:{}'.format(grid_result.best_score_,
                                              grid_result.best_params_))

from keras.wrappers.scikit_learn import KerasClassifier

# Tune the training schedule (batch size and number of epochs) with an
# exhaustive grid search over a freshly wrapped create_model.
# NOTE(review): this wrapper relies on create_model's default arguments, not
# on the settings found by the randomized search -- confirm that is intended.
model = KerasClassifier(verbose=0, build_fn=create_model)

from sklearn.model_selection import GridSearchCV

# Candidate values: powers of two for both knobs.
batch_size = [4, 8, 16, 32, 64, 128, 256]
epochs = [4, 8, 16, 32, 64, 128]

param_grid = {'batch_size': batch_size, 'epochs': epochs}

grid = GridSearchCV(param_grid=param_grid, estimator=model, n_jobs=-1)

# Trailing semicolon suppresses the estimator repr in the notebook output.
grid_pred = grid.fit(X=x_train_vec, y=y_train);

# Best score first, winning parameter combination second, one per line.
print(grid_pred.best_score_, grid_pred.best_params_, sep='\n')

# Refit the wrapped classifier with the batch size and epoch count selected
# by the grid search (grid_pred.best_params_ holds exactly those two keys).
# The previous `batch_size=?, epochs=?` placeholders were a syntax error.
model.fit(x=x_train_vec, y=y_train, **grid_pred.best_params_)
#test_loss, test_acc = model.evaluate(x_test_vec, y_test)
#test_acc

def create_model_2(optimizer='Adam', input_dim=2277, num_classes=6,
                   dropout_rate=0.5):
    """Build and compile a five-hidden-layer classifier with dropout.

    Each of the five hidden layers is a 32-unit ReLU Dense layer followed by
    a Dropout layer; the output is a `num_classes`-unit softmax. The model
    is compiled with categorical cross-entropy and an accuracy metric.

    Fixes relative to the previous version:
    * `layers` / `models` were never imported, so the call raised NameError.
    * The Dropout layers were created but never connected: each Dense layer
      read from the previous Dense output, and two of the dropouts were
      attached to the wrong tensor. They are now chained into the graph.
    * The input width was 2163, which does not match the 2277-column
      TF-IDF matrix this notebook trains on; it is now a parameter that
      defaults to 2277.

    Parameters
    ----------
    optimizer : str or keras optimizer
        Optimizer handed to `compile`.
    input_dim : int
        Width of one input row.
    num_classes : int
        Width of the softmax output.
    dropout_rate : float
        Fraction of units dropped after every hidden layer.

    Returns
    -------
    A compiled keras functional `Model`.
    """
    # Imported here because the notebook's import cells do not bring these
    # names into scope.
    from keras import layers, models

    inputs = layers.Input(shape=(input_dim,))
    tensor = inputs
    for _ in range(5):
        tensor = layers.Dense(units=32, activation='relu')(tensor)
        tensor = layers.Dropout(dropout_rate)(tensor)
    outputs = layers.Dense(num_classes, activation='softmax')(tensor)

    model2 = models.Model(inputs=inputs, outputs=outputs)
    model2.compile(loss='categorical_crossentropy', metrics=['accuracy'],
                   optimizer=optimizer)
    return model2

from keras.wrappers.scikit_learn import KerasClassifier

# Compare optimizers for create_model_2 at a fixed training schedule
# (batch size 32, 64 epochs) using an exhaustive grid search.
model = KerasClassifier(
    build_fn=create_model_2,
    epochs=64,
    batch_size=32,
    verbose=0,
)

from sklearn.model_selection import GridSearchCV

# Optimizer names to try, passed through to create_model_2's `optimizer`.
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = {'optimizer': optimizer}

grid = GridSearchCV(param_grid=param_grid, estimator=model, n_jobs=-1)

# Trailing semicolon suppresses the estimator repr in the notebook output.
grid_pred = grid.fit(X=x_train_vec, y=y_train);

# Best score first, winning optimizer second, one per line.
print(grid_pred.best_score_, grid_pred.best_params_, sep='\n')