## read in data

<i>format as test and train

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import keras

Using TensorFlow backend.


In [2]:
train = pd.read_csv('train_bigger.csv', dtype={"from_2": object, "type": object, "category": object})

In [3]:
train.dropna(subset=['category'], inplace=True)

In [4]:
test = pd.read_csv('labeled_output.csv')

In [5]:
test.dropna(subset=['text'], inplace=True)

In [6]:
test.drop(columns=['text_type'], axis=1, inplace=True)

In [7]:
test['category']=np.nan

In [8]:
train['text_2']=train['from_2'].astype(str) + " " + train['text'].astype(str)

In [9]:
train.head()

Unnamed: 0,channel_type,channel_name,id,date,from,text,from_2,type,category,text_2
0,personal_chat,TZ155 Nontobeko Mthembu,226915,2018-03-22T17:59:29,Setup (Master),Sawubona Jessie 😊. ;;We can see that you are e...,setup,noncontent,setup,setup Sawubona Jessie 😊. ;;We can see that you...
1,private_group,19:00 (10/07) Thabiso (2)🏆3️⃣,376217,2018-04-03T19:01:47,ThishaBot,Any questions?,bot,noncontent,conv,bot Any questions?
2,private_group,20:00 (11/07) Irfaan (2)🏆3️⃣,873757,2018-05-15T20:52:52,Irfaan Moolla,Today,student,noncontent,conv,student Today
3,private_group,13:00 (10/07) Fatima (3)🏆3️⃣,1368973,2018-07-02T19:05:13,TZ Simphiwe Mfaba,Kulungile 💪,tutor,noncontent,conv,tutor Kulungile 💪
4,private_group,20:00 (09/07) Jenna (1)🏆4⃣,1234119,2018-06-18T20:12:31,ThishaBot,"Surprisingly, the patient seems satisfied with...",bot,content,non mcq,"bot Surprisingly, the patient seems satisfied ..."


In [10]:
X = train['text_2']
y = train['category']

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state =12, test_size=0.2)

## prepare data

<i> label encode and one-hot encode categories

In [12]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_train=le.fit_transform(y_train)
y_test=le.fit_transform(y_test)

In [14]:
print(y_train.shape, y_test.shape)

(483,) (121,)


In [12]:
# num_classes=6
# y_train = keras.utils.to_categorical(y_train,num_classes)
# y_test = keras.utils.to_categorical(y_test, num_classes)

<i> use sklearn's vectorizer to format text data at word level

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_vec = TfidfVectorizer(analyzer='word',
                         min_df=2,
                         strip_accents='unicode',
                         token_pattern=r'\w{1,}',
                         ngram_range=(2,3)
                         )
x_train_vec = tf_vec.fit_transform(x_train)
x_test_vec = tf_vec.transform(x_test)
print (x_train_vec.shape)
print (x_test_vec.shape)

(483, 789)
(121, 789)


## hyperparameter training

In [14]:
# Utility function to report best scores- from kaggle:https://www.kaggle.com/ksjpswaroop/parameter-tuning-rf-randomized-search
def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [15]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn.model_selection import RandomizedSearchCV as RS
from scipy import stats
from scipy.stats import randint
from scipy.stats import uniform

In [21]:
from keras.constraints import maxnorm
from keras.layers import Dropout

def create_model_2(dropout_rate=0.0, weight_constraint=0, hidden_layers=1, neurons=1, input_neurons=1, dropout_layers=1):
    model2 = Sequential()
    model2.add(Dense(input_neurons, activation='relu', input_shape=(789,)))
    for i in np.arange(hidden_layers, dropout_layers):
        model2.add(Dense(neurons, activation='relu', kernel_constraint=maxnorm(weight_constraint)))
        model2.add(Dropout(dropout_rate))
    model2.add(Dense(6, activation='softmax'))
    model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model2

In [22]:
model2 = KerasClassifier(build_fn=create_model_2, verbose=0, shuffle=True)

In [23]:
params = {'input_neurons': stats.randint(2,256),
         'neurons': stats.randint(2, 64),
         'hidden_layers': stats.randint(1, 5),
          'dropout_layers': stats.randint(1,5),
         'dropout_rate': stats.uniform(0.0, 0.9),
         'weight_constraint': stats.uniform(1, 5)
         }
n_iter = 64

In [24]:
rand2 = RS(estimator=model2, param_distributions=params, n_jobs=-1, cv=4, n_iter=n_iter)

In [25]:
rand_search2 = rand2.fit(x_train_vec, y_train)

In [26]:
report(rand_search2.cv_results_)

Model with rank: 1
Mean validation score: 0.609 (std: 0.034)
Parameters: {'dropout_layers': 1, 'dropout_rate': 0.08782427604822979, 'hidden_layers': 4, 'input_neurons': 177, 'neurons': 6, 'weight_constraint': 2.786347683448474}

Model with rank: 2
Mean validation score: 0.596 (std: 0.008)
Parameters: {'dropout_layers': 4, 'dropout_rate': 0.0320401140603887, 'hidden_layers': 4, 'input_neurons': 197, 'neurons': 21, 'weight_constraint': 5.131840432374133}

Model with rank: 3
Mean validation score: 0.592 (std: 0.045)
Parameters: {'dropout_layers': 4, 'dropout_rate': 0.26203805833644134, 'hidden_layers': 4, 'input_neurons': 173, 'neurons': 42, 'weight_constraint': 4.372603740524596}

Model with rank: 4
Mean validation score: 0.592 (std: 0.022)
Parameters: {'dropout_layers': 1, 'dropout_rate': 0.2900175512061454, 'hidden_layers': 4, 'input_neurons': 171, 'neurons': 36, 'weight_constraint': 5.613096815127511}

Model with rank: 5
Mean validation score: 0.582 (std: 0.008)
Parameters: {'dropout_

In [None]:
# hidden_layers=1
# input_neurons=28
# neurons=47
# weight_constraint=1
# dropout_rate=0.3357

# def create_model_3(dropout_rate=0.3357, weight_constraint=1, neurons=47, input_neurons=28,hidden_layers=1):
#     model3 = Sequential()
#     model3.add(Dense(input_neurons, activation='relu', input_shape=(685,)))
#     for i in np.arange():
#         model3.add(Dense(neurons, activation='relu', kernel_constraint=maxnorm(weight_constraint)))
#         model3.add(Dropout(dropout_rate))
#     model3.add(Dense(6, activation='softmax'))
#     model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model3

In [None]:
model3 = KerasClassifier(build_fn=create_model_3, verbose=0, shuffle=True)

In [None]:
params={'batch_size': stats.randint(1,256),
        'epochs': stats.randint(1,256)}
n_iter = 64

In [None]:
rand3 = RS(estimator=model3, param_distributions=params, n_jobs=-1, cv=4, n_iter=n_iter)

In [None]:
rand_search3 = rand3.fit(x_train_vec, y_train)

In [None]:
report(rand_search3.cv_results_)

from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=create_model, verbose=0)

from sklearn.model_selection import GridSearchCV

batch_size= [4,8,16,32,64,128,256]
epochs=[4,8,16,32,64,128]

param_grid = dict(batch_size=batch_size, epochs=epochs)

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)

grid_pred = grid.fit(x_train_vec, y_train);

print(grid_pred.best_score_)
print(grid_pred.best_params_)

model.fit(x=x_train_vec, y=y_train, batch_size=?, epochs=?)
#test_loss, test_acc = model.evaluate(x_test_vec, y_test)
#test_acc

def create_model_2(optimizer='Adam'):
    inputs = layers.Input(shape=(2163,))
    hidden_1 = layers.Dense(units=32, activation='relu')(inputs)
    dropout1 = layers.Dropout(0.5)(hidden_1)
    hidden_2 = layers.Dense(units=32, activation='relu')(hidden_1)
    dropout_2 = layers.Dropout(0.5)(hidden_2)
    hidden_3 = layers.Dense(units=32, activation='relu')(hidden_2)
    dropout_3 = layers.Dropout(0.5)(hidden_3)
    hidden_4 = layers.Dense(units=32, activation='relu')(hidden_3)
    dropout_4 = layers.Dropout(0.5)(hidden_3)
    hidden_5 = layers.Dense(units=32, activation='relu')(hidden_4)
    dropout_5 = layers.Dropout(0.5)(hidden_4)
    outputs = layers.Dense(6, activation='softmax')(hidden_5)

    model2= models.Model(inputs=inputs, outputs=outputs)
    #model.summary()
    model2.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=optimizer)
    return model2

from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=create_model_2, verbose=0, batch_size=32, epochs=64)

from sklearn.model_selection import GridSearchCV

optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)

grid_pred = grid.fit(x_train_vec, y_train);

print(grid_pred.best_score_)
print(grid_pred.best_params_)