In [0]:
# Start from a clean slate: delete every .zip and .csv file in the working directory.
! rm *.zip *.csv
# Fetch the dataset archive from a hard-coded host (no URL scheme is given,
# so this presumably goes over plain HTTP — confirm).
! wget bwg.140714.xyz:8000/toxic.zip 
# Unpack the outer archive.
! unzip toxic.zip 
# NOTE(review): these unzip the names train.csv / test.csv / sample_submission.csv;
# presumably the outer archive holds nested archives (e.g. train.csv.zip) that unzip
# resolves by appending ".zip" — confirm against the archive contents.
! unzip train.csv 
! unzip test.csv
! unzip sample_submission.csv


In [0]:
import math
import re
import os
import timeit
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import logging
logging.basicConfig(format='[%(asctime)s %(levelname)-8s] %(message)s', level=logging.INFO, datefmt='%m-%d %H:%M:%S')

from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS

from keras.datasets import imdb
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api

class DeepLearning():
  """Template for text-classification experiments built on Keras.

  Bundles the steps of one experiment: loading CSV data, tokenising text,
  building a pretrained embedding matrix, constructing a model (dense,
  bidirectional LSTM or bidirectional GRU, each ending in 6 sigmoid
  outputs), training with checkpointing / early stopping, and writing a
  submission file.
  """
  def __init__(self, max_features=100000, max_sentence_len=200, embedding_dim=100):
      """
      @param max_features: vocabulary size used by the tokenizer, the
          TF-IDF vectorizer and the Embedding layer
      @param max_sentence_len: length every token sequence is padded/truncated to
      @param embedding_dim: width of each embedding vector
      """
      self.max_features = max_features
      self.max_sentence_len = max_sentence_len
      self.embedding_dim = embedding_dim  # for using embedded vectors
      self.filepath = "weights_base.best.hdf5"  # where the best model weights are saved

  @staticmethod
  def _read_csv(path):
      """Read one UTF-8 CSV with the python engine, skipping malformed rows."""
      options = dict(engine='python', encoding='utf-8')
      try:
          return pd.read_csv(path, on_bad_lines='skip', **options)
      except TypeError:
          # Older pandas does not know `on_bad_lines`; fall back to the
          # keyword it replaced (removed again in pandas 2.0).
          return pd.read_csv(path, error_bad_lines=False, **options)

  def load_data(self, train_file='train.csv', test_file='test.csv'):
      """Load the training and test CSV files.

      @param train_file: path of the training CSV
      @param test_file: path of the test CSV
      @return: (train, test) as two DataFrames; rows that cannot be
          parsed are skipped rather than raising
      """
      train = self._read_csv(train_file)
      test = self._read_csv(test_file)
      logging.info('CSV data loaded')
      return train, test

  def exploring_data(self, train):
      '''Placeholder for exploratory analysis; currently does nothing.'''
      pass

  def tokenize_text(self, text_train, text_test):
      '''Turn raw text into padded integer sequences.

      The tokenizer is fitted on train and test text together and keeps
      the `max_features` most common words.

      @param text_train: Series of training text
      @param text_test: Series of test text
      @return: (tokenized_train, tokenized_test, tokenizer); both arrays
          are padded/truncated to `max_sentence_len`
      '''
      tokenizer = Tokenizer(num_words=self.max_features)
      tokenizer.fit_on_texts(pd.concat([text_train, text_test]))

      tokenized_train = pad_sequences(tokenizer.texts_to_sequences(text_train),
                                      maxlen=self.max_sentence_len)
      logging.info('Train text tokeninzed')

      tokenized_test = pad_sequences(tokenizer.texts_to_sequences(text_test),
                                     maxlen=self.max_sentence_len)
      logging.info('Test text tokeninzed')
      return tokenized_train, tokenized_test, tokenizer

  def embed_glove_vector(self, word_index, model='glove-wiki-gigaword-100'):
      """Build the embedding matrix for the words in `word_index`.

      @param word_index: mapping word -> integer index (e.g. tokenizer.word_index)
      @param model: name passed to gensim's downloader, or an already
          loaded vectors object supporting `word in vectors` and
          `vectors[word]` (lets callers reuse a loaded model)
      @return: array of shape (max_features, embedding_dim); rows of
          words missing from the vectors, and row 0, stay all-zero
      @raise ValueError: if the vectors report a `vector_size` different
          from `embedding_dim`
      """
      vectors = api.load(model) if isinstance(model, str) else model

      # Fail early with a readable message instead of a broadcasting
      # error in the middle of the fill loop.
      vector_size = getattr(vectors, 'vector_size', None)
      if vector_size is not None and vector_size != self.embedding_dim:
          raise ValueError(
              f"embedding vectors have dimension {vector_size}, "
              f"but embedding_dim is {self.embedding_dim}")

      matrix = np.zeros((self.max_features, self.embedding_dim))
      for word, i in word_index.items():
          # Indices at or beyond max_features have no row in the matrix.
          if i < self.max_features and word in vectors:
              matrix[i] = vectors[word]

      logging.info('Glove embedding vector created')
      return matrix

  def tfidf_vectorized(self, text_train, text_test):
      """Vectorise text with TfidfVectorizer.

      The vectorizer is fitted on the training text only and then applied
      to the test text, so both results share one vocabulary.
      Parameters such as ngram_range and max_features require fine-tuning.

      @param text_train: Series of training text (not a DataFrame)
      @param text_test: Series of test text (not a DataFrame)
      @return: (features_train, features_test)
      """
      tv = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode',
                           analyzer='word', token_pattern=r'\w{1,}', stop_words='english',
                           ngram_range=(1, 1), max_features=self.max_features)
      features_train = tv.fit_transform(text_train)
      features_test = tv.transform(text_test)
      return features_train, features_test

  def _start_model(self):
      """Create a Sequential model whose first layer is the Embedding layer."""
      model = Sequential()
      model.add(Embedding(self.max_features, self.embedding_dim,
                          input_length=self.max_sentence_len))
      return model

  def _finish_model(self, model, hidden_units, dropout, embedding_matrix):
      """Append the Dense/Dropout head and the 6-way sigmoid output, then
      optionally load and freeze pretrained embedding weights."""
      for units in hidden_units:
          model.add(Dense(units, activation='relu'))
          model.add(Dropout(dropout))
      model.add(Dense(6, activation='sigmoid'))

      if embedding_matrix is not None and embedding_matrix.size > 0:
          expected = (self.max_features, self.embedding_dim)
          if embedding_matrix.shape != expected:
              raise ValueError(
                  f"embedding_matrix has shape {embedding_matrix.shape}, expected {expected}")
          embedding = model.layers[0]
          embedding.set_weights([embedding_matrix])
          embedding.trainable = False  # keep the pretrained vectors fixed
      return model

  def build_model(self, embedding_matrix=np.zeros(0)):
      """Build the baseline: Embedding -> Flatten -> Dense 64 -> Dense 64 -> 6 sigmoid.

      @param embedding_matrix: optional pretrained weights of shape
          (max_features, embedding_dim); when non-empty they are loaded
          into the Embedding layer, which is then frozen
      @return: the uncompiled model
      """
      model = self._start_model()
      model.add(Flatten())
      model = self._finish_model(model, (64, 64), 0.5, embedding_matrix)
      logging.info(f'Model created')
      return model

  def build_lstm(self, embedding_matrix=np.zeros(0)):
      """Build Embedding -> Bidirectional LSTM 64 -> Dense 64 -> Dense 64 -> 6 sigmoid.

      @param embedding_matrix: optional pretrained weights, see build_model
      @return: the uncompiled model
      """
      dropout = 0.4
      model = self._start_model()
      model.add(Bidirectional(LSTM(64, dropout=dropout, recurrent_dropout=dropout)))
      model = self._finish_model(model, (64, 64), dropout, embedding_matrix)
      logging.info(f'LSTM created')
      return model

  def build_gru(self, embedding_matrix=np.zeros(0)):
      """Build Embedding -> Bidirectional GRU 64 -> Dense 128 -> Dense 64 -> 6 sigmoid.

      @param embedding_matrix: optional pretrained weights, see build_model
      @return: the uncompiled model
      """
      dropout = 0.4
      model = self._start_model()
      # The recurrent layer is a GRU, as the method name and log message say.
      model.add(Bidirectional(GRU(64, dropout=dropout, recurrent_dropout=dropout)))
      model = self._finish_model(model, (128, 64), dropout, embedding_matrix)
      logging.info(f'GRU created')
      return model

  def run(self, model, x_train, y_train, epochs=30, batch_size=128):
      """Compile and train `model`, holding out 10% of the data for validation.

      The weights of the epoch with the best validation accuracy are saved
      to `self.filepath`; training stops early after 5 epochs without
      improvement.

      @param model: an uncompiled Keras model
      @param x_train: training inputs
      @param y_train: training labels
      @param epochs: maximum number of epochs
      @param batch_size: mini-batch size
      @return: (model, history), where history is what model.fit returns
      """
      # 'val_acc' matches the metric name given to compile() below.
      checkpoint = ModelCheckpoint(self.filepath, monitor='val_acc', verbose=1,
                                   save_best_only=True, mode='max')
      early = EarlyStopping(monitor="val_acc", mode="max", patience=5)

      model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
      X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=23)
      history = model.fit(X_tra, y_tra, epochs=epochs, batch_size=batch_size,
                          validation_data=(X_val, y_val),
                          callbacks=[checkpoint, early], verbose=1)
      return model, history

  def display_history(self, history):
      """Plot training/validation accuracy and loss per epoch.

      @param history: the object returned by model.fit, or its `.history` dict
      """
      logs = getattr(history, 'history', history)
      epochs = range(1, len(logs['acc']) + 1)

      fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))
      ax_acc.plot(epochs, logs['acc'], label='train')
      ax_acc.plot(epochs, logs['val_acc'], label='validation')
      ax_acc.set(title='Accuracy', xlabel='epoch', ylabel='acc')
      ax_acc.legend()

      ax_loss.plot(epochs, logs['loss'], label='train')
      if 'val_loss' in logs:
          ax_loss.plot(epochs, logs['val_loss'], label='validation')
      ax_loss.set(title='Loss', xlabel='epoch', ylabel='loss')
      ax_loss.legend()
      plt.show()

  def predict(self, y_test, labels, sub_file="sample_submission.csv", out_file='submission.csv'):
      """Write predictions into a copy of the sample submission.

      @param y_test: predicted values, one column per entry of `labels`
      @param labels: names of the columns to overwrite
      @param sub_file: CSV used as the template
      @param out_file: path the filled-in CSV is written to
      """
      res = pd.read_csv(sub_file)
      res[labels] = y_test
      res.to_csv(out_file, index=False)
      logging.info(f"Predictions were written to {out_file}")

  def describe_model(self, **dm):
      """Print every keyword argument as a `key: value` line."""
      for k, v in dm.items():
          print(f"{k}: {v}")


# End-to-end experiment: load data, tokenise, build the embedding matrix,
# train, predict on the test set and write the submission file.
# The timer measures the wall-clock time of the whole cell.
time_start = timeit.default_timer()
# embedding_dim has to equal the width of the vectors loaded below, because
# each vector is copied into one row of the embedding matrix.
dl = DeepLearning(max_features=200000, max_sentence_len=300, embedding_dim=300)
train, test = dl.load_data()
# The six target columns; `labels` is the 2-D array of their values (one column per target).
columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
labels = train[columns].values

# Missing comments are replaced by the placeholder "_na_" before tokenising.
X_train, X_test, tokenizer = dl.tokenize_text(train["comment_text"].fillna("_na_"), test["comment_text"].fillna("_na_"))
# Despite the method name, the vectors requested here are 'word2vec-google-news-300'
# (presumably 300-dimensional, matching embedding_dim above — confirm).
embedding_matrix = dl.embed_glove_vector(tokenizer.word_index, 'word2vec-google-news-300')

model = dl.build_lstm(embedding_matrix)
# model = dl.build_gru()
model, history = dl.run(model, X_train, labels)
# Reload the checkpoint written during training (run() saves only the best
# epoch by val_acc), so predictions do not come from the last epoch's weights.
model.load_weights(dl.filepath)
y_test = model.predict([X_test], batch_size=1024, verbose=1)
# Writes the predictions for `columns` into submission.csv.
dl.predict(y_test, columns)

time_stop = timeit.default_timer()
print(f'Program run for {time_stop - time_start} seconds')


In [0]:
# from google.colab import files
# files.download('submission.csv')
# Free-text record of the experiment, printed as key/value lines by describe_model.
# NOTE(review): the training cell loads 'word2vec-google-news-300' and calls
# build_lstm (Dense 64-64), so the 'glove-twitter-200' and 'Dense 128-64' wording
# below looks stale — confirm before relying on this record.
strategy="Tokenized on all text; use glove-twitter-200 "
dl.describe_model(model='Bidirectional LSTM 64 Dense 128-64', max_features=dl.max_features, embed_size=dl.embedding_dim, maxlen=dl.max_sentence_len, strategy=strategy)

# Upload submission.csv to a hard-coded remote host with an HTTP PUT.
! curl -X PUT --upload-file submission.csv ali.140714.xyz:8000
# Printed unconditionally: curl's exit status is not checked, so this does not
# prove that the upload succeeded.
print('DONE uploading')


In [0]:
# Keep a copy of the best checkpoint under its own name so a later run does not
# overwrite it (the numeric suffix presumably encodes the achieved score — confirm).
! cp weights_base.best.hdf5 best_098012.hdf5

# Results
* GRU_64 + Dense_64 + GloVe-Twitter-200 = 0.98166
* GRU_64 + Dense_128 + Dense_64, no pretrained embedding = (score not recorded)
